In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [9]:
#Import the data set and extract the variables
retailStore = pd.read_csv('Store_CA.csv') #read the csv file into pandas dataframe
#take every row (:) and every column (:) and set the values in there(.values)
X = retailStore.iloc[:, :].values
#take every row (:) including the last column with index of(11)
y = retailStore.iloc[:, 11].values

#print(retailStore)#print all rows and columns in the file
retailStore.head()#print the first 5 rows of data

Unnamed: 0,ProductVariety,MarketingSpend,CustomerFootfall,StoreSize,EmployeeEfficiency,StoreAge,CompetitorDistance,PromotionsCount,EconomicIndicator,StoreLocation,StoreCategory,MonthlySalesRevenue
0,581,29,1723,186,84.9,1,12,6,108.3,Los Angeles,Electronics,284.9
1,382,31,1218,427,75.8,18,11,6,97.8,Los Angeles,Electronics,308.21
2,449,35,2654,142,92.8,14,11,6,101.1,Los Angeles,Grocery,292.11
3,666,9,2591,159,66.3,11,11,4,115.1,Sacramento,Clothing,279.61
4,657,35,2151,275,89.1,28,12,7,93.4,Palo Alto,Electronics,359.71


In [10]:
#Prepare data so it fits correctly
#Change categorical columns to numbers

#Convert categorical data to numeric data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Import ColumnTransformer
from sklearn.compose import ColumnTransformer

labelencoder = LabelEncoder() #create a class to reuse for transferring labels back and forth

for col in [9, 10]:
    X[:, col] = labelencoder.fit_transform(X[:, col])
#take all the rows (:) except the ninth and tenth indexed columns (9), (10)
#Then use label encoder to fit and transform the values in the these columns to values like 0,1, or 2


# Create a single ColumnTransformer to apply OneHotEncoder to columns 9 and 10
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [9, 10])],  # Apply OneHotEncoder to column indices 9 and 10
    remainder='passthrough'  # Keep other columns unchanged
)

# Fit and transform the data using the ColumnTransformer
x = ct.fit_transform(X)


In [11]:
#Avoid dummy variable trap
#removes the first column of the dataset (index 0).
#selects all rows and all columns except the first.
X = X[:, 1:]

In [12]:
#Split data into training and testing set

#train_test_split is a function from the sklearn.model_selection module, which is part of the
#scikit-learn library. It is used to split your data into training and testing subsets.
from sklearn.model_selection import train_test_split

#This represents the features of the dataset. It's typically the input data
#This represents the target or labels of the dataset. It’s the output that you want to predict
#test_size=0.2: This specifies the proportion of the data to use for the test set.
#0.2 means 20% of the data will be used for testing, and the remaining 80% will be used for training.
#random_state=0 just makes sure that the split is random but consistent
#x_train: This contains the features of the training set (80% of the original x).
#x_test: This contains the features of the test set (20% of the original x).
#y_train: This contains the target labels corresponding to x_train.
#y_test: This contains the target labels corresponding to x_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
#Creating Multiple linear regression model

#This imports the LinearRegression class from the sklearn.linear_model module of the
#scikit-learn library.

from sklearn.linear_model import LinearRegression

#This creates an instance of the LinearRegression class, which represents the model.
#regressor is now an object that holds the Linear Regression model and can be used to perform
#operations like fitting, predicting, and evaluating.
regressor = LinearRegression()

#The fit() method is used to train the Linear Regression model.
#x_train: These are the input features (independent variables) used to predict
#the target variable (y_train).
regressor.fit(X_train, y_train)

In [14]:
#Predicting test set results

#The predict() method takes the test features (x_test), applies the learned coefficients
 #(from fit()), and computes the predicted target values (y_pred) for each observation in x_test.

 #y_pred is an array of predicted target values (dependent variables) for each corresponding
 #input in x_test.

y_pred = regressor.predict(X_test)
y_pred

array([273.22, 218.27, 247.3 , 258.84, 289.78, 284.05, 320.81, 343.81,
       380.03, 381.22, 383.05, 169.91, 369.53, 283.92, 211.68, 275.41,
       310.54, 272.78, 212.99, 345.08, 310.33, 277.61, 293.1 , 234.15,
       266.55, 309.93, 282.61, 254.49, 263.45, 274.15, 280.11, 396.78,
       418.1 , 234.36, 215.32, 334.29, 171.63, 246.09, 391.14, 261.25,
       361.74, 306.26, 365.86, 189.39, 272.61, 306.11, 384.91, 315.19,
       269.5 , 267.65, 416.63, 331.59, 274.64, 281.47, 286.51, 222.51,
       328.56, 383.64, 328.61, 297.48, 255.81, 368.24, 268.21, 216.52,
       405.82, 342.55, 206.31, 240.4 , 299.35, 255.54, 316.78, 258.97,
       172.88, 210.38, 312.1 , 190.99, 313.33, 162.11, 206.16, 207.52,
       297.08, 258.1 , 291.  , 375.57, 323.54, 287.03, 357.03, 256.53,
       311.35, 322.7 , 313.68, 362.07, 294.72, 345.04, 278.18, 247.79,
       177.65, 429.07, 324.96, 257.54, 324.97, 357.05, 297.4 , 401.21,
       183.24, 318.63, 314.75, 295.1 , 304.73, 400.05, 349.04, 375.77,
      

In [15]:
#Calculate coefficients
print(regressor.coef_)

[ 1.01637109e-15  3.66373598e-15  3.40005801e-16  1.10480201e-16
 -1.09721260e-16  8.80643215e-16 -5.80698684e-15  3.18755439e-16
 -8.66602796e-16  2.13622556e-15  1.00000000e+00]


In [16]:
#Calculate the intercept
print(regressor.intercept_)

-7.503331289626658e-12


In [17]:
#Calculate R square
#Check the validity of the model's prediction
from sklearn.metrics import r2_score

#y_test: These are the true values of the target variable (the actual values from your test set).
#y_pred: These are the predicted values generated by the model (the values predicted by the model
# for the test set).

#This calculates the R² score using the actual values (y_test) and the predicted values (y_pred),
# and stores it in the variable score.
score = r2_score(y_test, y_pred)

#This prints the value of the R² score to the console, showing how well the model performed.
print(score)

#R² = 1: Perfect model. The model's predictions match the actual values perfectly, and all
#the variance in the data is explained by the model.
#R² = 0: The model does not explain any of the variance in the target variable. In this case,
#the model's predictions are no better than simply predicting the mean of the target variable.
#R² < 0: The model performs worse than predicting the mean of the target variable. This could
#happen if the model is a poor fit to the data.

1.0
