In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
Data=pd.read_csv('Data_multiple_linear_regression.csv')
print(Data.head())
Independent_Variables=Data.iloc[:,:-1].values
Dependent_Variables=Data.iloc[:,4].values
print(Independent_Variables)
print(Dependent_Variables)

   Miscellaneous_Expenses  Food_Innovation_Spend  Advertising     City  \
0               138671.80              167497.20    475918.10  Chicago   
1               153151.59              164745.70    448032.53   Mumbai   
2               102919.55              155589.51    412068.54    Tokyo   
3               120445.85              146520.41    387333.62  Chicago   
4                93165.77              144255.34    370302.42    Tokyo   

      Profit  
0  202443.83  
1  201974.06  
2  201232.39  
3  193083.99  
4  176369.94  
[[138671.8 167497.2 475918.1 'Chicago']
 [153151.59 164745.7 448032.53 'Mumbai']
 [102919.55 155589.51 412068.54 'Tokyo']
 [120445.85 146520.41 387333.62 'Chicago']
 [93165.77 144255.34 370302.42 'Tokyo']
 [101588.71 134024.9 366995.36 'Chicago']
 [148972.87 136763.46 131850.82 'Mumbai']
 [147304.06 132446.13 328010.68 'Tokyo']
 [150492.95 122690.52 315747.29 'Chicago']
 [110453.17 125482.88 309115.62 'Mumbai']
 [112368.11 104061.08 233294.95 'Tokyo']
 [93564.6

In [3]:
#Converting categorical column into numerical values
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
le=LabelEncoder()
Independent_Variables[:,3]=le.fit_transform(Independent_Variables[:,3])

In [5]:
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [3])],
    remainder='passthrough'
)

In [6]:
Independent_Variables=ct.fit_transform(Independent_Variables)
Independent_Variables=Independent_Variables[:, [0,1,3,4,5] ]
print(Independent_Variables.astype(int))

[[     1      0 138671 167497 475918]
 [     0      1 153151 164745 448032]
 [     0      0 102919 155589 412068]
 [     1      0 120445 146520 387333]
 [     0      0  93165 144255 370302]
 [     1      0 101588 134024 366995]
 [     0      1 148972 136763 131850]
 [     0      0 147304 132446 328010]
 [     1      0 150492 122690 315747]
 [     0      1 110453 125482 309115]
 [     0      0 112368 104061 233294]
 [     0      1  93564 102819 253878]
 [     0      0 129094  96011 253973]
 [     0      1 137269  94140 256798]
 [     0      0 158321 122091 260646]
 [     1      0 124390 116671 265910]
 [     0      1 123371  80161 268480]
 [     1      0 146851  96805 286708]
 [     0      0 115949  93897 299053]
 [     1      0 155288  88567   4134]
 [     0      1 115641  78401 302798]
 [     1      0 155547  80537 303871]
 [     0      0 124556  76142 307453]
 [     0      0 107525  69680 308902]
 [     1      0 101055  79192 144708]
 [     0      1 141327  66812 142096]
 [     0    

In [7]:
#Splitting the training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Independent_Variables, Dependent_Variables, test_size = 0.3, random_state = 0)
model = LinearRegression()
model.fit(X_train, y_train)
print(f'constant = {model.intercept_}')
print(f'coefficients = {model.coef_}')

constant = 51168.39042488141
coefficients = [-1.58229418e+02 -4.63028992e+02  3.01968165e-02  7.90840255e-01
  3.10148566e-02]


In [8]:
#Predicting from the model on the testing data
y_predict = model.predict(X_test)
df = pd.DataFrame(data=y_test, columns=['y_test'])
df['y_predict'] = y_predict
print(df)


       y_test      y_predict
0   113464.38  114464.764722
1   154441.40  142718.884992
2   156303.95  144092.850078
3    87980.83   82766.774894
4   201232.39  190102.927619
5   115190.31  124731.310792
6    91411.06   76626.432613
7   107665.56  108586.968401
8   120534.25  124681.828086
9   176369.94  179549.506399
10  106960.92  106704.625400
11  106661.51   98222.671829
12  115915.54  121131.994055
13  106894.80  100601.189785
14  134448.90  138202.462501


In [9]:
#Predicting based on the custom values of independent varibles
obs = [0,1,160349,134320,401409]
feature_array = np.array(obs)
feature_array = feature_array.reshape(1, -1)
y_pred_obs = model.predict(feature_array)
print(round(float(y_pred_obs), 2))

174222.7


In [10]:
#Evaluating the model using R Suared error method
from sklearn import metrics
model_eval= metrics.r2_score(y_test, y_predict)
print('R-Square Error:', model_eval)

R-Square Error: 0.9358680970046511


In [11]:
#Adjusting the R squared error
n = 50
p = 5
Adjusted_r_square = 1-(1-model_eval)*(n-1)/(n-p-1)
print('Adjusted R-Square Error :', Adjusted_r_square)

Adjusted R-Square Error : 0.9285803807551797
