In [1]:
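# Plan:
# 1. Import the data.
# 2. Separate the features (x) from the target (y).
# 3. Fit a linear regression and find its coefficients and intercept:
#        y = m1*x1 + m2*x2 + m3*x3 + c
# 4. Predict for a new observation, e.g. lr.predict([[12,20,32]]).
# 5. Calculate the r2_score on the test set.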

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv('supershop.csv')

In [4]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Marketing Spend  50 non-null     float64
 1   Administration   50 non-null     float64
 2   Transport        49 non-null     float64
 3   Area             50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

# Filling the missing data

In [7]:
df.Transport.mean()

215331.73244897963

In [8]:
# Impute the missing Transport value with the column mean.
# Bracket indexing is used for the assignment so it always targets the column.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())

In [9]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [10]:
df.shape

(50, 5)

# Feature Scaling : Normalization 

In [11]:
df1 = df.copy()

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
minmax = MinMaxScaler()

In [14]:
# Scale all three numeric features with a single fit so that `minmax` keeps the
# min/max of every column. Fitting the same scaler one column at a time would
# leave it fitted to the last column only, so it could not be reused to
# transform new observations or to invert the scaling.
# MinMaxScaler scales each column independently, so the resulting values are
# the same as scaling the columns one by one.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split, so the test rows influence the scaling.
numeric_cols = ['Marketing Spend', 'Administration', 'Transport']
df1[numeric_cols] = minmax.fit_transform(df1[numeric_cols])

In [17]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,192261.83
1,0.983359,0.761972,0.940893,Ctg,191792.06
2,0.927985,0.379579,0.864664,Rangpur,191050.39
3,0.873136,0.512998,0.812235,Dhaka,182901.99
4,0.859438,0.305328,0.776136,Rangpur,166187.94


In [18]:
df1.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

# Label Encoding for the column Area

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
le = LabelEncoder()

In [21]:
# Replace the Area names with the integer codes produced by LabelEncoder.
# NOTE(review): Area is a nominal category; integer codes make the linear model
# treat it as an ordered numeric feature. One-hot encoding would avoid that.
df1['Area'] = le.fit_transform(df1['Area'])

In [22]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,1,192261.83
1,0.983359,0.761972,0.940893,0,191792.06
2,0.927985,0.379579,0.864664,2,191050.39
3,0.873136,0.512998,0.812235,1,182901.99
4,0.859438,0.305328,0.776136,2,166187.94


In [23]:
df1.Area.unique()  #Dhaka = 1, Chittagong = 0, Rangpur = 2

array([1, 0, 2])

# Splitting the data set

In [24]:
# Features: every column except Profit. Target: the Profit column.
x = df1.drop(columns=['Profit'])
y = df1['Profit']

In [25]:
from sklearn.model_selection import train_test_split as tts

In [26]:
# Keep 70% of the rows for training and hold out the rest for testing.
# random_state is fixed so the same split is produced on every run.
xtrain, xtest, ytrain, ytest = tts(x,y,train_size=.70,random_state=1) # test_size=.30

In [27]:
xtrain.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
42,0.142976,0.341852,0.313705,0
49,0.0,0.500148,0.095749,0
26,0.455574,0.706845,0.284134,2
22,0.447505,0.544293,0.64292,2
13,0.556352,0.641066,0.535552,0


In [28]:
ytrain.head()

42     71498.49
49     14681.40
26    105733.54
22    110352.25
13    134307.35
Name: Profit, dtype: float64

In [29]:
xtest.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
27,0.436093,0.582978,0.748613,1
35,0.278284,0.257032,0.435618,1
40,0.173901,0.512041,0.36626,0
38,0.122345,0.111636,0.39269,1
2,0.927985,0.379579,0.864664,2


In [30]:
ytest.head()

27    105008.31
35     96479.51
40     78239.91
38     81229.06
2     191050.39
Name: Profit, dtype: float64

# Applying model

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lr = LinearRegression() # creating object for linear model

In [33]:
lr.fit(xtrain,ytrain)

# Finding coefficient and intercept 

In [34]:
lr.coef_

array([91715.15170633, 11456.16241587, 55719.8970702 ,  -462.96480388])

In [35]:
lr.intercept_

40599.16071991212

In [36]:
# Predict for one observation (scaled Marketing Spend, Administration,
# Transport and the encoded Area). The model was fitted on a DataFrame, so the
# row is passed as a DataFrame with the training column names; a bare list
# triggers scikit-learn's "X does not have valid feature names" warning and
# silently relies on the column order.
lr.predict(pd.DataFrame([[0.447505, 0.544293, 0.642920, 2]], columns=xtrain.columns))



array([122775.1653107])

In [37]:
# Predict for one observation (scaled Marketing Spend, Administration,
# Transport and the encoded Area). The model was fitted on a DataFrame, so the
# row is passed as a DataFrame with the training column names; a bare list
# triggers scikit-learn's "X does not have valid feature names" warning and
# silently relies on the column order.
lr.predict(pd.DataFrame([[0.436093, 0.582978, 0.748613, 1]], columns=xtrain.columns))



array([128523.8615274])

In [38]:
# Predict the profit for every test row. Only the columns the model was trained
# on are passed in, so this cell still works when it is re-run after a
# prediction column has been added to `xtest`.
predricted_profit = lr.predict(xtest[xtrain.columns])

In [39]:
predricted_profit

array([128523.85817344,  92876.23166134,  82822.47768709,  74516.64570607,
       177310.96394258, 171350.68669007,  40476.97540817,  98053.04881798,
        81121.43605441,  93310.799139  ,  88010.25155613,  85419.3298334 ,
       127955.49740722,  85934.52575582, 122418.36491094])

In [40]:
# Store the predictions next to the test features. The model is given only the
# columns it was trained on, which keeps this cell idempotent: on a second run
# `xtest` already contains 'predricted_profit', and passing the whole frame to
# predict() would fail with a feature mismatch.
xtest['predricted_profit'] = lr.predict(xtest[xtrain.columns])

In [41]:
xtest.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,predricted_profit
27,0.436093,0.582978,0.748613,1,128523.858173
35,0.278284,0.257032,0.435618,1,92876.231661
40,0.173901,0.512041,0.36626,0,82822.477687
38,0.122345,0.111636,0.39269,1,74516.645706
2,0.927985,0.379579,0.864664,2,177310.963943


In [42]:
import pandas as pd

In [43]:
# Append the actual Profit as a new column (rows are aligned on the index) so it
# can be compared with the other columns of xtest side by side.
new_xtest = pd.concat([xtest,ytest], axis =1)

In [44]:
new_xtest.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,predricted_profit,Profit
27,0.436093,0.582978,0.748613,1,128523.858173,105008.31
35,0.278284,0.257032,0.435618,1,92876.231661,96479.51
40,0.173901,0.512041,0.36626,0,82822.477687,78239.91
38,0.122345,0.111636,0.39269,1,74516.645706,81229.06
2,0.927985,0.379579,0.864664,2,177310.963943,191050.39


# r2 Score

In [45]:
# Metrics used to evaluate the test-set predictions
from sklearn.metrics import mean_squared_error, r2_score

# R^2 compares the predicted profit with the actual test profit; it is a
# goodness-of-fit measure for regression, not a classification accuracy.
score = r2_score(ytest, predricted_profit)
print('r2 socre is  ', score)

# Mean squared error is reported in squared profit units.
print('mean_sqrd_error is== ', mean_squared_error(ytest, predricted_profit))

r2 socre is   0.8986200525800975
mean_sqrd_error is==  160798857.87786588
