MULTIPLE LINEAR REGRESSION WITH CLEANED DATA

In [1]:
# IMPORTING REQUIRED PACKAGES

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# READING DATASET

In [4]:
# Load the startup dataset. The path is relative, so the CSV must be in the
# notebook's working directory.
df = pd.read_csv('50_Startups.csv')

In [5]:
# Preview the first 10 rows to check the column names and value scales.
df.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [6]:
# CHECKING FULL INFORMATION

In [7]:
# Per-column dtype and non-null count; any object-dtype column needs encoding
# before it can be fed to the regression.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [8]:
# Distinct categories in State, i.e. the levels that will be dummy-encoded.
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [9]:
# Creating dummy (indicator) variables for the State column

In [10]:
# One-hot encode State. drop_first=True drops the first category so the
# remaining indicator columns are not perfectly collinear with the intercept.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would no longer match the 0/1 values shown in this notebook.
a = pd.get_dummies(df['State'], drop_first=True, dtype=int)
a

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0
5,0,1
6,0,0
7,1,0
8,0,1
9,0,0


In [11]:
# Append the dummy columns to the frame. Columns with the same names that are
# already present on df are dropped first, so re-running this cell replaces
# them instead of silently adding duplicate columns.
df = pd.concat([df.drop(columns=a.columns, errors='ignore'), a], axis=1)

In [12]:
# Confirm the dummy columns were appended next to the original columns.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Florida,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [13]:
# Remove the text State column now that it is represented by the dummy columns.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because State is already gone.
df = df.drop(columns=['State'], errors='ignore')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


### Splitting the data into features (X) and target (y)

In [14]:
# Target (dependent variable) is Profit; every other column is a predictor.
y = df['Profit']
X = df.drop(columns=['Profit'])


In [15]:
# Check that X holds only predictor columns (Profit removed).
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [16]:
# Check that y is the Profit series, sharing the row index with X.
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [17]:
# checking null values

In [18]:
# Count missing values per column; all zeros means no imputation is needed.
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
Profit             0
Florida            0
New York           0
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# (rows, columns) of the encoded frame, for reference when sizing the split.
df.shape

(50, 6)

In [21]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=11,
)

In [22]:
# Row counts of the training predictors and targets; they must be equal.
X_train.shape[0], y_train.shape[0]

(33, 33)

In [23]:
# The shuffled row labels show that the split sampled rows at random.
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
39,38558.51,82982.09,174999.3,0,0
49,0.0,116983.8,45173.06,0,0
21,78389.47,153773.43,299737.29,0,1
15,114523.61,122616.84,261776.23,0,1
0,165349.2,136897.8,471784.1,0,1


In [24]:
# import and fit 

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
# Linear regression estimator created with its default settings.
reg = LinearRegression()

In [27]:
# y = m1x1 + m2x2 + m3x3 + m4x4 + m5x5 + c

In [28]:
# Fit on the training split only; X_test / y_test are kept for evaluation.
reg.fit(X_train,y_train)

LinearRegression()

In [29]:
# for m(slopes): one coefficient per feature, in X_train column order
# (R&D Spend, Administration, Marketing Spend, Florida, New York)

reg.coef_

array([ 8.64329624e-01,  7.12539493e-03,  3.06409736e-02, -5.50485207e+02,
       -6.33665237e+03])

In [30]:
# for c(intercept): the model's predicted Profit when every feature is 0

reg.intercept_

41591.69166575266

In [31]:
# Making predictions for training data

In [32]:
# (rows, features) of the training predictors.
X_train.shape

(33, 5)

In [33]:
# y_train is one-dimensional: a single target value per training row.
y_train.shape

(33,)

In [34]:
# Column-vector (n_rows, 1) copy of the training targets as a NumPy array.
y_train_1 = y_train.to_numpy().reshape(-1, 1)
y_train_1.shape

(33, 1)

In [35]:
# In-sample predictions: the fitted model applied to the rows it was trained on.
y_train_pred = reg.predict(X_train)
y_train_pred

array([ 80872.38320029,  43809.39397757, 113289.31921754, 143135.96001129,
       193602.62617354, 109503.50116604,  70793.02387907, 136911.90949281,
       162906.03936979, 172627.58336062,  91888.17757024,  96333.03728628,
        72584.22587942,  42556.66195438,  52107.31005232, 117462.88141241,
       136937.44188854, 196809.80879104,  62731.82763051,  89301.79657932,
       175739.74952806,  98730.68546406, 103053.08951902, 106861.22727947,
       130192.89375812, 164622.59609965,  96317.21624647, 130732.95786131,
       129810.79484262, 126762.11381586, 109312.75528664, 117986.98489419,
       102704.99651151])

In [38]:
# Side-by-side table of actual vs. predicted Profit for the training rows,
# keeping y_train's original row labels as the index.
train_data_comparision = (
    y_train.to_frame(name='y_train_Actual')
    .assign(y_train_predictions=y_train_pred)
)
train_data_comparision

Unnamed: 0,y_train_Actual,y_train_predictions
39,81005.76,80872.3832
49,14681.4,43809.393978
21,111313.02,113289.319218
15,129917.04,143135.960011
0,192261.83,193602.626174
23,108733.99,109503.501166
41,77798.83,70793.023879
11,144259.4,136911.909493
6,156122.51,162906.03937
3,182901.99,172627.583361


In [42]:
# Training R-squared score (goodness of fit, not classification accuracy)

In [43]:
from sklearn.metrics import r2_score

# R-squared of the in-sample predictions against the actual training targets.
r2_score(y_train,y_train_pred)

0.9624396264097772

Test Data Performance

In [44]:
# Predictions for the held-out rows that were not used in fitting.
y_test_pred = reg.predict(X_test)

In [45]:
# R-squared on the held-out test split; compare with the training score to
# gauge how well the fit generalizes.
r2_score(y_test,y_test_pred)

0.8829561428747645

In [46]:
# Predicting profit for a single manually entered data point

In [53]:
import warnings
warnings.filterwarnings('ignore')

In [54]:
# Score one hand-entered startup. Wrapping the values in a DataFrame that reuses
# the training column names makes the feature order explicit
# (R&D Spend, Administration, Marketing Spend, Florida, New York) and avoids
# scikit-learn's "X does not have valid feature names" warning.
new_startup = pd.DataFrame([[1.1, 10.9, 12, 0, 0]], columns=X_train.columns)
reg.predict(new_startup)

array([41593.08778683])