In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw data: read 50_Startups.csv (path is relative to the notebook's
# working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="50_Startups.csv")

In [3]:
# Preview the first five rows to check that the file parsed as expected.
data.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# One-hot encode the categorical data. No `columns=` is given, so pandas
# encodes every object/string/category column; in the preview above that is
# only `State`.
data = pd.get_dummies(
    data=data,
    drop_first=True,  # keep k-1 indicator columns; the dropped first level is the all-zeros baseline
    dtype=int,        # emit 0/1 integers rather than booleans
)
data.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [6]:
# Independent variables: every column except the target `Profit`.
X = data.drop(columns=['Profit'])
print(X.head(n=5))

   R&D Spend  Administration  Marketing Spend  State_Florida  State_New York
0  165349.20       136897.80        471784.10              0               1
1  162597.70       151377.59        443898.53              0               0
2  153441.51       101145.55        407934.54              1               0
3  144372.41       118671.85        383199.62              0               1
4  142107.34        91391.77        366168.42              1               0


In [7]:
# Dependent variable: the `Profit` column as a Series.
y = data.loc[:, 'Profit']
print(y.head(n=5))

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64


In [8]:
# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    random_state=42,  # fixed seed so the shuffle is reproducible across runs
)

In [9]:
# Preview the training features; the shuffled index labels show the split is random.
X_train.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
12,93863.75,127320.38,249839.44,1,0
4,142107.34,91391.77,366168.42,1,0
37,44069.95,51283.14,197029.42,0,0
8,120542.52,148718.95,311613.29,0,1
3,144372.41,118671.85,383199.62,0,1


In [10]:
# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from the training rows only, then
# apply that same fitted transform to both splits so no test-set statistics
# influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Display the standardized test features (scaled with the transform that was
# fitted on the training split). Columns keep the order of X.
X_test_scaled

array([[ 0.30245367,  0.52942836,  0.14916233, -0.73379939, -0.69388867],
       [-0.82734624, -1.40769369, -0.53560477, -0.73379939, -0.69388867],
       [-0.33181874, -0.20294703, -1.27505783,  1.36277029, -0.69388867],
       [-1.62147425,  0.11103854, -2.06176266, -0.73379939,  1.44115338],
       [ 0.35879726,  0.88291223,  0.41286919, -0.73379939,  1.44115338],
       [-1.63116196, -2.56004955, -2.07854935, -0.73379939,  1.44115338],
       [-0.04987791,  0.84817808, -0.89664846,  1.36277029, -0.69388867],
       [-0.2753597 ,  0.67912498, -0.86215204, -0.73379939, -0.69388867],
       [-0.30191325,  0.29793642, -1.67222209, -0.73379939, -0.69388867],
       [ 0.18462534,  1.19412269, -2.07854935, -0.73379939,  1.44115338]])

In [21]:
# First scaled feature as an (n_samples, 1) column; per the X preview above,
# column 0 is `R&D Spend`. The `:1` slice keeps the result two-dimensional.
X_train_scaled[:, :1]

array([[ 0.34202149],
       [ 1.36207849],
       [-0.71081297],
       [ 0.90611438],
       [ 1.40997088],
       [ 1.20367103],
       [-1.05285826],
       [-1.61480906],
       [-1.642623  ],
       [ 0.77885123],
       [ 0.96515572],
       [ 0.00687736],
       [-0.01361318],
       [-0.66099544],
       [-0.34996231],
       [ 1.85350175],
       [-1.17369938],
       [-0.11798808],
       [-0.46926521],
       [ 1.14576723],
       [-0.25546817],
       [ 0.48597351],
       [-1.03655971],
       [ 1.79532434],
       [ 0.01483507],
       [ 1.6017269 ],
       [-1.3147716 ],
       [-0.66970778],
       [-0.21472284],
       [-1.03464471],
       [ 0.51221561],
       [-0.07809041],
       [ 0.29731084],
       [-1.642623  ],
       [-0.03032003],
       [ 1.11238589],
       [-1.14276186],
       [ 0.89344327],
       [-0.24603715],
       [-1.21489084]])

In [22]:
from sklearn.linear_model import LinearRegression

# Simple linear regression: fit the target on the first scaled feature only.
# The `:1` slice keeps the input as a 2-D (n_samples, 1) array.
model = LinearRegression()
model.fit(X_train_scaled[:, :1], y_train)

In [23]:
# Predict on the held-out rows using the same single feature (column 0) the
# model was fitted on.
y_pred = model.predict(X_test_scaled[:, :1])

In [24]:
# Display the model's predictions for the test rows.
y_pred

array([127862.20996405,  82250.56334619, 102255.72078164,  50190.4734756 ,
       130136.88190548,  49799.36685472, 113638.0775288 , 104535.0534288 ,
       103463.04843113, 123105.31102669])

In [25]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the single-feature model on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("SLR R2 score: ", r2)

SLR R2 score:  0.9265108109341951
