In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [34]:
dataset = pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [35]:
#Splitting data into (Dependant - independant)
X = dataset.iloc[:, :-1].values
y = dataset['Profit'].values

#Label Encoding
label_encoder = LabelEncoder()
X[:,-1] = label_encoder.fit_transform(X[:,-1])
X

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

In [36]:
train_set, test_set, goal_train, goal_test = train_test_split(X, y, test_size=0.2, random_state = 0)

# MinMaxScaler (label encoder)

In [37]:
scaler = MinMaxScaler()
train_set = scaler.fit_transform(train_set)
test_set  = scaler.transform(test_set)

In [38]:
model = LinearRegression()
model.fit(train_set,goal_train)

In [39]:
goal_predict = model.predict(test_set)

In [40]:
r2_score(goal_test, goal_predict)

0.9386861070938136

# RobustScalar (label encoder)

In [41]:
scaler = RobustScaler()
train_set = scaler.fit_transform(train_set)
test_set  = scaler.transform(test_set)

In [42]:
model = LinearRegression()
model.fit(train_set, goal_train)
goal_predict = model.predict(test_set)

In [43]:
r2_score(goal_test, goal_predict)

0.9386861070938134

In [48]:
#one Hot Encoding
newX = dataset.iloc[:, :-1].values
newy = dataset.iloc[:, -1].values
hot_encoder = OneHotEncoder()
newX[:,-1]= hot_encoder.fit_transform(newX[:, -1].reshape(1, -1)).toarray()

In [49]:
train_set, test_set, goal_train, goal_test = train_test_split(newX, newy, test_size=0.2, random_state = 0)

# MinMaxScaler (OneHotEncoder)

In [50]:
scaler = MinMaxScaler()
train_set = scaler.fit_transform(train_set)
test_set  = scaler.transform(test_set)

In [51]:
model = LinearRegression()
model.fit(train_set,goal_train)
goal_predict = model.predict(test_set)
r2_score(goal_test, goal_predict)

0.9393955917820572

# RobustScaler (OneHotEncoder)

In [52]:
scaler = RobustScaler()
train_set = scaler.fit_transform(train_set)
test_set  = scaler.transform(test_set)

In [53]:
model = LinearRegression()
model.fit(train_set, goal_train)
goal_predict = model.predict(test_set)
r2_score(goal_test, goal_predict)

0.9393955917820572