In [1]:
# L1 Regularization Technique - lasso Regression
# L2 Regularization Technique - Ridge Regression

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('50_Startups.csv')

In [4]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
df.shape

(108, 5)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
df.State = le.fit_transform(df.State)

In [8]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [9]:
## X and Y split

X = df.drop(columns=['Profit'], axis=1)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


In [10]:
Y = df['Profit']
Y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [11]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()

In [12]:
X_scaled = pd.DataFrame(scale.fit_transform(X), columns = X.columns)
X_scaled.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,1.0,0.651744,1.0,1.0
1,0.983359,0.761972,0.940893,0.0
2,0.927985,0.379579,0.864664,0.5
3,0.873136,0.512998,0.812235,1.0
4,0.859438,0.305328,0.776136,0.5


In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_scaled,Y , test_size=0.2, random_state=0)

In [15]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((86, 4), (22, 4), (86,), (22,))

### Model Building

In [16]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [17]:
r = Ridge()
l = Lasso()

In [18]:
r.fit(x_train, y_train)

Ridge()

In [19]:
l.fit(x_train, y_train)

Lasso()

In [28]:
# Prediction of x_test wrt ridge model

pred1 = r.predict(x_test)

pred1

array([ 54556.32416702, 130017.92166782,  84687.15947095, 173295.2223158 ,
       108917.94957822, 128735.89224253, 128736.35934265, 155951.19177423,
       117814.48562718,  52712.59507338, 102790.3781561 , 119096.2726001 ,
        54556.32416702, 124206.72612243,  88379.01243395, 126261.35613731,
       126261.35613731,  98802.1865801 ,  74278.88209886, 141546.67661999,
       145564.21281487, 150251.73759042])

In [30]:
#prediction of x_train wrt ridge model

pred1_train = r.predict(x_train)
pred1_train

array([109172.11701154, 162907.31389068, 100464.139996  , 167970.8492119 ,
       122667.96190528,  49887.13653097,  96390.68310219, 143886.79143792,
        85348.38393902, 188005.75965694, 183451.16760721, 157862.5695242 ,
       127596.0763847 , 118725.97104944, 122667.96190528, 128736.35934265,
       119096.2726001 , 127596.0763847 , 108917.94957822,  93971.40528091,
        65903.04456747, 183451.16760721,  81352.03770302,  75188.38192378,
        79135.42233855, 162907.31389068, 141546.67661999, 132321.48447687,
        81352.03770302,  75752.10844107, 157862.5695242 , 145564.21281487,
       130017.92166782, 132321.48447687, 188005.75965694,  97035.56715902,
       104697.62664321, 117814.48562718, 128735.89224253, 128735.89224253,
       114288.26358192,  93907.91802285, 114288.26358192,  98727.48396016,
       150251.73759042, 128736.35934265,  68760.8213074 , 167970.8492119 ,
        93971.40528091, 173295.2223158 , 145564.21281487,  78309.20334753,
       107823.62811616, 1

In [32]:
# Prediction of x_test wrt lasso model
pred2 = l.predict(x_test)
pred2

array([ 48384.86814735, 134845.52354938,  76486.64641608, 181551.13594979,
       112961.07382208, 134236.64101991, 129218.98004997, 160017.16104325,
       116754.23112994,  46273.04713164, 102272.49339834, 115567.13437352,
        48384.86814735, 119116.48630482,  88593.22703248, 127104.80005829,
       127104.80005829,  90948.41312188,  58678.78647171, 146299.80323437,
       149413.8490298 , 152502.10158276])

In [33]:
# Prediction of x_train wrt lasso model

pred2_train = l.predict(x_train)
pred2_train

array([110345.06847036, 171315.14087949,  98167.3495588 , 173976.20312604,
       116391.72815898,  48721.24977913,  98703.60466908, 155799.50522602,
        83170.9100649 , 193321.34343103, 188881.62393281, 163693.83927011,
       129123.17340829, 116686.52466738, 116391.72815898, 129218.98004997,
       115567.13437352, 129123.17340829, 112961.07382208,  97485.61160597,
        60875.10406774, 188881.62393281,  75063.94710207,  70422.60445166,
        75022.69207191, 171315.14087949, 146299.80323437, 131295.79320222,
        75063.94710207,  70551.48514181, 163693.83927011, 149413.8490298 ,
       134845.52354938, 131295.79320222, 193321.34343103,  89804.0356455 ,
       102129.68691995, 116754.23112994, 134236.64101991, 134236.64101991,
       110753.19950999,  91202.84834487, 110753.19950999,  98111.94129321,
       152502.10158276, 129218.98004997,  64563.00320539, 173976.20312604,
        97485.61160597, 181551.13594979, 149413.8490298 ,  71073.04408273,
       114694.60730979, 1

In [34]:
# Comparision btw ridge model prediction and lasso model prediction

profit = pd.DataFrame({'Actual Profit': y_test,
                       'Ridge_pred':pred1,
                       'Lasso_pred':pred2})
profit.head()

Unnamed: 0,Actual Profit,Ridge_pred,Lasso_pred
84,64926.08,54556.324167,48384.868147
10,146121.95,130017.921668,134845.523549
75,90708.19,84687.159471,76486.646416
2,191050.39,173295.222316,181551.13595
24,108552.04,108917.949578,112961.073822


In [36]:
### Evaluating Ridge and Lasso Models

In [35]:
from sklearn import metrics

In [40]:
# r2 score
print(metrics.r2_score(y_test, pred1)) # Ridge Model
print(metrics.r2_score(y_test, pred2)) # Lasso Model

0.9095565216441845
0.9259035724996549


In [41]:
# rmse

# Ridge Model
print(np.sqrt(metrics.mean_squared_error(y_test, pred1)))

# Lasso Model
print(np.sqrt(metrics.mean_squared_error(y_test, pred2)))

10825.266082933198
9798.25158543825
