<a href="https://colab.research.google.com/github/GaneshPechetti/FML/blob/main/Lasso_MultiVarRegression_50Startups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Lasso

In [None]:
# Load the 50-startups dataset and take a first look at its structure.
# NOTE: the path below is an absolute Colab-style location; adjust it when
# running the notebook anywhere else.
data = pd.read_csv("/content/50_Startups.csv")
print(data.shape)
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.head())

(50, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB
None
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [None]:
# Encode the State column as integer codes.
# The codes are arbitrary labels for the state names; the 1 < 2 < 3 ordering
# they imply carries no meaning of its own.
mapper = {"New York": 1, "California": 2, "Florida": 3}

# Series.map is used instead of Series.replace: replacing strings with ints
# relies on an implicit object -> int downcast that pandas 2.2+ deprecates
# (FutureWarning). The dtype guard keeps the cell safe to re-run once the
# column has already been encoded.
if not pd.api.types.is_numeric_dtype(data["State"]):
    # map() yields NaN for a state missing from `mapper`, and the int64 cast
    # then fails loudly instead of leaving raw strings in a feature column.
    data["State"] = data["State"].map(mapper).astype("int64")
print(data.head())


   R&D Spend  Administration  Marketing Spend  State     Profit
0  165349.20       136897.80        471784.10      1  192261.83
1  162597.70       151377.59        443898.53      2  191792.06
2  153441.51       101145.55        407934.54      3  191050.39
3  144372.41       118671.85        383199.62      1  182901.99
4  142107.34        91391.77        366168.42      3  166187.94


In [None]:
# Count the missing entries in every column (isna is the canonical name; isnull is its alias)
data.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [None]:
# Pairwise correlation between all columns; Pearson is pandas' default method, spelled out here
data.corr(method="pearson")

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
R&D Spend,1.0,0.241955,0.724248,0.03793,0.9729
Administration,0.241955,1.0,-0.032154,0.003026,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.137777,0.747766
State,0.03793,0.003026,0.137777,1.0,0.048471
Profit,0.9729,0.200717,0.747766,0.048471,1.0


In [None]:
# Feature matrix that keeps the encoded State column, plus the Profit target vector
X = data.drop(columns="Profit").to_numpy()
y = data["Profit"].to_numpy()
print(X.shape, y.shape)

# Second feature matrix with the State column left out as well
X_c = data.drop(columns=["Profit", "State"]).to_numpy()
print(X_c.shape)

(50, 4) (50,)
(50, 3)


In [None]:
# Turn the target into a single-column 2-D array and display its new shape
y = np.reshape(y, (-1, 1))
y.shape

(50, 1)

**With State Column**

---


In [None]:
# Split the data (State column included) into training and test sets.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, repeatable on Restart & Run All; without it each run draws
# a different split. test_size is left at the library default.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(37, 4) (37, 1) (13, 4) (13, 1)


In [None]:
# Build a Lasso regressor (alpha=0.1) and fit it on the training split that includes State
from sklearn import linear_model
model = linear_model.Lasso(alpha=0.1)
model.fit(train_X, train_y)

In [None]:
# Score the fitted model on the same rows it was trained on
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
pred_y = model.predict(train_X)
mae = mean_absolute_error(train_y, pred_y)
mse = mean_squared_error(train_y, pred_y)  # computed once; RMSE is its square root
r2 = r2_score(train_y, pred_y)
print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {mse**0.5}")
print(f"R_2 : {r2}")

MAE : 6756.319406468248
MSE : 86925496.12887844
RMSE : 9323.384370971651
R_2 : 0.9490032040181675


In [None]:
# Score the fitted model on the held-out test rows
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
pred_y = model.predict(test_X)
mae = mean_absolute_error(test_y, pred_y)
mse = mean_squared_error(test_y, pred_y)  # computed once; RMSE is its square root
r2 = r2_score(test_y, pred_y)
print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {mse**0.5}")
print(f"R_2 : {r2}")

MAE : 6290.765055283262
MSE : 62416359.66478861
RMSE : 7900.40250017609
R_2 : 0.9509342056981636


**Without State Column**

---



In [None]:
# Split the data (State column excluded) into training and test sets.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, repeatable on Restart & Run All; without it each run draws
# a different split. test_size is left at the library default.
train_Xc, test_Xc, train_yc, test_yc = train_test_split(X_c, y, random_state=42)
print(train_Xc.shape, train_yc.shape, test_Xc.shape, test_yc.shape)

(37, 3) (37, 1) (13, 3) (13, 1)


In [None]:
# Build a second Lasso regressor (alpha=0.1) and fit it on the training split without State
from sklearn import linear_model
model2 = linear_model.Lasso(alpha=0.1)
model2.fit(train_Xc, train_yc)

In [None]:
# Score the State-free model on the same rows it was trained on
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
pred_y = model2.predict(train_Xc)
mae = mean_absolute_error(train_yc, pred_y)
mse = mean_squared_error(train_yc, pred_y)  # computed once; RMSE is its square root
r2 = r2_score(train_yc, pred_y)
print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {mse**0.5}")
print(f"R_2 : {r2}")

MAE : 7052.29082392313
MSE : 89698574.192593
RMSE : 9470.933121535227
R_2 : 0.9470678740114679


In [None]:
# Score the State-free model on the held-out test rows
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
pred_y = model2.predict(test_Xc)
mae = mean_absolute_error(test_yc, pred_y)
mse = mean_squared_error(test_yc, pred_y)  # computed once; RMSE is its square root
r2 = r2_score(test_yc, pred_y)
print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {mse**0.5}")
print(f"R_2 : {r2}")

MAE : 4926.2747120998065
MSE : 48222967.92738179
RMSE : 6944.275910948656
R_2 : 0.9571357323905926
