<a href="https://colab.research.google.com/github/Mjcherono/TrialProjects/blob/main/Python_Programming_Elastic_Net_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: Elastic Net Regression 

## Example

In [125]:
# Example 1
# ---
# Use the fair dataset from the pydataset library to predict marriage satisfaction based on the given variables.
# ---
# 
!pip install pydataset



In [126]:
# Importing our libraries
# 
from pydataset import data
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [127]:
# Data preparation
# 
df=pd.DataFrame(data('Fair'))
df.loc[df.sex== 'male', 'sex'] = 0
df.loc[df.sex== 'female','sex'] = 1
df['sex'] = df['sex'].astype(int)
df.loc[df.child== 'no', 'child'] = 0
df.loc[df.child== 'yes','child'] = 1
df['child'] = df['child'].astype(int)
X=df[['religious','age','sex','ym','education','occupation','nbaffairs']]
y=df['rate']

In [128]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

# This mean standard error score of 1.05 is our benchmark for determining 
# if the elastic net model will be better or worst. 

1.049873864469667


In [129]:
# Below are the coefficients of this first model. We use a for loop to go through 
# the model and the zip function to combine the two columns.
# 
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baselinecoef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'age': -0.00905964542867384,
 'education': 0.06810255742293703,
 'nbaffairs': -0.07882571247653965,
 'occupation': -0.00597950685299818,
 'religious': 0.04235281110639179,
 'sex': 0.0888201333708709,
 'ym': -0.030458802565476555}

In [130]:
# Elastic Net Model
# Elastic net, just like ridge and lasso regression, requires normalize data. 
# This argument  is set inside the ElasticNet function. 
# The second thing we need to do is create our grid.
# 
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [131]:
# We will now fit our model and display the best parameters and the best results we can get with that setup.
# 
search.fit(X,y)
search.best_params_
abs(search.best_score_)

1.0819158709244472

In [132]:
elastic=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

1.0566430678343806


In [133]:
# Below are the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

# The coefficients are mostly the same. 
# Notice that occupation was completely removed from the model in the elastic net version. 
# This means that this values was no good to the algorithm. Traditional regression cannot do this.

{'age': -0.008630896492807693,
 'education': 0.04429085595448633,
 'nbaffairs': -0.06679513627963515,
 'occupation': -0.0,
 'religious': 0.019475417249578564,
 'sex': 0.01811646456809079,
 'ym': -0.02422483127451297}

## Challenges

### <font color="green">Challenge 1</font>

In [158]:
# Challenge 1
# ---
# Question: Using the given housiet, create a regression model to predict 
# the value of prices of a house using the given features. 
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
#

housiet = pd.read_csv('http://bit.ly/BostonHousingDataset')
housiet.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [159]:
housiet['Prices'] = housiet['tax'] * housiet['ptratio']
housiet.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,Prices
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,4528.8
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,4307.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,4307.6
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,4151.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,4151.4


In [160]:
housiet.dtypes

crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv       float64
Prices     float64
dtype: object

In [161]:
#Splitting
X = housiet.iloc[:,0:14]
y = housiet['Prices']


In [162]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model) 

18657.889050348454


In [163]:
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'age': 0.3005428132458301,
 'b': 0.13429051541081094,
 'chas': -30.094494357915412,
 'crim': -0.2381931213522644,
 'dis': -25.10051822782158,
 'indus': -0.30187858463276696,
 'lstat': 6.904054043483076,
 'medv': 2.900119935837742,
 'nox': -538.8065092291724,
 'ptratio': 342.86408386774986,
 'rad': 21.615770835237395,
 'rm': 38.79155601028663,
 'tax': 18.87685987278246,
 'zn': 1.9475566590360849}

In [164]:
#Elastic net model, create grid
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [168]:
#fit model and display the best parameters 
search.fit(X,y)
search.best_params_


{'alpha': 1e-05, 'l1_ratio': 0.8}

In [169]:
abs(search.best_score_)

38036.586669441844

In [170]:
elastic=ElasticNet(normalize=True,alpha=1e-05,l1_ratio=0.75)
elastic.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(second_model)

18789.856504659874


In [171]:
#Checking the coefficients
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'age': 0.3005428132458301,
 'b': 0.13429051541081094,
 'chas': -30.094494357915412,
 'crim': -0.2381931213522644,
 'dis': -25.10051822782158,
 'indus': -0.30187858463276696,
 'lstat': 6.904054043483076,
 'medv': 2.900119935837742,
 'nox': -538.8065092291724,
 'ptratio': 342.86408386774986,
 'rad': 21.615770835237395,
 'rm': 38.79155601028663,
 'tax': 18.87685987278246,
 'zn': 1.9475566590360849}

### <font color="green">Challenge 2</font>

In [172]:
# Challenge 2
# ---
# Question: Using the Ames Housing dataset, create a regression model to predict the sales price of home 
# applying elastic net regression.
# ---
# Dataset Source = http://bit.ly/HousePricesDataset
# 
house_train = pd.read_csv('/content/train.csv')

In [173]:
house_train.shape

(1460, 81)

In [145]:
house_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'P

In [146]:
house_train.drop(['Id'], axis=1,inplace=True)

In [149]:
X = house_train.drop(['SalePrice'],axis=1)
y = house_train['SalePrice']

In [151]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
#regression.fit(X,y)
#first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
#print(first_model)

### <font color="green">Challenge 3</font>

In [None]:
# Challenge 3
# ---
# Question: Given the medical cost personal dataset, accurately predict insurance cost using a regression model.
# ---
# Dataset Source = http://bit.ly/https://bit.ly/insurance-_dataset
# 
insurance = pd.read_csv('http://bit.ly/https://bit.ly/insurance-_dataset')
insurance.head()

### <font color="green">Challenge 4</font>

In [None]:
# Challenge 4
# ---
# Question: Use ElasticNet regression to build a model that is able to accurately predict the profits of a startup.
# ---
# Dataset Source = http://bit.ly/StartupsDataset
# ---
# 
OUR CODE STARTS HERE

### <font color="green">Challenge 5</font>

In [174]:
# Challenge 5
# ---
# Question: Build a prediction model to predict duration for any combination of country,operator, 
# services and category given the genre,language and number of units. 
# Apply ElasticNet regression while building your model. 
# ---
# Dataset Source = https://bit.ly/Audio_content_consumption
# ---
# 
duration = pd.read_csv('/content/data.csv', encoding='latin')
duration.head()

Unnamed: 0,language_name,genre,service,country,mobile_operator,usage_category,number_of_units,duration
0,afrikaans,pop,music on demand,chad,airtel,int,2,9
1,arabic,inspirational,music on demand,zambia,airtel,ramadanc,3,181
2,arabic,islamic,music on demand,chad,airtel,islam,61,2102
3,arabic,islamic,music on demand,chad,airtel,quran,11,179
4,arabic,world,music on demand,zambia,airtel,ramadanc,3,24


In [175]:
from sklearn.preprocessing import LabelEncoder
categorical_features = ['language_name', 'genre', 'service','country','mobile_operator','usage_category']
le = LabelEncoder()

# Converting the variables to numerical
#
for i in range(6):
    new = le.fit_transform(duration[categorical_features[i]])
    duration[categorical_features[i]] = new
duration.head()

Unnamed: 0,language_name,genre,service,country,mobile_operator,usage_category,number_of_units,duration
0,0,48,1,0,0,10,2,9
1,1,31,1,2,0,21,3,181
2,1,34,1,0,0,11,61,2102
3,1,34,1,0,0,20,11,179
4,1,62,1,2,0,21,3,24


In [176]:
X = duration.iloc[:,0:7]
y = duration.iloc[:,7]

In [177]:
X.head()

Unnamed: 0,language_name,genre,service,country,mobile_operator,usage_category,number_of_units
0,0,48,1,0,0,10,2
1,1,31,1,2,0,21,3
2,1,34,1,0,0,11,61
3,1,34,1,0,0,20,11
4,1,62,1,2,0,21,3


In [178]:
# Creating our linear regression model for the purpose of comparison
# 
regression=LinearRegression()
regression.fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model)

993723097.0287952


In [179]:
#coefficients of the first model
coef_dict_baseline = {}
for coef, feat in zip(regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'country': -346.725121908944,
 'genre': 100.41628839739809,
 'language_name': -55.667724267781836,
 'mobile_operator': -39644.494395721966,
 'number_of_units': 41.84253902131246,
 'service': -39931.18481793762,
 'usage_category': 150.23883256371093}

In [180]:
#Applying elastic model
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0)

ENreg = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)

ENreg.fit(X_train,y_train)

ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [181]:
#Making y prediction
y_pred = ENreg.predict(X_test)
y_pred

array([ -4280.74590281,  -5053.41158887,  13037.9860409 ,   3325.4145027 ,
         2054.86149613,  -3563.41387844,  -2030.99921043,   1843.12742294,
         3789.4982734 ,    803.85541612,  -2931.03059154,   1150.5965335 ,
        21608.72885009,   1537.83948477,   2947.43918616,  -4395.92766107,
        11002.91875688,  22671.8395389 ,   -196.88286481,    517.19858052,
         -339.19889765,   -441.33326452,  -1197.95948202,   2414.8829387 ,
       179868.3410046 ,  26218.87656744,  -3976.44593385, 102537.03227622,
        -3147.54550563,  -3351.50028797,   6365.32384542,  -3203.72505104,
          917.49719592,  -2081.84821164,  35768.72583691,  -4631.79796941,
         -540.93821707,  -1234.16525895,  33119.97586803,  -2857.65504244,
        -2224.86866966,  -1813.23634657,   2107.73151893])

In [182]:
#calculating mse

mse = np.mean((y_pred - y_test)**2)
mse


168948618.64337328

In [183]:
#elastic net reg score
ENreg.score(X_test,y_test)


0.9083290251109398

In [184]:
# Below are the coefficients
# 
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'country': -34.96377156345572,
 'genre': 2.101635296850346,
 'language_name': -0.2723425672947244,
 'mobile_operator': -532.1245043142961,
 'number_of_units': 0.3264399469343557,
 'service': 1.2056132072647885,
 'usage_category': 38.40044156295092}