In [20]:
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

I am using this data to build a model to predict cars that will be fuel efficient
EPA Site says average mpg for now is 24.7 lets say anything over 25 is fuel efficient for the purposes of this model

In [21]:
data = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\MPG_Prediction\mpg.csv')
# Drop ? entries from horsepower and convert from object to integers 
data = data[data['horsepower']!='?']
data['horsepower'] = data.loc[:,'horsepower'].apply(lambda x: int(x))
# Create binary indicator showing if cars are fuel efficient (eg. mpg >= 25)
data['Fuel_Efficient?'] = data.loc[:,'mpg'] >= 25

In [22]:
# Collect variables of interest for regression
Y = pd.DataFrame(data['mpg'])
X = pd.DataFrame(data[['cylinders','displacement','horsepower','weight','acceleration','model_year']])
len(X)

392

# First using linear regressions 

In [23]:
# initialize each regression
ols_reg = linear_model.LinearRegression()
rid_reg = linear_model.Ridge(alpha=10)
las_reg = linear_model.Lasso(alpha=10)

# Fit trendlines to data
ols_reg.fit(X,Y)
rid_reg.fit(X,Y)
las_reg.fit(X,Y)

Lasso(alpha=10, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
print('-'*100)
print('''FOR AN ORDINARY LEAST SQUARES REGRESSION:''')
print('-'*100)
print("Coefficients:\n",ols_reg.coef_)
print("\nIntercept:",ols_reg.intercept_)
print("\nR^2 Value:",ols_reg.score(X,Y))

print('-'*100)
print('''FOR A RIDGE REGRESSION:''')
print('-'*100)
print("Coefficients:\n",rid_reg.coef_)
print("\nIntercept:",rid_reg.intercept_)
print("\nR^2 Value:",rid_reg.score(X,Y))

print('-'*100)
print('''FOR A LASSO REGRESSION:''')
print('-'*100)
print("Coefficients:\n",las_reg.coef_)
print("\nIntercept:",las_reg.intercept_)
print("\nR^2 Value:",las_reg.score(X,Y))

----------------------------------------------------------------------------------------------------
FOR AN ORDINARY LEAST SQUARES REGRESSION:
----------------------------------------------------------------------------------------------------
Coefficients:
 [[-3.29859089e-01  7.67843024e-03 -3.91355574e-04 -6.79461791e-03
   8.52732469e-02  7.53367180e-01]]

Intercept: [-14.53525048]

R^2 Value: 0.8092552890383933
----------------------------------------------------------------------------------------------------
FOR A RIDGE REGRESSION:
----------------------------------------------------------------------------------------------------
Coefficients:
 [[-3.01912329e-01  7.23504980e-03 -4.39605739e-04 -6.79524484e-03
   8.46899991e-02  7.51637411e-01]]

Intercept: [-14.45457602]

R^2 Value: 0.8092511787858475
----------------------------------------------------------------------------------------------------
FOR A LASSO REGRESSION:
-------------------------------------------------------

 regression models all have decent fit but all could definitely be imporoved upon, lets try logistic regressions next

In [25]:
# Collect variables of interest for regression
Y = np.array(data['Fuel_Efficient?'])
X = pd.DataFrame(data[['cylinders','displacement','horsepower','weight','acceleration','model_year']])

In [29]:
# initialize each regression
log_reg = linear_model.LogisticRegression(solver='liblinear')
las_log_reg = linear_model.LogisticRegression(penalty='l1',solver='liblinear')
rid_log_reg = linear_model.LogisticRegression(C=1,penalty='l2',solver='liblinear')

# Fit trendlines to data
log_reg.fit(X,Y)
rid_log_reg.fit(X,Y)
las_log_reg.fit(X,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
print('-'*100)
print('''FOR A LOGISTIC REGRESSION:''')
print('-'*100)
print('Coefficients:')
print(log_reg.coef_)
print(log_reg.intercept_)
pred_1_sklearn = log_reg.predict(X)
print('\n Crosstab')
print(pd.crosstab(pred_1_sklearn, Y))
print('\n Percentage accuracy')
print(log_reg.score(X, Y))

print('-'*100)
print('''FOR A RIDGE LOGISTIC REGRESSION:''')
print('-'*100)
print('Coefficients:')
print(rid_log_reg.coef_)
print(rid_log_reg.intercept_)
pred_2_sklearn = rid_log_reg.predict(X)
print('\n Crosstab')
print(pd.crosstab(pred_2_sklearn, Y))
print('\n Percentage accuracy')
print(rid_log_reg.score(X, Y))

print('-'*100)
print('''FOR A LASSO LOGISTIC REGRESSION:''')
print('-'*100)
print('Coefficients:')
print(las_log_reg.coef_)
print(las_log_reg.intercept_)
pred_3_sklearn = las_log_reg.predict(X)
print('\n Crosstab')
print(pd.crosstab(pred_3_sklearn, Y))
print('\n Percentage accuracy')
print(las_log_reg.score(X, Y))

----------------------------------------------------------------------------------------------------
FOR A LOGISTIC REGRESSION:
----------------------------------------------------------------------------------------------------
Coefficients:
[[-0.01312513 -0.00512465 -0.09216219 -0.00271667 -0.32946454  0.28369541]]
[-0.02052523]

 Crosstab
col_0  False  True 
row_0              
False    207     16
True      19    150

 Percentage accuracy
0.9107142857142857
----------------------------------------------------------------------------------------------------
FOR A RIDGE LOGISTIC REGRESSION:
----------------------------------------------------------------------------------------------------
Coefficients:
[[-0.01312513 -0.00512465 -0.09216219 -0.00271667 -0.32946454  0.28369541]]
[-0.02052523]

 Crosstab
col_0  False  True 
row_0              
False    207     16
True      19    150

 Percentage accuracy
0.9107142857142857
----------------------------------------------------------------

The results between the regression models are pretty comparable - most likely due to the fact that this is a relatively small dataset and that all of the regression models produce accurate results for the model.  The effects from ridge and lasso regressions would most likely be more pronounced when working with a larger dataset

In [32]:
len(X)

392