# Lab | Comparing regression models

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed (None removes pandas' column cap).
pd.set_option('display.max_columns', None)

In [3]:
# Load the two pre-split feature tables and report their (rows, columns) shapes.
df_cat = pd.read_csv('categorical.csv')
df_num = pd.read_csv('numerical.csv')
print(df_cat.shape, df_num.shape, sep='\n')

(9129, 18)
(9129, 8)


In [4]:
# Place the two tables side by side (column-wise); rows are matched on the index.
df_full = pd.concat((df_cat, df_num), axis='columns')
df_full.shape

(9129, 26)

In [5]:
# Preview the first rows of the combined table.
df_full.head()

Unnamed: 0,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size,month,year,day,week,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,other,No,Basic,Bachelor,2011-02-24,Employed,F,Suburban,Married,Corporate,Offer1,Agent,Two-Door Car,Medsize,2,2011,3,8,2763.519279,56274.0,69,32,5,0,1,384.811147
1,Arizona,No,Extended,Bachelor,2011-01-31,Unemployed,F,Suburban,Single,Personal L3,Offer3,Agent,Four-Door Car,Medsize,1,2011,0,5,6979.535903,37657.380009,94,13,42,0,8,1131.464935
2,other,No,Premium,Bachelor,2011-02-19,Employed,F,Suburban,Married,Personal L3,Offer1,Agent,Two-Door Car,Medsize,2,2011,5,7,12887.43165,48767.0,108,18,38,0,2,566.472247
3,California,No,Basic,Bachelor,2011-01-20,Unemployed,M,Suburban,Married,Corporate,Offer1,Call Center,SUV,Medsize,1,2011,3,3,7645.861827,37657.380009,106,18,65,0,7,529.881344
4,other,No,Basic,Bachelor,2011-02-03,Employed,M,Rural,Single,Personal L1,Offer1,Agent,Four-Door Car,Medsize,2,2011,3,5,2813.692575,43836.0,73,12,44,0,1,138.130879


In [6]:
# Drop effective_to_date: the same information already lives in the
# month / year / day / week columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent, so re-running it does not raise KeyError once the column is gone.
df_full = df_full.drop(columns=['effective_to_date'], errors='ignore')

# 1. In this final lab, we will model our data. 
Import sklearn train_test_split and separate the data.

#### X-y Split

In [7]:
# Predictors are every column except the target; the target is total_claim_amount.
X = df_full.drop(columns='total_claim_amount')
y = df_full['total_claim_amount']

#### Train-Test-Split

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Separate X_train and X_test into numerical and categorical
(X_train_cat , X_train_num , X_test_cat , X_test_num)

In [9]:
# Numeric columns (these get scaled).
X_train_num = X_train.select_dtypes(include='number')
X_test_num = X_test.select_dtypes(include='number')

# Object-dtype columns (these get encoded).
X_train_cat = X_train.select_dtypes(include=object)
X_test_cat = X_test.select_dtypes(include=object)

# 3. Use X_train_num to fit scalers. 
Transform BOTH X_train_num and X_test_num.

In [10]:
# We use MinMax Scaler
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Create the scaler and fit it on the TRAINING numerical data only, so that no
# information from the test set leaks into the scaling parameters.
transformer = MinMaxScaler().fit(X_train_num)

In [12]:
# Apply the train-fitted scaler to both splits. transform() returns a plain array,
# so wrap each result back into a DataFrame carrying the original column names.
train_scaled_values = transformer.transform(X_train_num)
test_scaled_values = transformer.transform(X_test_num)

X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train_num.columns)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test_num.columns)

# 4. Encode the categorical variables X_train_cat and X_test_cat 
(See the hint below for encoding categorical data!!!)

## 4.1 Ordinal Encoding

In [13]:
# Categorical columns that are encoded as ordered integers rather than one-hot.
# NOTE(review): employmentstatus and location_code have no obvious natural order —
# confirm that treating them as ordinal is intended.
ordinal_list = ['coverage','employmentstatus', 'location_code', 'vehicle_size']

In [14]:
# Separate out the columns used for ordinal encoding; .copy() gives independent
# frames so the assignments that follow do not write into X_train_cat / X_test_cat.
X_train_ordin = X_train_cat.loc[:, ordinal_list].copy()
X_test_ordin = X_test_cat.loc[:, ordinal_list].copy()

In [15]:
# Encoding: one category -> rank lookup per ordinal column.
ordinal_maps = {
    "coverage": {"Basic" : 0, "Extended" : 1, "Premium" : 2},
    "employmentstatus": {"Unemployed" : 0, "other" : 1, "Employed" : 2},
    "location_code": {"Rural" : 0, "Suburban" : 1, "Urban" : 2},
    "vehicle_size": {"Small" : 0, "Medsize" : 1, "Large" : 2},
}

# Apply the same lookup to train and test so both splits share one encoding.
# Series.map yields NaN for any category missing from the lookup.
for ordinal_col, ranks in ordinal_maps.items():
    for ordinal_frame in (X_train_ordin, X_test_ordin):
        ordinal_frame[ordinal_col] = ordinal_frame[ordinal_col].map(ranks)

In [33]:
# Inspect the ordinal-encoded training columns.
X_train_ordin

Unnamed: 0,coverage,employmentstatus,location_code,vehicle_size
0,0,0,1,2
1,1,2,1,1
2,2,2,1,0
3,0,2,1,0
4,1,2,1,0
...,...,...,...,...
7298,1,0,1,1
7299,1,0,1,1
7300,0,2,2,1
7301,1,0,1,1


In [43]:
# X_train_ordin2 = X_train_cat[ordinal_list]['coverage'].copy()
# X_train_ordin2

In [42]:
from sklearn.preprocessing import OrdinalEncoder


# Alternative to the manual .map() encoding: sklearn's OrdinalEncoder.
# X_train_ordin2 must be defined here (its earlier definition is commented out, which
# made this cell fail with NameError on a fresh kernel). It holds exactly the two raw
# string columns whose category orders are listed below, in the same order.
X_train_ordin2 = X_train_cat[['coverage', 'employmentstatus']].copy()

# One ordered category list per input column, lowest rank first.
enc = OrdinalEncoder(categories=[['Basic','Extended','Premium'],['Unemployed','other','Employed']])

result = enc.fit_transform(X_train_ordin2)
result

## 4.2 OneHot Encoding

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Keep only the columns for one-hot encoding, i.e. every categorical column that
# is not in ordinal_list.
X_train_hot = X_train_cat.drop(columns=ordinal_list).copy()
X_test_hot = X_test_cat.drop(columns=ordinal_list).copy()

In [18]:
# Build the encoder, then fit it on the training categories only.
# drop='first' removes one dummy column per feature; handle_unknown='ignore'
# lets transform() accept categories that were not seen during fit.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
encoder.fit(X_train_hot)
# Names of the generated dummy columns, used as DataFrame headers later.
column_name = encoder.get_feature_names_out(X_train_hot.columns)

In [19]:
# Encoding: transform each split, densify the result with .toarray(), and rebuild a
# DataFrame using the encoder's feature names. The rebuilt frames get a default
# 0..n-1 index rather than the index of the original rows.
# NOTE: this overwrites X_train_hot / X_test_hot with their encoded versions, so the
# cell is not idempotent — re-run the cell that creates the raw frames first.
X_train_hot = pd.DataFrame(encoder.transform(X_train_hot).toarray(), columns = column_name)
X_test_hot = pd.DataFrame(encoder.transform(X_test_hot).toarray(), columns = column_name)

# 5. Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

#### Concatenating our dataframes

In [20]:
# Compare the first row of each frame: their index labels differ, which is why the
# indexes have to be reset before concatenating.
display(X_train_ordin.head(1), X_train_hot.head(1))

Unnamed: 0,coverage,employmentstatus,location_code,vehicle_size
4041,0,0,1,2


Unnamed: 0,state_California,state_Oregon,state_other,response_Yes,education_College,education_High School or Below,education_higher education,gender_M,marital_status_Married,marital_status_Single,policy_Personal L1,policy_Personal L2,policy_Personal L3,policy_Special,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_SUV,vehicle_class_Two-Door Car
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [21]:
# The one-hot frames were rebuilt from arrays and therefore carry a fresh 0..n-1
# index. Give every other frame the same index so the column-wise concat lines up.
X_train_scaled = X_train_scaled.reset_index(drop=True)
X_test_scaled = X_test_scaled.reset_index(drop=True)

X_train_ordin = X_train_ordin.reset_index(drop=True)
X_test_ordin = X_test_ordin.reset_index(drop=True)

In [22]:
# Assemble the final feature matrices: scaled numericals + ordinal codes + one-hot dummies.
X_train_processed = pd.concat([X_train_scaled, X_train_ordin, X_train_hot], axis=1)
X_test_processed = pd.concat([X_test_scaled, X_test_ordin, X_test_hot], axis=1)

# A column-wise concat outer-joins on the index: if the pieces were not index-aligned,
# extra NaN-padded rows would appear silently. Fail loudly here instead.
assert len(X_train_processed) == len(y_train), "train features/target row mismatch - were the indexes reset?"
assert len(X_test_processed) == len(y_test), "test features/target row mismatch - were the indexes reset?"

In [23]:
# Verify that every column of the processed training matrix has a numeric dtype.
X_train_processed.dtypes

month                             float64
year                              float64
day                               float64
week                              float64
customer_lifetime_value           float64
income                            float64
monthly_premium_auto              float64
months_since_last_claim           float64
months_since_policy_inception     float64
number_of_open_complaints         float64
number_of_policies                float64
coverage                            int64
employmentstatus                    int64
location_code                       int64
vehicle_size                        int64
state_California                  float64
state_Oregon                      float64
state_other                       float64
response_Yes                      float64
education_College                 float64
education_High School or Below    float64
education_higher education        float64
gender_M                          float64
marital_status_Married            

# 6. Try a simple linear regression with all the data to see whether we are getting good results.

In [24]:
from sklearn import linear_model
# Baseline model: ordinary least-squares linear regression with default settings.
lm = linear_model.LinearRegression()

In [25]:
# Fit on the processed training data, then predict the held-out test rows.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = lm.fit(X_train_processed, y_train).predict(X_test_processed)

#### Assessing the model

In [26]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error

In [27]:
# Evaluate the fitted linear model on both splits.
# All metrics are rounded to 2 decimals for display, and RMSE is derived from the
# UNROUNDED MSE so that rounding error is not carried into the square root.

# Training Data
predictions = lm.predict(X_train_processed)
# R2-Score
r2score = round(r2_score(y_train, predictions), 2)
# Mean Square Error
mse_train_raw = mean_squared_error(y_train, predictions)
mse = round(mse_train_raw, 2)
# Root Mean Square Error
rmse = round(np.sqrt(mse_train_raw), 2)
# Mean Absolute Error
mae = round(mean_absolute_error(y_train, predictions), 2)

# Test Data
predictions = lm.predict(X_test_processed)
# R2-Score
r2scoret = round(r2_score(y_test, predictions), 2)
# Mean Square Error
mse_test_raw = mean_squared_error(y_test, predictions)
mset = round(mse_test_raw, 2)
# Root Mean Square Error
rmset = round(np.sqrt(mse_test_raw), 2)
# Mean Absolute Error
maet = round(mean_absolute_error(y_test, predictions), 2)

In [28]:
# Train and test scores side by side, one row per split.
metrics_table = pd.DataFrame({
    'ValType': ['Train', 'Test'],
    'R2-Score': [r2score, r2scoret],
    'MSE': [mse, mset],
    'RMSE': [rmse, rmset],
    'MAE': [mae, maet],
})
display(metrics_table)

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.63,30732.0,175.31,127.67
1,Test,0.62,33571.52,183.23,135.27


# 7. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

In [29]:
def _score_split(model, features, target, ndigits=2):
    """Return ``(R2, MSE, RMSE, MAE)`` of a fitted ``model`` on one data split.

    Every metric is rounded to ``ndigits`` decimals. RMSE is computed from the
    unrounded MSE so that rounding error is not carried into the square root.
    """
    predictions = model.predict(features)
    mse_raw = mean_squared_error(target, predictions)
    return (
        round(r2_score(target, predictions), ndigits),
        round(mse_raw, ndigits),
        round(np.sqrt(mse_raw), ndigits),
        round(mean_absolute_error(target, predictions), ndigits),
    )


def model_test(modellist):
    """Fit each model on the training data and display its train/test scores.

    Reads the notebook-level ``X_train_processed``, ``y_train``,
    ``X_test_processed`` and ``y_test``. For every estimator in ``modellist``
    the estimator is fitted in place, then its repr and a two-row table
    (Train / Test) of R2, MSE, RMSE and MAE are displayed.

    Parameters
    ----------
    modellist : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    None
        Results are shown with ``display`` only.
    """
    for model in modellist:
        model.fit(X_train_processed, y_train)

        # Score both splits with the same helper so the rounding is identical.
        r2score, mse, rmse, mae = _score_split(model, X_train_processed, y_train)
        r2scoret, mset, rmset, maet = _score_split(model, X_test_processed, y_test)

        display(model)
        display(pd.DataFrame({'ValType': ['Train', 'Test'], 'R2-Score': [r2score, r2scoret], 'MSE': [mse, mset] , 'RMSE': [rmse, rmset], 'MAE': [mae, maet]}))

In [46]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor



from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Candidate regressors to compare.
# Estimators with internal randomness get a fixed random_state (the same seed as the
# train/test split) so that their scores are reproducible from run to run.
modellist = [
    GradientBoostingRegressor(random_state=42),
    ElasticNet(),
    SGDRegressor(random_state=42),
    SVR(),
    BayesianRidge(),
    KernelRidge(),
    LGBMRegressor(random_state=42),
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=3),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    MLPRegressor(random_state=42),
]

In [47]:
# Show the configured estimators (parameters left at their defaults are omitted from the repr).
print(modellist)

[GradientBoostingRegressor(), ElasticNet(), SGDRegressor(), SVR(), BayesianRidge(), KernelRidge(), LGBMRegressor(), LinearRegression(), KNeighborsRegressor(n_neighbors=3), KNeighborsRegressor(), KNeighborsRegressor(n_neighbors=10), MLPRegressor()]


# 8. Use the function to check LinearRegression and KNeighborsRegressor.

In [48]:
# Fit and score every candidate; one estimator repr plus one metrics table is shown per model.
model_test(modellist)

GradientBoostingRegressor()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.86,11508.0,107.28,73.01
1,Test,0.84,14212.89,119.22,79.59


ElasticNet()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.32,56498.0,237.69,164.9
1,Test,0.3,61174.08,247.33,172.11


SGDRegressor()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.63,31025.0,176.14,127.93
1,Test,0.61,34091.9,184.64,135.6


SVR()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.26,61920.0,248.84,164.4
1,Test,0.24,66439.86,257.76,169.92


BayesianRidge()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.63,30736.0,175.32,127.66
1,Test,0.62,33600.57,183.3,135.25


KernelRidge()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.6,32976.0,181.59,133.42
1,Test,0.59,35500.59,188.42,140.85


LGBMRegressor()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.91,7349.0,85.73,59.47
1,Test,0.85,13485.01,116.12,76.87


LinearRegression()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.63,30732.0,175.31,127.67
1,Test,0.62,33571.52,183.23,135.27


KNeighborsRegressor(n_neighbors=3)

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.79,17564.0,132.53,80.71
1,Test,0.51,42563.3,206.31,123.26


KNeighborsRegressor()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.73,22143.0,148.81,91.39
1,Test,0.57,37456.72,193.54,117.53


KNeighborsRegressor(n_neighbors=10)

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.67,27113.0,164.66,102.78
1,Test,0.59,36197.39,190.26,119.02




MLPRegressor()

Unnamed: 0,ValType,R2-Score,MSE,RMSE,MAE
0,Train,0.72,22953.0,151.5,101.53
1,Test,0.71,25373.89,159.29,106.9
