## Get the data 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data_insy695.csv')
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


## Observations

In [3]:
df.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount',
       'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [4]:
df.isna().sum() #No NA values 

Customer                         0
State                            0
Customer Lifetime Value          0
Response                         0
Coverage                         0
Education                        0
Effective To Date                0
EmploymentStatus                 0
Gender                           0
Income                           0
Location Code                    0
Marital Status                   0
Monthly Premium Auto             0
Months Since Last Claim          0
Months Since Policy Inception    0
Number of Open Complaints        0
Number of Policies               0
Policy Type                      0
Policy                           0
Renew Offer Type                 0
Sales Channel                    0
Total Claim Amount               0
Vehicle Class                    0
Vehicle Size                     0
dtype: int64

In [5]:
df.info() #we check the datatype of variables in the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9134 entries, 0 to 9133
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer                       9134 non-null   object 
 1   State                          9134 non-null   object 
 2   Customer Lifetime Value        9134 non-null   float64
 3   Response                       9134 non-null   object 
 4   Coverage                       9134 non-null   object 
 5   Education                      9134 non-null   object 
 6   Effective To Date              9134 non-null   object 
 7   EmploymentStatus               9134 non-null   object 
 8   Gender                         9134 non-null   object 
 9   Income                         9134 non-null   int64  
 10  Location Code                  9134 non-null   object 
 11  Marital Status                 9134 non-null   object 
 12  Monthly Premium Auto           9134 non-null   i

In [6]:
#Looking at categorical data to see their distribution in terms of count per class: 

In [7]:
df['State'].value_counts() 

California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     798
Name: State, dtype: int64

In [8]:
df['Response'].value_counts()

No     7826
Yes    1308
Name: Response, dtype: int64

In [9]:
df['Coverage'].value_counts()

Basic       5568
Extended    2742
Premium      824
Name: Coverage, dtype: int64

In [10]:
df['Education'].value_counts()

Bachelor                2748
College                 2681
High School or Below    2622
Master                   741
Doctor                   342
Name: Education, dtype: int64

In [11]:
df['Policy'].value_counts()

Personal L3     3426
Personal L2     2122
Personal L1     1240
Corporate L3    1014
Corporate L2     595
Corporate L1     359
Special L2       164
Special L3       148
Special L1        66
Name: Policy, dtype: int64

In [12]:
df['Policy Type'].value_counts()

Personal Auto     6788
Corporate Auto    1968
Special Auto       378
Name: Policy Type, dtype: int64

In [13]:
df.describe() #We can see that the fetaures are in different scales 

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
count,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0
mean,8004.940475,37657.380009,93.219291,15.097,48.064594,0.384388,2.96617,434.088794
std,6870.967608,30379.904734,34.407967,10.073257,27.905991,0.910384,2.390182,290.500092
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0,0.099007
25%,3994.251794,0.0,68.0,6.0,24.0,0.0,1.0,272.258244
50%,5780.182197,33889.5,83.0,14.0,48.0,0.0,2.0,383.945434
75%,8962.167041,62320.0,109.0,23.0,71.0,0.0,4.0,547.514839
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0,2893.239678


In [14]:
df.corr() #Correlations between categorical variables 

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
Customer Lifetime Value,1.0,0.024366,0.396262,0.011517,0.009418,-0.036343,0.021955,0.226451
Income,0.024366,1.0,-0.016665,-0.026715,-0.000875,0.006408,-0.008656,-0.355254
Monthly Premium Auto,0.396262,-0.016665,1.0,0.005026,0.020257,-0.013122,-0.011233,0.632017
Months Since Last Claim,0.011517,-0.026715,0.005026,1.0,-0.042959,0.005354,0.009136,0.007563
Months Since Policy Inception,0.009418,-0.000875,0.020257,-0.042959,1.0,-0.001158,-0.013333,0.003335
Number of Open Complaints,-0.036343,0.006408,-0.013122,0.005354,-0.001158,1.0,0.001498,-0.014241
Number of Policies,0.021955,-0.008656,-0.011233,0.009136,-0.013333,0.001498,1.0,-0.002354
Total Claim Amount,0.226451,-0.355254,0.632017,0.007563,0.003335,-0.014241,-0.002354,1.0


## Prepare the data for Machine Learning algorithms


In [29]:
df1 = df.copy()

In [30]:
#Remove irrelevant features 
df1 = df1.drop("Customer", axis = 1)
df1 = df1.drop("Effective To Date", axis = 1)

In [31]:
#We remove either policy or policy type (because highly correlated) as we learned 
df1 = df1.drop("Policy Type", axis = 1)

In [32]:
#Do some feature engineering an retrieve the month and day ??

In [33]:
#make sure there is not NA values 
df1 = df1.dropna() #We have no missing value (not imputer needed)

In [34]:
df1.columns

Index(['State', 'Customer Lifetime Value', 'Response', 'Coverage', 'Education',
       'EmploymentStatus', 'Gender', 'Income', 'Location Code',
       'Marital Status', 'Monthly Premium Auto', 'Months Since Last Claim',
       'Months Since Policy Inception', 'Number of Open Complaints',
       'Number of Policies', 'Policy', 'Renew Offer Type', 'Sales Channel',
       'Total Claim Amount', 'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [39]:
categorical = df1.select_dtypes('object').columns.to_list()

In [40]:
#Encode categorical 
for i in categorical: 
    df1 = pd.get_dummies(df1, columns=[i])

In [41]:
df1.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount,State_Arizona,State_California,...,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Large,Vehicle Size_Medsize,Vehicle Size_Small
0,2763.519279,56274,69,32,5,0,1,384.811147,0,0,...,0,0,0,0,0,0,1,0,1,0
1,6979.535903,0,94,13,42,0,8,1131.464935,1,0,...,0,1,0,0,0,0,0,0,1,0
2,12887.43165,48767,108,18,38,0,2,566.472247,0,0,...,0,0,0,0,0,0,1,0,1,0
3,7645.861827,0,106,18,65,0,7,529.881344,0,1,...,0,0,0,0,1,0,0,0,1,0
4,2813.692575,43836,73,12,44,0,1,138.130879,0,0,...,0,1,0,0,0,0,0,0,1,0


## Relationship of variables to the Customer Lifetime Value 

In [43]:
cor = df1.corr()
cor

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount,State_Arizona,State_California,...,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Large,Vehicle Size_Medsize,Vehicle Size_Small
Customer Lifetime Value,1.000000,0.024366,0.396262,0.011517,0.009418,-0.036343,0.021955,0.226451,-0.010006,-0.000137,...,-0.013499,-0.202246,0.177522,0.190285,0.175592,0.094543,-0.099036,-0.022755,0.010246,0.005708
Income,0.024366,1.000000,-0.016665,-0.026715,-0.000875,0.006408,-0.008656,-0.355254,-0.003971,-0.002351,...,0.005062,0.004707,0.005102,-0.009720,-0.019491,-0.017324,0.024617,-0.018883,0.015847,-0.003762
Monthly Premium Auto,0.396262,-0.016665,1.000000,0.005026,0.020257,-0.013122,-0.011233,0.632017,-0.018566,0.005819,...,-0.004722,-0.464553,0.465811,0.499941,0.387442,0.197032,-0.241546,-0.017340,0.013996,-0.002812
Months Since Last Claim,0.011517,-0.026715,0.005026,1.000000,-0.042959,0.005354,0.009136,0.007563,0.000023,-0.001042,...,-0.009368,0.005973,0.005106,0.002565,-0.019258,0.000585,0.008648,-0.007268,0.015296,-0.012090
Months Since Policy Inception,0.009418,-0.000875,0.020257,-0.042959,1.000000,-0.001158,-0.013333,0.003335,-0.004856,-0.010809,...,0.002433,-0.004336,0.018329,0.013408,-0.018470,0.018596,0.002551,0.000449,-0.006390,0.007047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vehicle Class_Sports Car,0.094543,-0.017324,0.197032,0.000585,0.018596,-0.013982,0.006415,0.101023,0.002209,0.010370,...,0.012198,-0.239359,-0.031885,-0.033917,-0.117025,1.000000,-0.120664,-0.001808,0.027389,-0.030299
Vehicle Class_Two-Door Car,-0.099036,0.024617,-0.241546,0.008648,0.002551,-0.002661,0.002694,-0.146712,-0.007387,-0.012186,...,-0.014276,-0.516175,-0.068760,-0.073141,-0.252363,-0.120664,1.000000,0.022785,-0.026310,0.012859
Vehicle Size_Large,-0.022755,-0.018883,-0.017340,-0.007268,0.000449,0.006855,-0.020595,-0.009391,0.000574,-0.000183,...,-0.008396,-0.002582,-0.002393,-0.002702,-0.017186,-0.001808,0.022785,1.000000,-0.523329,-0.166292
Vehicle Size_Medsize,0.010246,0.015847,0.013996,0.015296,-0.006390,-0.002714,0.006049,-0.074274,0.008166,0.009371,...,0.015734,-0.006222,-0.015639,-0.007520,0.027053,0.027389,-0.026310,-0.523329,1.000000,-0.753241


In [45]:
#observation of linear relationships with correlation coefficients 
threshold = 0.5
a = abs(cor['Customer Lifetime Value'])
result = a[a>0.1]
result

Customer Lifetime Value        1.000000
Monthly Premium Auto           0.396262
Total Claim Amount             0.226451
Coverage_Basic                 0.148086
Coverage_Premium               0.132485
Vehicle Class_Four-Door Car    0.202246
Vehicle Class_Luxury Car       0.177522
Vehicle Class_Luxury SUV       0.190285
Vehicle Class_SUV              0.175592
Name: Customer Lifetime Value, dtype: float64

## Splitting the data : train and test sets 

In [46]:
X = df1.drop(columns=["Customer Lifetime Value"])
y = df1["Customer Lifetime Value"]

In [47]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [48]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_scaled ,y,test_size=0.30,random_state=0)

In [49]:
X_train = pd.DataFrame(X_train,columns = X.columns)
X_test = pd.DataFrame(X_test,columns = X.columns)

## Feature Selection

In [50]:
#Recursive Feature Elimination Method
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0)
rfe = RFE(rf, n_features_to_select=25)
model_l = rfe.fit(X_train, y_train)
model_l_df = pd.DataFrame(list(zip(X.columns,model_l.ranking_)), columns = ['predictor','ranking'])
model_l_df

Unnamed: 0,predictor,ranking
0,Income,1
1,Monthly Premium Auto,1
2,Months Since Last Claim,1
3,Months Since Policy Inception,1
4,Number of Open Complaints,1
...,...,...
56,Vehicle Class_Sports Car,26
57,Vehicle Class_Two-Door Car,29
58,Vehicle Size_Large,7
59,Vehicle Size_Medsize,1


In [60]:
notgood = model_l_df[model_l_df['ranking'] !=1 ]
notgood

Unnamed: 0,predictor,ranking
7,State_Arizona,4
9,State_Nevada,15
10,State_Oregon,9
13,Response_Yes,25
14,Coverage_Basic,2
15,Coverage_Extended,22
16,Coverage_Premium,28
18,Education_College,5
19,Education_Doctor,24
21,Education_Master,17


In [61]:
to_drop = notgood['predictor'].to_list()

In [64]:
#Removing the features that are useless to our model

for i in to_drop:
     X_train = X_train.drop(columns = [i])

In [66]:
for i in to_drop:
     X_test = X_test.drop(columns = [i])

## Train model : Simple Linear Regression

In [67]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [68]:
lin_reg.coef_

array([ 163.87723762, 2771.99898642,   47.90541277,  -35.64025576,
       -222.52757004,  112.73463413,  -72.54213201,   68.32674703,
         36.39169412,  180.45342837,   32.4182528 ,  194.74908294,
         40.84587026,  163.71869543,  137.17820196,   55.68353628,
         37.71947337,  240.81743603, -113.31247162, -123.17105295,
         40.75905063,   74.43692874,   89.99573925, -221.17710474,
        -21.86031926])

In [72]:
pred = model_l_df[model_l_df['ranking'] ==1 ]
predictors = pred['predictor'].to_list()

In [74]:
coefficients = [ 163.87723762, 2771.99898642,   47.90541277,  -35.64025576,
       -222.52757004,  112.73463413,  -72.54213201,   68.32674703,
         36.39169412,  180.45342837,   32.4182528 ,  194.74908294,
         40.84587026,  163.71869543,  137.17820196,   55.68353628,
         37.71947337,  240.81743603, -113.31247162, -123.17105295,
         40.75905063,   74.43692874,   89.99573925, -221.17710474,
        -21.86031926]


In [75]:
results_lr = pd.DataFrame( data = {'Predictor': predictors, 'Coefficient': coefficients})
results_lr 

Unnamed: 0,Predictor,Coefficient
0,Income,163.877238
1,Monthly Premium Auto,2771.998986
2,Months Since Last Claim,47.905413
3,Months Since Policy Inception,-35.640256
4,Number of Open Complaints,-222.52757
5,Number of Policies,112.734634
6,Total Claim Amount,-72.542132
7,State_California,68.326747
8,State_Washington,36.391694
9,Response_No,180.453428


### RMSE of training set - LR

In [76]:
from sklearn.metrics import mean_squared_error
import numpy as np 

clv_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, clv_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

6332.904116103798

## Train model : Decision Tree

In [77]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

### RMSE train set - DT

In [78]:
clv_predictions = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train, clv_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.6444511023036567e-13

It is extremely small...

In [79]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)


In [80]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
display_scores(tree_rmse_scores)

Scores: [6255.66620665 5273.64785876 4991.29790114 6076.39543908 5040.0257892
 5225.93225395 4818.22301515 5446.73912568 5045.9219495  4691.88169131]
Mean: 5286.573123042082
Standard deviation: 486.9654993775458


When checking the RMSE of the Decision tree, it is suprisingly low, which might be a sign of overfitting. However when using cross-validation, the Decision Tree doesn’t look as good as it did earlier. It still performs better  than the Linear Regression as we can see computing those same validation scores for the Linear Regression model :

In [81]:
lin_scores = cross_val_score(lin_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [6804.05052087 6818.99212198 5546.48518427 6952.59555685 6147.78437022
 6413.91508293 6035.58166956 5777.65827812 6481.0877159  6497.78853299]
Mean: 6347.593903368573
Standard deviation: 441.04996642360805


## Train Model: RandomForestRegressor.

Let's try Random Forests work by training many Decision Trees on random subsets of the features, then averaging out their predictions. Building a model on top of many other models is called Ensemble Learning, and it is often a great way to push ML algorithms even further. 

In [82]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=0)
forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [83]:
clv_predictions_RF = forest_reg.predict(X_train)
RF_mse = mean_squared_error(y_train, clv_predictions_RF)
RF_rmse = np.sqrt(RF_mse)
RF_rmse

1423.7248658192234

In [85]:
rf_scores = cross_val_score(forest_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=20)
rf_rmse_scores = np.sqrt(-rf_scores)
display_scores(rf_rmse_scores)

Scores: [4884.00405629 3671.36400663 4191.94909397 3978.91187535 3216.88390628
 3972.21689204 5143.7802309  4154.92319428 3899.85511961 3035.67160025
 3904.96165204 3834.79587287 3772.28323522 3061.9923791  3689.93515307
 3355.77902891 3771.04511277 3543.4349367  4319.71249793 3439.18313934]
Mean: 3842.1341491772423
Standard deviation: 523.0372460830348


In [98]:
# Get numerical feature importances
importance = list(forest_reg.feature_importances_)
results_rf = pd.DataFrame( data = {'Predictor': predictors, 'Importance': importance})
results_rf.sort_values('Importance',ascending=False)


Unnamed: 0,Predictor,Importance
5,Number of Policies,0.463029
1,Monthly Premium Auto,0.265097
6,Total Claim Amount,0.046671
2,Months Since Last Claim,0.04485
3,Months Since Policy Inception,0.040941
0,Income,0.034866
4,Number of Open Complaints,0.007818
11,Education_High School or Below,0.006741
12,Gender_F,0.006628
24,Vehicle Size_Medsize,0.006397


From the Cross-validation results, we can see that the Random Forest performs better than the Decision Tree and the Linear Regression. 

## Train Model: SVM

In [99]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(X_train, y_train)
clv_predictions_SVM = svm_reg.predict(X_train)
svm_mse = mean_squared_error(y_train, clv_predictions_SVM)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

6945.542907731957

In [100]:
svm_scores = cross_val_score(svm_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-svm_scores)
display_scores(svm_rmse_scores)

Scores: [7572.87475463 7532.8884163  5894.09986778 7724.9692806  6769.97569023
 7047.22756087 6398.03040145 6104.26475644 7027.41648065 7266.85532239]
Mean: 6933.860253134463
Standard deviation: 600.7345939802528


The SVM performs quite poorly. We can therefore infer taht our best model seems to be the Random Forest; we now will be fine-tuning the model with the Hyperparameters.

# Fine-Tune The Model

Now, we have a shortlist of promising models. We now need to fine-tune them. 

In [101]:
##If it is not linear reg or svm (thus, a model with hyperparameters): 
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=0)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=20,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_



RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=30, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [102]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [105]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

6224.508621365258 {'max_features': 2, 'n_estimators': 3}
5266.451932987007 {'max_features': 2, 'n_estimators': 10}
4974.79498721649 {'max_features': 2, 'n_estimators': 30}
4930.980329906114 {'max_features': 4, 'n_estimators': 3}
4440.282547624334 {'max_features': 4, 'n_estimators': 10}
4307.961891141133 {'max_features': 4, 'n_estimators': 30}
4598.7676211211065 {'max_features': 6, 'n_estimators': 3}
4171.299431509602 {'max_features': 6, 'n_estimators': 10}
3978.9806841396753 {'max_features': 6, 'n_estimators': 30}
4709.571654044469 {'max_features': 8, 'n_estimators': 3}
4055.8547750516286 {'max_features': 8, 'n_estimators': 10}
3883.154038332216 {'max_features': 8, 'n_estimators': 30}
5860.489088342802 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
5031.929628971179 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
5312.521141125707 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
4662.375303403047 {'bootstrap': False, 'max_features': 3, 'n_estimators': 

In this example, we obtain the best solution by setting the max_features hyperparameter to 8 and the n_estimators hyperparameter to 30. The RMSE score for this combination is 3868.



## Evaluate our system on the Test Set 

In [106]:
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse) 
final_rmse

3564.706113941551

In [107]:
importance_f = list(final_model.feature_importances_)
results_rf_final = pd.DataFrame( data = {'Predictor': predictors, 'Importance': importance_f})
results_rf_final.sort_values('Importance',ascending=False)


Unnamed: 0,Predictor,Importance
5,Number of Policies,0.438721
1,Monthly Premium Auto,0.19227
6,Total Claim Amount,0.08615
2,Months Since Last Claim,0.047607
3,Months Since Policy Inception,0.046132
0,Income,0.043631
23,Vehicle Class_Four-Door Car,0.016075
4,Number of Open Complaints,0.010712
14,Marital Status_Married,0.009108
16,Policy_Personal L3,0.008646
