In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
customer_df = pd.read_csv('customer_value_analysis.csv')
customer_df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [3]:
# 1.Define X and Y
X = customer_df.drop(['Total Claim Amount','Customer'],axis=1)
y = customer_df['Total Claim Amount']

In [4]:
#2. Import sklearn train_test_split

from sklearn.model_selection import train_test_split

In [5]:
#2. Set test_size=0.30 and random_state=31
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=31)

In [6]:
#3. Separate X_train and X_test into numerical and categorical (X_train_cat , X_train_num , X_test_cat , X_test_num)

X_train_num = X_train.select_dtypes(include=['number'])
X_test_num = X_test.select_dtypes(include=['number'])

X_train_num.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
8444,8105.81483,17920,72,14,53,0,2
2891,4462.997006,0,73,35,52,3,5
5464,5627.297887,61486,70,2,65,0,3
7432,7472.672033,44385,97,6,31,0,7
3919,4595.524548,95550,116,15,75,1,1


In [7]:
X_train_cat = X_train.select_dtypes(include=['object'])
X_test_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,State,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
8444,California,No,Basic,High School or Below,2/27/11,Retired,M,Suburban,Married,Personal Auto,Personal L1,Offer1,Branch,Four-Door Car,Medsize
2891,California,Yes,Basic,Bachelor,1/30/11,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer1,Web,Two-Door Car,Medsize
5464,Oregon,No,Basic,Doctor,1/21/11,Employed,F,Urban,Married,Personal Auto,Personal L3,Offer2,Call Center,Two-Door Car,Medsize
7432,Oregon,No,Extended,High School or Below,1/14/11,Employed,M,Suburban,Single,Corporate Auto,Corporate L3,Offer4,Call Center,Four-Door Car,Small
3919,California,No,Basic,College,1/25/11,Employed,M,Urban,Divorced,Personal Auto,Personal L3,Offer1,Agent,SUV,Medsize


In [8]:
# 4.Encode the categorical variables X_train_cat and X_test_cat using the OneHotEncoder setup in the previous lab

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_train_cat)
encoder.fit(X_test_cat)

In [9]:
# Remember to use .toarray() after .transform() to endup with a numpy array
X_train_cat_encoded = encoder.transform(X_train_cat).toarray()
X_test_cat_encoded  = encoder.transform(X_test_cat).toarray()
X_train_cat_encoded
X_test_cat_encoded

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [20]:
# Next, cast the resulting numpy arrays into pandas DataFrames. 
# Make sure that the column names of the new dataframes are correctly setup using encoder.get_feature_names_out() and the same indexes of X_train_cat and X_test_cat

X_train_cat_encoded = pd.DataFrame(X_train_cat_encoded, columns=encoder.get_feature_names_out(), index=X_train_cat.index)
X_test_cat_encoded = pd.DataFrame(X_test_cat_encoded, columns=encoder.get_feature_names_out(), index=X_test_cat.index)
display(X_train_cat_encoded.head())
display(X_test_cat_encoded.head())

Unnamed: 0,State_California,State_Nevada,State_Oregon,State_Washington,Response_Yes,Coverage_Extended,Coverage_Premium,Education_College,Education_Doctor,Education_High School or Below,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8444,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2891,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5464,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7432,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


Unnamed: 0,State_California,State_Nevada,State_Oregon,State_Washington,Response_Yes,Coverage_Extended,Coverage_Premium,Education_College,Education_Doctor,Education_High School or Below,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8444,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2891,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5464,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7432,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [23]:
# 5.Use X_train_num to fit a power transformer. Transform BOTH X_train_num and X_test_num

from sklearn.preprocessing import PowerTransformer
import numpy as np

In [24]:
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)

In [25]:
X_train_num_transf = power_transformer.fit_transform(X_train_num)
X_test_num_transf = power_transformer.fit_transform(X_test_num)
X_train_num_transf

array([[ 0.4990214 ,  0.02146411, -0.63325091, ...,  0.26941621,
        -0.513111  , -0.06494139],
       [-0.46796839, -1.64940692, -0.56692079, ...,  0.23634911,
         2.00999041,  1.13240694],
       [-0.06759709,  0.75486649, -0.77290196, ...,  0.65336005,
        -0.513111  ,  0.51455259],
       ...,
       [-0.40350718,  0.94200472,  0.99577228, ...,  1.30810491,
        -0.513111  , -1.14946072],
       [-1.89615214, -1.64940692, -1.44394006, ...,  1.19299763,
        -0.513111  , -1.14946072],
       [-2.13674919, -1.64940692, -1.0835907 , ..., -0.35832648,
         2.0133831 , -1.14946072]])

In [26]:
# Next, cast the resulting numpy arrays as pandas dataframes
# Make sure to set the correct columns names and to use the same indexes of X_train_num and X_test_num
# Name the final resulting dataframes as: X_train_num_transformed_df and X_test_num_transformed_df

X_train_num_transformed_df = pd.DataFrame(X_train_num_transf, columns=X_train_num.columns, index=X_train_num.index)
X_test_num_transformed_df = pd.DataFrame(X_test_num_transf, columns=X_test_num.columns, index=X_test_num.index)

display(X_train_num_transformed_df.head())
display(X_test_num_transformed_df.head())

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
8444,0.499021,0.021464,-0.633251,0.048267,0.269416,-0.513111,-0.064941
2891,-0.467968,-1.649407,-0.566921,1.653602,0.236349,2.00999,1.132407
5464,-0.067597,0.754866,-0.772902,-1.476582,0.65336,-0.513111,0.514553
7432,0.379093,0.53586,0.542226,-0.840605,-0.509404,-0.513111,1.462547
3919,-0.415574,1.084631,1.038549,0.142059,0.957827,1.898546,-1.149461


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
8760,-0.186121,1.056143,-1.180355,-0.950802,-0.345948,-0.500351,0.535333
4901,-0.617099,0.281695,0.723148,1.136199,-0.309542,-0.500351,-1.137462
5118,0.82034,0.806508,1.278013,-0.950802,0.562103,1.955432,1.32851
1930,1.577912,-1.621779,-0.936807,0.601614,0.655476,1.955432,-0.043202
2474,-1.374351,-1.621779,-0.179682,0.168456,1.164247,-0.500351,-1.137462


In [36]:
# 6. Concat 
#  X_train_num_transformed_df and X_train_cat_encoded_df into X_train_new 
#  X_test_num_transformed_df and X_test_cat_encoded_df into X_test_new

X_train_new = pd.concat([X_train_num_transformed_df, X_train_cat_encoded],axis=1)
X_test_new = pd.concat([X_test_num_transformed_df, X_test_cat_encoded],axis=1)

display(X_train_new.head())
display(X_test_new.head()) # what is that ? I don't understand !!
# Because of this "Nan" problem, I can not run the cells starting from number 8.

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,State_California,State_Nevada,State_Oregon,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8444,0.499021,0.021464,-0.633251,0.048267,0.269416,-0.513111,-0.064941,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2891,-0.467968,-1.649407,-0.566921,1.653602,0.236349,2.00999,1.132407,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5464,-0.067597,0.754866,-0.772902,-1.476582,0.65336,-0.513111,0.514553,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7432,0.379093,0.53586,0.542226,-0.840605,-0.509404,-0.513111,1.462547,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3919,-0.415574,1.084631,1.038549,0.142059,0.957827,1.898546,-1.149461,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,State_California,State_Nevada,State_Oregon,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8760,-0.186121,1.056143,-1.180355,-0.950802,-0.345948,-0.500351,0.535333,,,,...,,,,,,,,,,
4901,-0.617099,0.281695,0.723148,1.136199,-0.309542,-0.500351,-1.137462,,,,...,,,,,,,,,,
5118,0.82034,0.806508,1.278013,-0.950802,0.562103,1.955432,1.32851,,,,...,,,,,,,,,,
1930,1.577912,-1.621779,-0.936807,0.601614,0.655476,1.955432,-0.043202,,,,...,,,,,,,,,,
2474,-1.374351,-1.621779,-0.179682,0.168456,1.164247,-0.500351,-1.137462,,,,...,,,,,,,,,,


In [None]:
# 7. Fit a MinMax scaler using X_train_new and transform X_train_new and X_test_new. 
#    Create new pandas dataframes from the resulting numpy arrays. 
#    Remember to set the correct columns names and indexes. 
#    Name the resulting dataframes as: X_train_new_scaled_df and X_test_new_scaled_df

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Create and fit the scaler, then transform both X_train_new and X_test_new
scaler = MinMaxScaler().fit(X_train_new)
X_train_new_scaled = scaler.transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

In [30]:
# Create new pandas dataframes from the resulting numpy arrays
# Name the resulting dataframes as: X_train_new_scaled_df and X_test_new_scaled_df

X_train_new_scaled_df = pd.DataFrame(X_train_new, columns= X_train_new.columns, index= X_train_new.index)
X_test_new_scaled_df = pd.DataFrame(X_test_new, columns= X_test_new.columns, index= X_test_new.index)

display(X_train_new_scaled_df.head())
display(X_test_new_scaled_df.head())

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,State_California,State_Nevada,State_Oregon,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8444,0.499021,0.021464,-0.633251,0.048267,0.269416,-0.513111,-0.064941,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2891,-0.467968,-1.649407,-0.566921,1.653602,0.236349,2.00999,1.132407,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5464,-0.067597,0.754866,-0.772902,-1.476582,0.65336,-0.513111,0.514553,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7432,0.379093,0.53586,0.542226,-0.840605,-0.509404,-0.513111,1.462547,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3919,-0.415574,1.084631,1.038549,0.142059,0.957827,1.898546,-1.149461,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,State_California,State_Nevada,State_Oregon,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
8760,-0.186121,1.056143,-1.180355,-0.950802,-0.345948,-0.500351,0.535333,,,,...,,,,,,,,,,
4901,-0.617099,0.281695,0.723148,1.136199,-0.309542,-0.500351,-1.137462,,,,...,,,,,,,,,,
5118,0.82034,0.806508,1.278013,-0.950802,0.562103,1.955432,1.32851,,,,...,,,,,,,,,,
1930,1.577912,-1.621779,-0.936807,0.601614,0.655476,1.955432,-0.043202,,,,...,,,,,,,,,,
2474,-1.374351,-1.621779,-0.179682,0.168456,1.164247,-0.500351,-1.137462,,,,...,,,,,,,,,,


In [31]:
# 8. Train a simple linear regression model using X_train_new_scaled_df
# and get the predictions for the train and test sets

from sklearn.linear_model import LinearRegression

In [32]:
#The model
linear_reg = LinearRegression()

In [33]:
#train the model
linear_reg.fit(X_train_new_scaled_df, y_train) 

In [37]:
# predict on the train and test set
y_pred = linear_reg.predict(X_train_new_scaled_df)
y_pred = linear_reg.predict(X_test_new_scaled_df)

In [None]:
# 9. Create a function that given a model prediction and real values returns a pandas dataframe

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
def calculate_error_metrics(y_true, y_pred):
    
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    
    # Create a DataFrame
    error_metrics_df = pd.DataFrame({'Error Metric': ['MAE', 'MSE', 'RMSE', 'MAPE', 'R2'],
        'Value': [mae, mse, rmse, mape, r2]
    })
    
    return error_metrics_df

# Example usage:
# Assuming y_true and y_pred are your actual and predicted values
y_true = np.array([3, -0.5, 2, 7])
y_pred = np.array([2.5, 0.0, 2, 8])

result_df = calculate_error_metrics(y_true, y_pred)
print(result_df)

In [None]:
#10. Evaluate the linear model predictions using the previous function on the TRAIN and TEST sets

In [None]:
y_pred = linear_reg.predict(X_train_new_scaled_df)
y_pred = linear_reg.predict(X_test_new_scaled_df)

evaluate_predictions_df = calculate_error_metrics(y_pred, y_pred)
print(evaluate_predictions_df)

11. Now define a function that takes as an input: list of models, X_train and y_train to train several model (with default values) so we can train a lot of them without repeating code. The function must return the list of trained models.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

In [None]:
def train_models(models, X_train, y_train):
    trained_models = []

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Scale the features using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    for model in models:
        current_model = model

        # Train the model on the training set
        current_model.fit(X_train_scaled, y_train)

        # Append the trained model to the list
        trained_models.append(current_model)

    return trained_models

# Example usage:
# Assuming X_train and y_train are your training data
models_to_train = [LinearRegression(), RandomForestRegressor(), SVR()]
trained_models = train_models(models_to_train, X_train, y_train)


12. Use the function to train the following models (with default settings):
*LinearRegressor, *KNeighborsRegressor, *MLPRegressor

In [None]:
trained_models = train_models(LinearRegression(), KNeighborsRegressor(), MLPRegressor(), X_train, y_train)

13. Evaluate the models with the function created earlier in the TRAIN and TEST sets. Which model performs best with the default options?