# Customer Analysis Round 7

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
import statsmodels as sm

In [2]:
# Load the raw marketing customer dataset from the lab's CSV folder

df = pd.read_csv(
    './files_for_lab/csv_files/marketing_customer_analysis.csv'
)

In [3]:
# Preview the first two rows of the raw frame
df.head(n=2)

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize


In [4]:
# Formatting columns

def columns_normalizer(df):
    """Return a copy of ``df`` whose column labels are lower-cased snake_case.

    Every label is converted to ``str``, lower-cased, and has its spaces
    replaced by underscores. The input frame is not modified, so the calling
    cell can be re-run safely and the caller's original labels stay intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalized.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same data under the normalized labels.
    """
    # `rename` with a callable builds a new frame instead of assigning to
    # `df.columns`, which would silently relabel the caller's object too.
    return df.rename(columns=lambda column: str(column).lower().replace(' ', '_'))

In [5]:
# Normalize the column labels, then preview the result

df = columns_normalizer(df)

df.head(n=2)

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize


In [6]:
# Keep only the numeric columns in a dedicated frame

numerical_df = df.select_dtypes(include=np.number)

numerical_df.shape

(9134, 8)

In [7]:
# Getting categorical columns into a new df
# NOTE: the `np.object` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, so the builtin `object` is used to select object-dtype columns.

categorical_df = df.select_dtypes(include=object)

categorical_df.shape

(9134, 16)

In [8]:
# Separate the regression target from the numeric predictors

y = numerical_df['total_claim_amount']
X = numerical_df.drop(labels=['total_claim_amount'], axis='columns')

In [9]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Baseline linear regression trained on the numeric features only

linear_regression = LinearRegression()

linear_regression.fit(X=X_train, y=y_train)

LinearRegression()

In [11]:
# Predictions of the baseline model on both partitions

y_pred_train = linear_regression.predict(X=X_train)

y_pred_test = linear_regression.predict(X=X_test)

In [12]:
# R^2 of the baseline model on the train and test partitions
initial_train_score = linear_regression.score(X=X_train, y=y_train)
initial_test_score = linear_regression.score(X=X_test, y=y_test)

print(f'Initial train score: {initial_train_score}')
print(f'Initial test score: {initial_test_score}')

Initial train score: 0.5217065876763483
Initial test score: 0.508388170168562


In [15]:
# Mean squared error of the baseline model.
# scikit-learn metrics are declared as (y_true, y_pred); MSE is symmetric so
# the value is the same either way, but the documented order is used so the
# pattern stays correct if copied to an asymmetric metric such as r2_score.
initial_train_mse = mse(y_train, y_pred_train)
initial_test_mse = mse(y_test, y_pred_test)

print(f'Initial train mean squared error: {initial_train_mse}')
print(f'Initial test mean squared error: {initial_test_mse}')

Initial train mean squared error: 40907.808600130185
Initial test mean squared error: 39766.47607320747


In [16]:
# Mean absolute error of the baseline model.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; MAE is
# symmetric, so the reported numbers do not change.
initial_train_mae = mae(y_train, y_pred_train)
initial_test_mae = mae(y_test, y_pred_test)

print(f'Initial train mean absolute error: {initial_train_mae}')
print(f'Initial test mean absolute error: {initial_test_mae}')

Initial train mean absolute error: 144.62280853347767
Initial test mean absolute error: 144.22489077505315


In [43]:
# Improving linear model
# Drop the `customer` column before encoding (presumably a per-row identifier;
# verify against the data). `errors='ignore'` keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
categorical_df = categorical_df.drop(columns=['customer'], errors='ignore')

# One-hot encode every remaining categorical column; drop_first=True removes
# the first level of each variable from the encoded output.
categorical_encoded_df = pd.get_dummies(categorical_df, drop_first=True)

In [44]:
# Show the (rows, columns) dimensions of the one-hot encoded frame
categorical_encoded_df.shape

(9134, 101)

In [45]:
# Preview a single row of the one-hot encoded frame
categorical_encoded_df.head(n=1)

Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,response_Yes,coverage_Extended,coverage_Premium,education_College,education_Doctor,education_High School or Below,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [46]:
# Place the numeric predictors and the encoded categoricals side by side
X_full = pd.concat(
    [X, categorical_encoded_df],
    axis='columns',
)

In [47]:
# Show the (rows, columns) dimensions of the combined feature frame
X_full.shape

(9134, 108)

In [48]:
# Split the combined feature frame with the same test size and seed as the initial split
X_full_train, X_full_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Yeo-Johnson power transform with standardization of the transformed output
power_transofrmer = PowerTransformer(
    method='yeo-johnson',
    standardize=True,
)

In [54]:
# Fit the transformer on the training partition only, then transform that partition
X_train_power_transofrmer = power_transofrmer.fit(X_full_train).transform(X_full_train)

# Wrap the transformed array in a frame that carries the original column labels
df_X_train_power_transofrmer = pd.DataFrame(
    data=X_train_power_transofrmer,
    columns=X_full_train.columns,
)

df_X_train_power_transofrmer.head(n=3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,state_California,state_Nevada,state_Oregon,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,-0.154555,0.807677,1.360867,1.65733,1.471319,-0.500344,-1.152021,1.370898,-0.333333,-0.629484,...,-0.625642,-0.492003,-0.409811,-0.142857,-0.143918,-0.497261,4.202957,-0.509563,0.64511,-0.485126
1,1.102186,-1.655812,1.806474,0.058035,0.799894,-0.500344,0.873446,-0.729449,-0.333333,1.588604,...,-0.625642,-0.492003,-0.409811,-0.142857,6.948423,-0.497261,-0.237928,-0.509563,0.64511,-0.485126
2,0.434349,0.401585,0.584358,1.188175,0.16264,1.953933,1.126458,1.370898,-0.333333,-0.629484,...,1.598358,-0.492003,-0.409811,-0.142857,-0.143918,-0.497261,-0.237928,1.962467,0.64511,-0.485126


In [55]:
# Transform the test partition with the transformer already fitted on the training data
X_test_power_transofrmer = power_transofrmer.transform(X_full_test)

# Wrap the transformed array in a frame that carries the column labels
df_X_test_power_transofrmer = pd.DataFrame(
    data=X_test_power_transofrmer,
    columns=X_full_train.columns,
)

df_X_test_power_transofrmer.head(n=3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,state_California,state_Nevada,state_Oregon,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,-0.580388,0.892951,0.782412,-1.675324,0.12905,-0.500344,-1.152021,1.370898,-0.333333,-0.629484,...,-0.625642,-0.492003,-0.409811,-0.142857,-0.143918,2.011015,-0.237928,-0.509563,0.64511,-0.485126
1,-0.116343,0.148364,-0.709595,-0.698111,-1.431989,-0.500344,0.873446,-0.729449,-0.333333,1.588604,...,-0.625642,-0.492003,-0.409811,-0.142857,-0.143918,-0.497261,-0.237928,1.962467,0.64511,-0.485126
2,-0.771454,1.079498,0.420686,-1.278645,-0.254527,-0.500344,-1.152021,-0.729449,-0.333333,1.588604,...,-0.625642,-0.492003,-0.409811,-0.142857,-0.143918,-0.497261,-0.237928,1.962467,-1.550123,2.06132


In [56]:
# Linear regressión imporved model

linear_regression_power_transformer = LinearRegression()

linear_regression.fit(df_X_train_power_transofrmer, y_train)

LinearRegression()

In [57]:
y_pred_train_power_transformer = linear_regression.predict(df_X_train_power_transofrmer)

y_pred_test_power_transformer = linear_regression.predict(df_X_test_power_transofrmer)

In [58]:
improved_train_score = linear_regression.score(df_X_train_power_transofrmer, y_train) 
improved_test_score = linear_regression.score(df_X_test_power_transofrmer, y_test)

print(f'Improved train score: {improved_train_score}')
print(f'Improved test score: {improved_test_score}')

Improved train score: 0.7662744283170818
Improved test score: 0.7407299461385161


In [59]:
# Mean squared error of the improved model.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; MSE is
# symmetric, so the reported numbers do not change.
improved_train_mse = mse(y_train, y_pred_train_power_transformer)
improved_test_mse = mse(y_test, y_pred_test_power_transformer)

print(f'Improved train mean squared error: {improved_train_mse}')
print(f'Improved test mean squared error: {improved_test_mse}')

Improved train mean squared error: 19990.241774208145
Improved test mean squared error: 20972.352103319106


In [60]:
# Mean absolute error of the improved model.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; MAE is
# symmetric, so the reported numbers do not change.
improved_train_mae = mae(y_train, y_pred_train_power_transformer)
improved_test_mae = mae(y_test, y_pred_test_power_transformer)

print(f'Improved train mean absolute error: {improved_train_mae}')
print(f'Improved test mean absolute error: {improved_test_mae}')

Improved train mean absolute error: 97.08688351478783
Improved test mean absolute error: 98.13592782803379
