# Lab | Customer Analysis Final Round


For this lab, we still keep using the marketing_customer_analysis.csv file that you can find in the files_for_lab folder.

It's time to put it all together. Remember the previous rounds and follow the steps as shown in previous lectures.

## 01 - Problem (case study)

- Data Description.

- Goal.

In [None]:
# Data from a customer analysis of a car insurance company.
# Based on the given data, the objective is to build a prediction model to find out whether the
# total claim amount can be predicted from the other available information about the customer, and how reliable this prediction is.

## 02 - Getting Data

Read the .csv file.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import norm
from scipy.special import inv_boxcox
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import os
import math
import warnings
warnings.filterwarnings('ignore')

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Load the raw customer data; the path is relative to the current working directory.
data = pd.read_csv('files_for_lab/csv_files/marketing_customer_analysis.csv')
data

## 03 - Cleaning/Wrangling/EDA

- Change headers names.
- Deal with NaN values.
- Categorical Features.
- Numerical Features.
- Exploration.

In [None]:
# Only the last expression of a notebook cell is echoed, so the shape has to be
# printed explicitly to be visible next to the info() output.
print(data.shape)
data.info()

In [None]:
def clean_dfheaders(df):
    """Normalise the column headers of ``df`` in place and return ``df``.

    ``Customer`` and ``EmploymentStatus`` are first mapped to ``id`` and
    ``employment_status``; every header is then lower-cased, stripped of
    surrounding whitespace, and has its spaces replaced by underscores.
    """
    special_names = {'Customer': 'id', 'EmploymentStatus': 'employment_status'}
    df.rename(columns=special_names, inplace=True)

    headers = df.columns.str.lower()
    headers = headers.str.strip()
    df.columns = headers.str.replace(' ', '_')
    return df

# NOTE: inside the function, `df = df.rename(...)` only rebinds the local name `df` to a new
# frame; the caller's DataFrame stays unchanged unless the returned frame is assigned back.
# `inplace=True` and the assignment to `df.columns` both modify the caller's object directly,
# which is why the call below works without re-assigning its result.

clean_dfheaders(data)

# Check for repeated customer ids: a result containing only `False` means there are no duplicates.
data.duplicated(subset=['id']).unique()
# or data['id'].nunique()

In [None]:
# drop duplicates with drop_duplicates

In [None]:
# Use the customer id as the row index (modifies `data` in place).
data.set_index('id', inplace=True)

# Parse the date column; with errors='coerce', values that cannot be parsed become NaT.
data['effective_to_date'] = pd.to_datetime(data['effective_to_date'], errors='coerce')

In [None]:
# Column names grouped by dtype. The string 'object' is used instead of the
# `np.object` alias, which was removed from NumPy in 1.24.
num_col = data.select_dtypes(include=[np.number]).columns.tolist()
cat_col = data.select_dtypes(include=['object']).columns.tolist()

# Print the distinct values of every categorical column to spot inconsistent labels.
for col in cat_col:
    print(data[col].unique())

In [None]:
"""
# cleaning columns examples

data['customer_lifetime_value'] = data['customer_lifetime_value'].apply(lambda x: float(str(x).replace('%', '')))

def clean_gender(x):
    if str(x).lower().startswith('m'):
        return 'M'
    elif str(x).lower().startswith('f'):
        return 'F'
    else:
        return 'O'

data['gender'] = data['gender'].apply(clean_gender)

data['state'] = data['state'].apply(lambda x: 'California' if str(x).lower().startswith('cal')
                                                 else 'Nevada' if x == 'NV'
                                                 else 'Arizona' if x == 'AZ'
                                                 else x)

"""

In [None]:
""" 
# replacing NaN values 

def clean_df(df):
    replace_dict = {
        'id': '',
        'state': 'California',
        'customer_lifetime_value': '',
        'response': 'No',
        'coverage': '',
        'education': '',
        'effective_to_date': '',
        'employment_status': '',
        'gender': '',
        'income': '',
        'location_code': '',
        'marital_status': '',
        'monthly_premium_auto': '',
        'months_since_last_claim': df['months_since_last_claim'].median(),
        'months_since_policy_inception': '',
        'number_of_open_complaints': df['number_of_open_complaints'].median(),
        'number_of_policies': '',
        'policy_type': '',
        'policy': '',
        'renew_offer_type': '',
        'sales_channel': '',
        'total_claim_amount': '',
        'vehicle_class': 'Four-Door Car',
        'vehicle_size': 'Medsize',
        'vehicle_type': 'A',  
    }
    
    for column in df.columns:
        df[column] = df[column].fillna(replace_dict[column])
        
# replacing the value with a mode() expression, e.g. 'state': data['state'].mode(), doesn't work because mode() returns a Series (there can be several modes); use data['state'].mode()[0] instead.
# alternative: replace NaN in categorical columns with the mode, or with randomly generated values drawn so that the ratio of unique values stays the same

"""

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
data.describe().T

In [None]:
# Correlation matrix of the numeric columns only: `DataFrame.corr()` raises on
# string columns in pandas >= 2.0. It is computed once and reused.
corr = data.select_dtypes(include=[np.number]).corr()

# Boolean mask hiding the upper triangle (diagonal included); the matrix is
# symmetric, so the lower triangle carries all the information.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=mask, annot=True, ax=ax)
plt.show()

# Remove multicollinearity: drop features with a correlation > 0.9.
# For correlations > 0.75, check again with the model metrics.
# When removing columns, keep the ones that have the highest correlation with the target.

In [None]:
# Baseline OLS fit on the untouched numeric data, kept as a reference point for
# the models built after the data has been transformed.
# The summary's p-values (< 0.05) can be used for feature selection.

data_num = data.select_dtypes(include=[np.number])

ols1_y = data['total_claim_amount']
# statsmodels does not add an intercept on its own, hence the constant column.
ols1_x = sm.add_constant(data_num.drop(columns=['total_claim_amount']))

ols1_model = sm.OLS(ols1_y, ols1_x).fit()
print(ols1_model.summary())

In [None]:
# Baseline linear regression on the untransformed numeric features. It is
# evaluated on the same rows it was fitted on (no train/test split yet).
lm1_x = data_num.drop(['total_claim_amount'], axis=1)
lm1_y = data['total_claim_amount']

lm1 = LinearRegression()
lm1_model = lm1.fit(lm1_x, lm1_y)

lm1_predictions = lm1.predict(lm1_x)
lm1_r2 = round(r2_score(lm1_y, lm1_predictions), 2)
# RMSE is derived from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and later removed.
lm1_mse = mean_squared_error(lm1_y, lm1_predictions)
lm1_rmse = np.sqrt(lm1_mse)
lm1_mae = mean_absolute_error(lm1_y, lm1_predictions)

print("R2:", lm1_r2)
print("RMSE:", lm1_rmse)
print("MSE:", lm1_mse)
print("MAE:", lm1_mae)

In [None]:
# Pairwise relationships between the columns of `data` in a single grid of plots.
sns.pairplot(data)

In [None]:
# Distribution plot of every numeric column, one figure per column.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases - consider
# `sns.histplot(..., kde=True)` once the seaborn version in use is confirmed.
for col in num_col:
    sns.distplot(data[col])
    plt.show()

In [None]:
# Box plot of every numeric column (one 8x4 figure each) to eyeball outliers.
for col in num_col:
    plt.figure(figsize=(8,4))
    sns.boxplot(x=data[col])
    plt.show()

## 04 - Processing Data

- Dealing with outliers.
- Normalization.
- Encoding Categorical Data.
- Splitting into train set and test set.

In [None]:
# Check every numeric column for values <= 0 before transforming: Box-Cox
# requires strictly positive input, so these values have to be replaced first.
for col in num_col:
    neg_val = int((data[col] < 0).sum())
    zero_val = int((data[col] == 0).sum())

    # Negatives and zeros are reported independently so that a column containing
    # both does not hide its zeros behind the "negative" message.
    if neg_val > 0:
        print('Negative values in', col, ':', neg_val)
    if zero_val > 0:
        print('Zeros in', col, ':', zero_val)
    if neg_val == 0 and zero_val == 0:
        print('Column', col, 'is ok.')

print(num_col)

In [None]:
# Work on a copy so that the transformations below leave `data` untouched.
data_t = data.copy()

In [None]:
def positive_values(df, in_columns=None, skip_columns=None):
    """Replace every value <= 0 in the selected columns with the column median.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    in_columns : iterable of str, optional
        Columns to process. Defaults to all numeric columns of ``df``.
    skip_columns : iterable of str, optional
        Columns from ``in_columns`` that must be left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The median is taken over all values of the column, including the
    non-positive ones, so a column whose median is itself <= 0 will still
    contain non-positive values afterwards.
    """
    # Defaults are resolved at call time. A default of `df.select_dtypes(...)`
    # in the signature is evaluated once, at definition time, against whatever
    # global `df` exists then (or fails with NameError if there is none).
    if in_columns is None:
        in_columns = df.select_dtypes(include=np.number).columns
    skipped = set() if skip_columns is None else set(skip_columns)

    for col in in_columns:
        if col in skipped:
            continue
        df[col] = np.where(df[col] <= 0, df[col].median(), df[col])
    return df

In [None]:
def boxcox_transform(df, in_columns=None, skip_columns=None, plot=True):
    """Box-Cox transform the selected columns of ``df`` in place.

    Values <= 0 are first replaced by the mean of the column's remaining
    values, because Box-Cox is only defined for strictly positive data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    in_columns : iterable of str, optional
        Columns to transform. Defaults to all numeric columns of ``df``.
    skip_columns : iterable of str, optional
        Columns from ``in_columns`` that must be left untouched.
    plot : bool, default True
        Draw a distribution plot of each transformed column.

    Returns
    -------
    tuple
        ``(df, _ci)`` where ``_ci`` maps every column of ``in_columns`` to a
        one-element list holding the second value returned by
        ``stats.boxcox`` (the fitted lambda, needed for ``inv_boxcox``), or to
        ``None`` for skipped columns.
    """
    # Defaults are resolved at call time. A default of `df.select_dtypes(...)`
    # in the signature is evaluated once, at definition time, against whatever
    # global `df` exists then (or fails with NameError if there is none).
    if in_columns is None:
        in_columns = df.select_dtypes(include=np.number).columns
    in_columns = list(in_columns)
    skipped = set() if skip_columns is None else set(skip_columns)

    _ci = {column: None for column in in_columns}
    for column in in_columns:
        if column in skipped:
            continue
        # Blank out non-positive values, then impute them with the mean of the
        # remaining (positive) values. `np.nan` semantics via `mask`; the
        # `np.NAN` alias no longer exists in NumPy 2.0.
        positive = df[column].mask(df[column] <= 0)
        filled = positive.fillna(positive.mean())
        transformed_data, ci = stats.boxcox(filled.to_numpy())
        df[column] = transformed_data
        _ci[column] = [ci]
        if plot:
            plt.figure(figsize=(6, 4))
            sns.distplot(df[column])
            plt.show()
    return df, _ci

In [None]:
def replace_outliers(df, threshold=1.5, in_columns=None, skip_columns=None, median_repl=None):
    """Replace IQR outliers in the selected columns of ``df`` in place.

    A value is an outlier when it lies more than ``threshold * IQR`` above the
    75th or below the 25th percentile. Outliers are capped at that limit,
    except in the columns listed in ``median_repl``, where they are replaced
    by the column median.

    Outliers can be handled after a log / Box-Cox transformation, since the
    transformation may already take care of some of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    threshold : float, default 1.5
        Multiplier applied to the IQR to obtain the limits.
    in_columns : iterable of str, optional
        Columns to process. Defaults to all numeric columns of ``df``.
    skip_columns : iterable of str, optional
        Columns from ``in_columns`` that must be left untouched.
    median_repl : iterable of str, optional
        Columns whose outliers are replaced by the median instead of capped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Defaults are resolved at call time. A default of `df.select_dtypes(...)`
    # in the signature is evaluated once, at definition time, against whatever
    # global `df` exists then (or fails with NameError if there is none).
    if in_columns is None:
        in_columns = df.select_dtypes(include=np.number).columns
    skipped = set() if skip_columns is None else set(skip_columns)
    use_median = set() if median_repl is None else set(median_repl)

    for column in in_columns:
        if column in skipped:
            continue
        lower, upper = np.percentile(df[column], [25, 75])
        iqr = upper - lower
        upper_limit = upper + (threshold * iqr)
        lower_limit = lower - (threshold * iqr)

        too_high = df[column] > upper_limit
        too_low = df[column] < lower_limit

        if column in use_median:
            # The median is taken once, before anything is replaced, so upper
            # and lower outliers receive the same value.
            median = df[column].median()
            df.loc[too_high | too_low, column] = median
        else:
            df.loc[too_high, column] = upper_limit
            df.loc[too_low, column] = lower_limit
    return df

In [None]:
"""
# or remove outliers
# make sure you are droping the outliers only after the feature selection,
# just so you don't end up losing rows because of outliers in a column you won't use

def remove_outliers(df, threshold=1.5, in_columns=df.select_dtypes(np.number).columns, skip_columns=[]):
    for column in in_columns:
        if column not in skip_columns:
            upper = np.percentile(df[column],75)
            lower = np.percentile(df[column],25)
            iqr = upper - lower
            upper_limit = upper + (threshold * iqr)
            lower_limit = lower - (threshold * iqr)
            df = df[(df[column]>lower_limit) & (df[column]<upper_limit)]
    return df
"""

In [None]:
# Replace values <= 0 with the column median in the three listed columns.
data_t = positive_values(data_t, in_columns=['income', 'months_since_last_claim', 'customer_lifetime_value'])

In [None]:
# Box-Cox transform the selected columns, leaving 'monthly_premium_auto' as is.
# `_ci` keeps the fitted parameter of each transformed column; it is needed later
# to map values back with inv_boxcox.
data_t, _ci = boxcox_transform(data_t, skip_columns=['monthly_premium_auto'])
data_t

In [None]:
# Handle IQR outliers in three columns: 'number_of_policies' is capped at the limits,
# the two columns listed in `median_repl` get the column median instead.
data_t = replace_outliers(data_t, threshold=1.5, in_columns=['customer_lifetime_value', 'monthly_premium_auto', 'number_of_policies'], 
                          median_repl=['customer_lifetime_value', 'monthly_premium_auto'])

In [None]:
# x-y split
# The string 'object' replaces the `np.object` alias, which was removed in NumPy 1.24.
t_num = data_t.select_dtypes(include=[np.number]).columns.tolist()
t_object = data_t.select_dtypes(include=['object']).columns.tolist()

# Drop the target by name rather than by its position in `t_num`, so the split
# stays correct if the set or order of numeric columns changes.
t_drop = t_object + ['total_claim_amount', 'effective_to_date']
x_t = data_t.drop(t_drop, axis=1)
y = data_t['total_claim_amount']

In [None]:
# Normalize and Standardize
# NOTE: sklearn's Normalizer rescales each row (sample) to unit norm; it does not
# scale the columns individually.
to_normal = Normalizer().fit(x_t)
x_normalized = to_normal.transform(x_t)

# Label the result with x_t's own column names, so the labels always match the
# column order that went into the transformer instead of a hand-maintained list.
sn_col = list(x_t.columns)
data_sn = pd.DataFrame(x_normalized, columns=sn_col)

# to_standard = StandardScaler().fit(data_sn)
# x_standardized = to_standard.transform(data_sn)
# data_sn = pd.DataFrame(x_standardized)

data_sn

In [None]:
# Categorical (object-dtype) columns; the string 'object' replaces the
# `np.object` alias, which was removed in NumPy 1.24.
x_cat = data_t.select_dtypes(include=['object'])

# Frequency of every category, to decide how each column should be encoded.
for col in t_object:
    print(x_cat[col].value_counts())

In [None]:
# Encode the categorical columns: one-hot for most of them, integer labels for
# 'coverage', 'education' and 'vehicle_size'.
# NOTE(review): LabelEncoder numbers the categories in sorted order, which is not
# necessarily their natural rank - confirm this is acceptable for these columns.

x_1h = x_cat.drop(columns=['coverage', 'education', 'vehicle_size'])
data_1h = pd.get_dummies(x_1h, drop_first=True)

x_label1 = x_cat['coverage']
x_label2 = x_cat['education']
x_label3 = x_cat['vehicle_size']

le1 = LabelEncoder().fit_transform(x_label1)
le2 = LabelEncoder().fit_transform(x_label2)
le3 = LabelEncoder().fit_transform(x_label3)

data_le1 = pd.DataFrame(le1, columns=['coverage'])
data_le2 = pd.DataFrame(le2, columns=['education'])
data_le3 = pd.DataFrame(le3, columns=['vehicle_size'])
target = data_t[['total_claim_amount', 'vehicle_class']]

In [None]:
# Assemble the full modelling table: normalised numerics, one-hot columns,
# label-encoded columns and the target, joined side by side by row position.
# Only the numeric target column is concatenated: including the string column
# 'vehicle_class' would turn the whole array (and every column of data_x) into
# object dtype.
target_num = target[['total_claim_amount']]
parts = [data_sn, data_1h, data_le1, data_le2, data_le3, target_num]
x = np.concatenate(parts, axis=1)

columns_sn = list(data_sn.columns.values)
columns_1h = list(data_1h.columns.values)
columns_le1 = list(data_le1.columns.values)
columns_le2 = list(data_le2.columns.values)
columns_le3 = list(data_le3.columns.values)
target_col = list(target_num.columns.values)
columns_x = columns_sn + columns_1h + columns_le1 + columns_le2 + columns_le3 + target_col

data_x = pd.DataFrame(x, columns=columns_x)
data_x

In [None]:
# Train/test splits for the models without and with the encoded categorical columns.
# Use data_sn instead of data_t for normalized/standardized data.

# Model 1 uses the numeric features only. Dropping just the target from data_t
# would leave the string and datetime columns in place, which LinearRegression
# cannot fit.
lm2_x = data_t.select_dtypes(include=[np.number]).drop(columns=['total_claim_amount'])
lm2_x_full = data_x.drop(['total_claim_amount'], axis=1)
lm2_y = data_t['total_claim_amount']

# NOTE(review): the two splits use different random states, so the two models
# are evaluated on different rows - confirm this is intended.
x1_train, x1_test, y1_train, y1_test = train_test_split(lm2_x, lm2_y, test_size=0.2, random_state=7)
x2_train, x2_test, y2_train, y2_test = train_test_split(lm2_x_full, lm2_y, test_size=0.2, random_state=77)

## 05 - Modeling

Apply model.

In [None]:
# Model 1: fitted and evaluated on the first split (x1_*).
lm2 = LinearRegression()
lm2.fit(x1_train, y1_train)
lm2_predictions_1 = lm2.predict(x1_test)

# Model 2: the second split (x2_*) has a different feature set, so it needs its
# own estimator; the model fitted on x1_train cannot predict from x2_test.
lm2_full = LinearRegression()
lm2_full.fit(x2_train, y2_train)
lm2_predictions_2 = lm2_full.predict(x2_test)

## 06 - Model Validation

- R2.
- MSE.
- RMSE.
- MAE.

In [None]:
# R2 of both models on their respective test sets, rounded to two decimals.
lm2_r2_1 = round(r2_score(y1_test, lm2_predictions_1),2)
lm2_r2_2 = round(r2_score(y2_test, lm2_predictions_2),2)

print("Model 1 R2:", lm2_r2_1)
print("Model 2 R2:", lm2_r2_2)

In [None]:
# Map the model predictions back through the inverse Box-Cox transform, using the
# parameter stored for 'total_claim_amount'. The inputs are the lm2_predictions_*
# arrays; `predictions_1` / `predictions_2` are first defined by this cell.
predictions_1 = inv_boxcox(lm2_predictions_1, _ci['total_claim_amount'])
predictions_2 = inv_boxcox(lm2_predictions_2, _ci['total_claim_amount'])

In [None]:
# Validation metrics of both models on their test sets.
# RMSE is derived from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and later removed.
lm2_r2_1 = round(r2_score(y1_test, lm2_predictions_1), 2)
lm2_mse_1 = mean_squared_error(y1_test, lm2_predictions_1)
lm2_rmse_1 = np.sqrt(lm2_mse_1)
lm2_mae_1 = mean_absolute_error(y1_test, lm2_predictions_1)

lm2_r2_2 = round(r2_score(y2_test, lm2_predictions_2), 2)
lm2_mse_2 = mean_squared_error(y2_test, lm2_predictions_2)
lm2_rmse_2 = np.sqrt(lm2_mse_2)
lm2_mae_2 = mean_absolute_error(y2_test, lm2_predictions_2)

print("Model 1 R2:", lm2_r2_1)
print("Model 1 RMSE:", lm2_rmse_1)
print("Model 1 MSE:", lm2_mse_1)
print("Model 1 MAE:", lm2_mae_1)

print("Model 2 R2:", lm2_r2_2)
print("Model 2 RMSE:", lm2_rmse_2)
print("Model 2 MSE:", lm2_mse_2)
print("Model 2 MAE:", lm2_mae_2)

## 07 - Reporting

Present results.

In [None]:
# Result tables: back-transformed true value, back-transformed prediction and
# the absolute difference between the two ('residual').
# The difference is computed column-wise; a row-wise apply would call a Python
# lambda once per row for the same result.
results_1 = pd.DataFrame()
results_1['true'] = inv_boxcox(y1_test, _ci['total_claim_amount'])
results_1['predicted'] = predictions_1
results_1['residual'] = (results_1['true'] - results_1['predicted']).abs()

results_2 = pd.DataFrame()
results_2['true'] = inv_boxcox(y2_test, _ci['total_claim_amount'])
results_2['predicted'] = predictions_2
results_2['residual'] = (results_2['true'] - results_2['predicted']).abs()

results_1
#results_2