In [None]:
import pandas as pd                                           # panel data, handling dataframes
# Show every column when a DataFrame is rendered (None disables truncation).
pd.options.display.max_columns = None

In [None]:
# Read the raw customer CSV (path is relative to the notebook) and preview
# the first rows.
data = pd.read_csv('../data/marketing-customer-analysis.csv')
data.head()

In [None]:
data.shape       # (n_rows, n_columns) of the freshly loaded frame

In [None]:
data.columns     # column headers exactly as read from the CSV, before normalization

In [None]:
# Normalize headers to lower snake_case: lowercase everything and turn each
# literal space into an underscore (regex=False keeps the match literal).
data.columns = data.columns.str.lower().str.replace(' ', '_', regex=False)
data.columns

In [None]:
data.info(memory_usage='deep')   # dtypes and non-null counts; 'deep' also measures the contents of object columns

In [None]:
data.isna().sum()     # number of missing values per column

In [None]:
# Remove the columns that carry no usable information (or are mostly NaN).
data = data.drop(['unnamed:_0', 'vehicle_type', 'customer'], axis='columns')

In [None]:
# Discard every row that still has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Print each column name with its number of distinct values.
# dropna=False keeps the count equal to len(unique()), which counts NaN too.
for col in data.columns:
    print(col, data[col].nunique(dropna=False))

In [None]:
# Dimensions after the column drop and the NaN-row drop above.
data.shape

In [None]:
# Report the dtype before and after parsing the date column into datetimes.
print('Original dtype: {}\n'.format(data['effective_to_date'].dtype))
data['effective_to_date'] = data['effective_to_date'].pipe(pd.to_datetime)
print('Meantime dtype: {}'.format(data['effective_to_date'].dtype))

In [None]:
# Show the earliest and latest policy date present in the data.
print('--')
print('Min date: {}'.format(data['effective_to_date'].agg('min')))
print('Max date: {}'.format(data['effective_to_date'].agg('max')))
print('--')

In [None]:
# Replace each timestamp by its proleptic Gregorian ordinal (an integer day
# number) so the date can be used as a plain numeric feature.
data['effective_to_date'] = data['effective_to_date'].map(pd.Timestamp.toordinal)

print('New dtype: {}'.format(data['effective_to_date'].dtype))

In [None]:
# Categorical columns: those stored with the generic `object` dtype.
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == object]

In [None]:
# For every categorical column print its name, dtype, number of distinct
# values, the distinct values themselves and each value's share in percent.
print('Categorical Features:', len(cat_cols))
print('----------')
for col in cat_cols:
    column = data[col]
    counts = column.value_counts()
    print('Name: {}'.format(column.name))
    print('Type: {}'.format(column.dtype))
    print('Unique values: {}'.format(len(column.unique())))
    print(column.unique())
    print((counts / counts.sum()) * 100)
    print('\n----------')

In [None]:
data.describe()     # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
# Numerical columns: every non-object column except the policy date.
# The headers were normalized to lower snake_case earlier in the notebook, so
# the date must be excluded under its normalized name; the previous comparison
# against 'Effective To Date' could never match and silently kept the date in
# the list (it is scaled on its own with MinMaxScaler further down).
num_cols = [c for c in data.columns
            if data[c].dtype != 'object' and c != 'effective_to_date']


In [None]:
import matplotlib.pyplot as plt                 # visualization library
%matplotlib inline

# One bar chart per categorical column showing how often each category occurs.
for c in cat_cols:
    # Labels and heights must come from the same value_counts() result:
    # unique() lists categories in order of appearance while value_counts()
    # sorts by frequency, so pairing the two attaches counts to wrong labels.
    counts = data[c].value_counts()
    plt.figure(figsize=(10, 5))
    plt.bar(counts.index, counts.values)
    plt.title(c)
    plt.show();

In [None]:
import seaborn as sns                           # visualization library, extends plt
sns.set(style="white")                          # apply seaborn's "white" style to the plots created from here on

In [None]:
import numpy as np    # numerical python, algebra library

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame still contains string-valued categorical columns at this point and
# recent pandas releases raise instead of silently skipping them, so the
# numeric columns are selected explicitly before calling corr().
corr = data.select_dtypes(include='number').corr()


In [None]:
# Boolean mask that is True on and above the diagonal, used to hide the
# redundant upper triangle of the symmetric correlation matrix.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.triu(np.ones_like(corr, dtype=bool))

In [None]:
# NOTE(review): the heatmap itself is drawn in a later cell and is not passed
# this `ax`; with the inline backend a figure created here is presumably shown
# (empty) when this cell finishes - confirm the heatmap really gets this size.
f, ax=plt.subplots(figsize=(11, 9))                 # create an 11x9 inch figure with a single Axes

In [None]:
cmap=sns.diverging_palette(220, 10, as_cmap=True)   # diverging palette between hues 220 and 10, returned as a matplotlib colormap

In [None]:
# Draw the masked correlation heatmap.
# The figure is created in this same cell and handed to seaborn through `ax=`:
# with `%matplotlib inline` a figure opened in a previous cell is already
# closed when this cell runs, so the intended 11x9 size would be lost.
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,                    # hide the upper triangle
    cmap=cmap,
    vmax=.3,                      # upper end of the color scale
    center=0,                     # center the diverging colormap on zero
    square=True,                  # square cells
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
);

In [None]:
# Scatter-plot matrix over all numerical columns.
sns.pairplot(data.loc[:, num_cols]);

In [None]:
# Histogram of every numerical column, one figure each.
for c in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(data[c])
    ax.set_title(c)
    plt.show();

In [None]:
# Box plot of every numerical column, one figure each.
for c in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.boxplot(data[c])
    ax.set_title(c)
    plt.show();

In [None]:
# Number of customers per response value.
# The column is passed as `x=` because current seaborn releases accept only
# `data` positionally; the positional form fails there.
sns.countplot(x='response', data=data)
plt.ylabel('Total number of Response')
plt.show();

In [None]:
# Response counts split by sales channel.
# The column is passed as `x=` because current seaborn releases accept only
# `data` positionally; the positional form fails there.
plt.figure(figsize=(8, 4))
sns.countplot(x='response', hue='sales_channel', data=data)
plt.ylabel('Response by Sales Channel')
plt.show();

In [None]:
# Distribution of the total claim amount for each response value.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='response', y='total_claim_amount')
plt.ylabel('Response by Total Claim Amount')
plt.show();

In [None]:
# Distribution of income for each response value.
plt.figure(figsize=(12, 6))
sns.boxplot(y='income', x='response', data=data)
plt.ylabel('Response by Income')   # label typo ('Inncome') corrected
plt.show();

In [None]:
# Outlier fences for customer_lifetime_value at 3 * IQR beyond the quartiles.
q1, q3 = np.percentile(data['customer_lifetime_value'], [25, 75])   # 25th / 75th percentile

iqr = q3 - q1   # interquartile range

lower, upper = q1 - 3 * iqr, q3 + 3 * iqr   # lower / upper boundary

In [None]:
# Number of rows whose lifetime value falls below the lower fence.
int((data['customer_lifetime_value'] < lower).sum())

In [None]:
# Number of rows whose lifetime value exceeds the upper fence.
int((data['customer_lifetime_value'] > upper).sum())

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the ordinal date to the [0, 1] range. The scaler expects a 2-D
# input, hence the single-column frame; the result is flattened back to 1-D.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed further down.
data['effective_to_date'] = (
    MinMaxScaler()
    .fit_transform(data[['effective_to_date']].to_numpy())
    .ravel()
)

data['effective_to_date'].head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Display the list of numerical columns that the scaling loop below iterates over.
num_cols

In [None]:
# Standardize every numerical feature except the target.
# The target is excluded by name rather than by position (`num_cols[:-1]`), so
# the loop stays correct if the column order or the contents of `num_cols`
# change.
# NOTE(review): each scaler is fit on all rows, before the train/test split
# performed further down.
for c in num_cols:
    if c == 'total_claim_amount':   # target column: leave unscaled
        continue
    data[c] = StandardScaler().fit_transform(data[c].values.reshape(-1, 1))

In [None]:
# Preview the frame after scaling the numerical columns.
data.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each variable so the indicator columns are not fully redundant.
one_hot_data = pd.get_dummies(data.loc[:, cat_cols], drop_first=True)

one_hot_data.head()

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts:
# the originals are removed and the indicator columns appended on the right.
data = pd.concat([data.drop(columns=cat_cols), one_hot_data], axis=1)
data.head()

In [None]:
# Separate the target (total_claim_amount) from the learning features.
y = data['total_claim_amount']
X = data.drop('total_claim_amount', axis=1)

In [None]:
# Feature-matrix and target shapes, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# train_test_split
from sklearn.model_selection import train_test_split as tts

In [None]:
# train-test-split (4 sets)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression as LinReg
# Ordinary least squares: fit on the training set, predict the test set.
linreg = LinReg().fit(X_train, y_train)
y_pred_linreg = linreg.predict(X_test)

In [None]:
from sklearn.linear_model import Lasso       # L1
from sklearn.linear_model import Ridge       # L2
from sklearn.linear_model import ElasticNet  # L1+L2

In [None]:
# Lasso L1

# L1-regularized linear regression with default settings.
lasso = Lasso().fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

In [None]:
# Ridge L2

# L2-regularized linear regression with default settings.
ridge = Ridge().fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

In [None]:
# ElasticNet L1+L2

# Combined L1 + L2 regularized linear regression with default settings.
elastic = ElasticNet().fit(X_train, y_train)
y_pred_elastic = elastic.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR

# Random forest regressor.
# The forest draws random bootstrap samples, so it is seeded (with the same
# value as the train/test split) to make scores repeatable between runs.
rfr = RFR(random_state=42)
rfr.fit(X_train, y_train)

y_pred_rfr = rfr.predict(X_test)

In [None]:
from xgboost import XGBRegressor as XGBR

# Gradient-boosted trees (XGBoost) with default hyper-parameters.
xgbr = XGBR().fit(X_train, y_train)
y_pred_xgbr = xgbr.predict(X_test)

In [None]:
from lightgbm import LGBMRegressor as LGBMR

# Gradient-boosted trees (LightGBM) with default hyper-parameters.
lgbmr = LGBMR().fit(X_train, y_train)
y_pred_lgbmr = lgbmr.predict(X_test)

In [None]:
# Keep each model's name, fitted estimator and test predictions together in
# one row, then unpack into the three parallel lists used by the evaluation
# cells.
_fitted = [
    ('linreg', linreg, y_pred_linreg),
    ('lasso', lasso, y_pred_lasso),
    ('ridge', ridge, y_pred_ridge),
    ('elastic', elastic, y_pred_elastic),
    ('rfr', rfr, y_pred_rfr),
    ('xgbr', xgbr, y_pred_xgbr),
    ('lgbmr', lgbmr, y_pred_lgbmr),
]
model_names, models, preds = (list(column) for column in zip(*_fitted))

In [None]:
# R2 (the estimators' default score) on the training and test sets.
for name, model in zip(model_names, models):
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print ('Model: {}, train R2: {} -- test R2: {}'.format(name, train_score, test_score))

In [None]:
from sklearn.metrics import mean_squared_error as mse

# Mean squared error on the training and test sets. The metric is symmetric
# in its two arguments, so they are given in the conventional
# (y_true, y_pred) order without changing the values.
for name, model, y_hat in zip(model_names, models, preds):
    train_mse = mse(y_train, model.predict(X_train))
    test_mse = mse(y_test, y_hat)

    print ('Model: {}, train MSE: {} -- test MSE: {}'.format(name, train_mse, test_mse))

In [None]:
# Root mean squared error (square root of the MSE) on the training and test
# sets; the arguments are given in (y_true, y_pred) order, which does not
# affect this symmetric metric.
for name, model, y_hat in zip(model_names, models, preds):
    train_rmse = mse(y_train, model.predict(X_train)) ** 0.5
    test_rmse = mse(y_test, y_hat) ** 0.5

    print ('Model: {}, train RMSE: {} -- test RMSE: {}'.format(name, train_rmse, test_rmse))

In [None]:
from sklearn.metrics import mean_absolute_error as mae
# Mean absolute error on the training and test sets; the arguments are given
# in (y_true, y_pred) order, which does not affect this symmetric metric.
for name, model, y_hat in zip(model_names, models, preds):
    train_mae = mae(y_train, model.predict(X_train))
    test_mae = mae(y_test, y_hat)

    print ('Model: {}, train MAE: {} -- test MAE: {}'.format(name, train_mae, test_mae))

In [None]:
# 07 - Reporting

- Present results.

**Data Level**

- Drop NaN values because they are, in fact, duplicates.
- Do not drop outliers because there are only a few of them.

**Problem Level**

- Total claim amount has a great variance.
- We can predict the total claim amount with an error of about 25%, even though R2 is high.
- We need to determine which variables are significant.