In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
df = pd.read_csv('Data/Marketing_data.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [42]:
#Checking for Missing values
df.isnull().count()

Customer                         9134
State                            9134
Customer Lifetime Value          9134
Response                         9134
Coverage                         9134
Education                        9134
Effective To Date                9134
EmploymentStatus                 9134
Gender                           9134
Income                           9134
Location Code                    9134
Marital Status                   9134
Monthly Premium Auto             9134
Months Since Last Claim          9134
Months Since Policy Inception    9134
Number of Open Complaints        9134
Number of Policies               9134
Policy Type                      9134
Policy                           9134
Renew Offer Type                 9134
Sales Channel                    9134
Total Claim Amount               9134
Vehicle Class                    9134
Vehicle Size                     9134
dtype: int64

So there is no missing values in the dataset

In [43]:
#Segregating the numerical and categorical features:

cols = set(df.columns)
numerical_cols = {'Customer Lifetime Value', 'Income', 'Monthly Premium Auto','Months Since Last Claim', 'Months Since Policy Inception','Number of Open Complaints', 'Number of Policies','Total Claim Amount'}
categorical_cols = cols.difference(numerical_cols)

df_numerical = df.loc[:, numerical_cols]
df_categorical = df.loc[:, categorical_cols]

In [44]:
#Printing value_counts for numerical features

for i, col in enumerate(list(numerical_cols)):
    print('{}. {}'.format(i+1,col))
    print('Number of unique values : {}'.format(df_numerical[col].value_counts().count()))
    #print(df_numerical[col].value_counts())
    print('\n')

1. Monthly Premium Auto
Number of unique values : 202


2. Total Claim Amount
Number of unique values : 5106


3. Number of Policies
Number of unique values : 9


4. Income
Number of unique values : 5694


5. Number of Open Complaints
Number of unique values : 6


6. Months Since Last Claim
Number of unique values : 36


7. Months Since Policy Inception
Number of unique values : 100


8. Customer Lifetime Value
Number of unique values : 8041




In [45]:
#Splitting numerical cols to Continuous and Discrete cols for EDA:
Continuous_cols = {'Total Claim Amount','Income','Customer Lifetime Value'}
Discrete_cols = numerical_cols.difference(Continuous_cols)

df_continuous = df_numerical.loc[:, Continuous_cols]
df_discrete = df_numerical.loc[:,Discrete_cols]

In [46]:
#Printing value_counts for categorical features

for i, col in enumerate(list(categorical_cols)):
    print('{}. {}'.format(i+1,col))
    print('Number of unique values: {}'.format(df_categorical[col].value_counts().count()))
    #print(df_categorical[col].value_counts())
    print('\n')

1. Customer
Number of unique values: 9134


2. Education
Number of unique values: 5


3. Response
Number of unique values: 2


4. Coverage
Number of unique values: 3


5. State
Number of unique values: 5


6. Vehicle Class
Number of unique values: 6


7. EmploymentStatus
Number of unique values: 5


8. Policy
Number of unique values: 9


9. Renew Offer Type
Number of unique values: 4


10. Sales Channel
Number of unique values: 4


11. Location Code
Number of unique values: 3


12. Effective To Date
Number of unique values: 59


13. Gender
Number of unique values: 2


14. Vehicle Size
Number of unique values: 3


15. Policy Type
Number of unique values: 3


16. Marital Status
Number of unique values: 3




1. Looking at the Value counts for the categorical features, we can see that 'Customer' feature is just an unique id which can be dropped.

2. The 'Effective To Date' feature is in a date format and is engineered into different features such as date, month and year. 

In [47]:
#We will be format the data based on the above points. 
df_categorical = df_categorical.drop(['Customer'], axis=1)

df_categorical['Expiry_date'] = pd.to_datetime(df_categorical['Effective To Date'])
df_categorical['Expiry_year'] = df_categorical['Expiry_date'].dt.year
df_categorical['Expiry_month'] = df_categorical['Expiry_date'].dt.month
df_categorical['Expiry_week_in_year'] = df_categorical['Expiry_date'].dt.week
df_categorical['Expiry_day_of_week'] = df_categorical['Expiry_date'].dt.dayofweek
df_categorical['Expiry_day_of_month'] = df_categorical['Expiry_date'].dt.day


  import sys


We have processed and split the main dataframe into 3 different data frames for the purpose of Exploratory Data Analysis and are as follows:

1. df_continuous
2. df_discrete
3. df_categorical

In [48]:
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [49]:
df_continuous.head()

Unnamed: 0,Total Claim Amount,Income,Customer Lifetime Value
0,384.811147,56274,2763.519279
1,1131.464935,0,6979.535903
2,566.472247,48767,12887.43165
3,529.881344,0,7645.861827
4,138.130879,43836,2813.692575


In [50]:
df_discrete.head()

Unnamed: 0,Monthly Premium Auto,Number of Policies,Number of Open Complaints,Months Since Last Claim,Months Since Policy Inception
0,69,1,0,32,5
1,94,8,0,13,42
2,108,2,0,18,38
3,106,7,0,18,65
4,73,1,0,12,44


In [51]:
df_categorical.head()

Unnamed: 0,Education,Response,Coverage,State,Vehicle Class,EmploymentStatus,Policy,Renew Offer Type,Sales Channel,Location Code,Effective To Date,Gender,Vehicle Size,Policy Type,Marital Status,Expiry_date,Expiry_year,Expiry_month,Expiry_week_in_year,Expiry_day_of_week,Expiry_day_of_month
0,Bachelor,No,Basic,Washington,Two-Door Car,Employed,Corporate L3,Offer1,Agent,Suburban,2/24/11,F,Medsize,Corporate Auto,Married,2011-02-24,2011,2,8,3,24
1,Bachelor,No,Extended,Arizona,Four-Door Car,Unemployed,Personal L3,Offer3,Agent,Suburban,1/31/11,F,Medsize,Personal Auto,Single,2011-01-31,2011,1,5,0,31
2,Bachelor,No,Premium,Nevada,Two-Door Car,Employed,Personal L3,Offer1,Agent,Suburban,2/19/11,F,Medsize,Personal Auto,Married,2011-02-19,2011,2,7,5,19
3,Bachelor,No,Basic,California,SUV,Unemployed,Corporate L2,Offer1,Call Center,Suburban,1/20/11,M,Medsize,Corporate Auto,Married,2011-01-20,2011,1,3,3,20
4,Bachelor,No,Basic,Washington,Four-Door Car,Employed,Personal L1,Offer1,Agent,Rural,2/3/11,M,Medsize,Personal Auto,Single,2011-02-03,2011,2,5,3,3


In [52]:
#Saving dataframes to csv:
df_continuous.to_csv('Data/df_continuous.csv')
df_discrete.to_csv('Data/df_discrete.csv')
df_categorical.to_csv('Data/df_categorical.csv')
