In [360]:
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime

In [361]:
df = pd.read_excel("marketing_data.xlsx")

In [362]:
df.dtypes

ID                       int64
Year_Birth               int64
Education               object
Marital_Status          object
Income                 float64
Kidhome                  int64
Teenhome                 int64
Dt_Customer             object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
Response                 int64
Complain                 int64
dtype: object

In [363]:
df.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Response                0
Complain                0
dtype: int64

In [364]:
df.sample(20)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain
791,1987,1987,Basic,Single,21063.0,1,0,9/25/13,34,1,...,19,3,15,2,2,0,3,6,0,0
1860,10675,1956,PhD,Married,66334.0,0,1,2013-03-04 00:00:00,82,909,...,0,0,23,2,9,3,5,5,1,0
947,1340,1986,Graduation,Married,92910.0,0,0,4/24/14,42,551,...,179,103,33,1,6,7,13,1,0,0
562,9909,1996,2n Cycle,Married,7500.0,0,0,2012-09-11 00:00:00,24,3,...,15,22,50,3,3,1,3,9,1,0
1846,4910,1967,Graduation,Divorced,68743.0,0,0,8/30/12,81,1132,...,175,134,115,1,11,5,13,7,0,0
149,9323,1949,Master,Together,49912.0,0,1,2012-07-09 00:00:00,5,520,...,32,49,42,4,10,5,7,8,1,0
564,793,1973,Graduation,Married,20895.0,1,0,2012-06-10 00:00:00,24,18,...,3,0,4,1,2,0,3,9,0,0
207,7373,1952,PhD,Divorced,46610.0,0,2,10/29/12,8,96,...,33,22,43,6,4,1,6,6,1,0
1827,7732,1978,Graduation,Married,64813.0,1,0,2014-02-01 00:00:00,81,293,...,179,97,52,1,7,2,10,5,0,0
781,6864,1989,Master,Divorced,10979.0,0,0,5/22/14,34,8,...,2,2,4,2,3,0,3,5,0,0


In [365]:
# mean fill the null values 
mean_income = df['Income'].mean()
df['Income'] = df['Income'].fillna(mean_income)

In [366]:
#combine amount spent
df['MntSpent'] = df['MntWines'] + df['MntFruits'] + df['MntMeatProducts'] 
+ df['MntFishProducts'] + df['MntSweetProducts'] + df['MntGoldProds']

0       518
1        44
2        47
3         0
4        45
       ... 
2235    173
2236     27
2237     34
2238    377
2239    332
Length: 2240, dtype: int64

In [367]:
#combine number of purchases without deals
df['NumPurchases'] = df['NumWebPurchases'] + df['NumCatalogPurchases'] + df['NumStorePurchases']

In [368]:
#convert birthyear into Age
current_year = datetime.now().year
df['Age'] = current_year - df['Year_Birth']

In [369]:
#convert Dt_Customer into numerical value
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%m/%d/%y')
current_date = pd.Timestamp.now()
df['DaysEnrolled'] = (current_date - df['Dt_Customer']).dt.days

In [372]:
df.sample(20)

Unnamed: 0,Education,Marital_Status,Income,Recency,NumDealsPurchases,Response,Complain,MntSpent,NumPurchases,Age,DaysEnrolled
164,Graduation,Married,56181.0,6,1,0,0,293,13,62,3855
2224,Graduation,Married,44794.0,99,1,0,0,61,5,58,3485
64,Master,Divorced,53367.0,2,7,1,0,376,14,50,3825
921,PhD,Married,18169.0,40,1,0,0,14,3,39,4018
1549,Graduation,Together,35196.0,68,6,1,0,228,12,40,4116
10,2n Cycle,Married,81044.0,0,1,0,0,1011,21,77,3707
962,Graduation,Single,64849.0,42,1,1,0,1050,18,67,4116
1224,Graduation,Married,35704.0,54,3,0,0,51,6,72,3643
1508,PhD,Married,64355.0,66,2,0,0,1187,19,39,4197
1554,Master,Single,42394.0,69,1,0,0,27,4,52,3621


In [371]:
#Take important features and target
df = df.drop(['ID', 'Dt_Customer', 'Year_Birth', 'Kidhome', 'Teenhome', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth'], axis=1)
df.dtypes

Education             object
Marital_Status        object
Income               float64
Recency                int64
NumDealsPurchases      int64
Response               int64
Complain               int64
MntSpent               int64
NumPurchases           int64
Age                    int64
DaysEnrolled           int64
dtype: object