In [73]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.experimental import enable_iterative_imputer   # IterativeImputer is experimental and the API might change without any deprecation cycle. To use it, you need to explicitly import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
df = pd.read_csv("marketing_campaign.csv",delimiter=';')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [4]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

In [5]:
drop_list = ['MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases']

In [6]:
df['Monetary'] = df['MntWines'] + df['MntFruits']+df['MntMeatProducts']+df['MntFishProducts']+df['MntSweetProducts']+df['MntGoldProds']

In [7]:
df['Frequency'] = df['NumDealsPurchases']+df['NumWebPurchases']+df['NumCatalogPurchases']+df['NumStorePurchases']

In [8]:
#  calculateRFM score

In [9]:
rfm_df = df[['ID','Recency','Frequency','Monetary']]

In [10]:
rfm_df['R_rank'] = rfm_df['Recency'].rank(ascending=False)
rfm_df['F_rank'] = rfm_df['Frequency'].rank(ascending=True)
rfm_df['M_rank'] = rfm_df['Monetary'].rank(ascending=True)
 
# normalizing the rank of the customers
rfm_df['R_rank_norm'] = (rfm_df['R_rank']/rfm_df['R_rank'].max())*100
rfm_df['F_rank_norm'] = (rfm_df['F_rank']/rfm_df['F_rank'].max())*100
rfm_df['M_rank_norm'] = (rfm_df['F_rank']/rfm_df['M_rank'].max())*100
 
rfm_df.drop(columns=['R_rank', 'F_rank', 'M_rank'], inplace=True)
 
rfm_df.head()

Unnamed: 0,ID,Recency,Frequency,Monetary,R_rank_norm,F_rank_norm,M_rank_norm
0,5524,58,25,1617,40.691669,89.263393,89.283322
1,2174,38,6,27,61.351898,15.446429,15.449877
2,4141,26,21,776,73.568381,75.044643,75.061398
3,6182,26,8,53,73.568381,26.941964,26.947979
4,5324,94,19,422,5.232428,66.473214,66.488055


In [11]:
rfm_df['RFM_Score'] = 0.15*rfm_df['R_rank_norm']+0.28 * \
    rfm_df['F_rank_norm']+0.57*rfm_df['M_rank_norm']
rfm_df['RFM_Score'] *= 0.05
rfm_df = rfm_df.round(2)
rfm_df[['ID', 'RFM_Score']]

Unnamed: 0,ID,RFM_Score
0,5524,4.10
1,2174,1.12
2,4141,3.74
3,6182,1.70
4,5324,2.86
...,...,...
2235,10870,3.06
2236,4001,3.69
2237,7270,2.89
2238,8235,4.23


In [12]:
rfm_df["Customer_segment"] = np.where(rfm_df['RFM_Score'] >
                                      4.5, "Top Customers",
                                      (np.where(
                                        rfm_df['RFM_Score'] > 4,
                                        "High value Customer",
                                        (np.where(
    rfm_df['RFM_Score'] > 3,
                             "Medium Value Customer",
                             np.where(rfm_df['RFM_Score'] > 1.6,
                            'Low Value Customers', 'Lost Customers'))))))
rfm_df[['ID', 'RFM_Score', 'Customer_segment']].head(20)

Unnamed: 0,ID,RFM_Score,Customer_segment
0,5524,4.1,High value Customer
1,2174,1.12,Lost Customers
2,4141,3.74,Medium Value Customer
3,6182,1.7,Low Value Customers
4,5324,2.86,Low Value Customers
5,7446,4.0,Medium Value Customer
6,965,3.68,Medium Value Customer
7,6177,2.01,Low Value Customers
8,4855,1.26,Lost Customers
9,5899,0.25,Lost Customers


In [13]:
# include the RFM score to the data
# drop the other columns

In [14]:
df = df.drop(drop_list,axis=1)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,NumWebVisitsMonth,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Monetary,Frequency
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,7,...,0,0,0,0,0,3,11,1,1617,25
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,5,...,0,0,0,0,0,3,11,0,27,6
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,4,...,0,0,0,0,0,3,11,0,776,21
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,6,...,0,0,0,0,0,3,11,0,53,8
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,5,...,0,0,0,0,0,3,11,0,422,19


In [15]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

In [16]:
df['age'] = 2014-df['Year_Birth']

In [17]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Monetary', 'Frequency', 'age'],
      dtype='object')

In [18]:
drop_list = ['Year_Birth','Dt_Customer','Z_CostContact','Z_Revenue','Recency','Frequency','Monetary']

In [19]:
df = df.drop(drop_list,axis=1)

In [20]:
rfm_df

Unnamed: 0,ID,Recency,Frequency,Monetary,R_rank_norm,F_rank_norm,M_rank_norm,RFM_Score,Customer_segment
0,5524,58,25,1617,40.69,89.26,89.28,4.10,High value Customer
1,2174,38,6,27,61.35,15.45,15.45,1.12,Lost Customers
2,4141,26,21,776,73.57,75.04,75.06,3.74,Medium Value Customer
3,6182,26,8,53,73.57,26.94,26.95,1.70,Low Value Customers
4,5324,94,19,422,5.23,66.47,66.49,2.86,Low Value Customers
...,...,...,...,...,...,...,...,...,...
2235,10870,46,18,1341,54.39,62.46,62.47,3.06,Medium Value Customer
2236,4001,56,22,444,42.94,79.26,79.28,3.69,Medium Value Customer
2237,7270,91,19,1241,8.51,66.47,66.49,2.89,Low Value Customers
2238,8235,8,23,843,91.80,83.30,83.32,4.23,High value Customer


In [21]:
df = pd.merge(df,rfm_df[['ID','RFM_Score','Customer_segment']],how='left',on='ID')

In [22]:
df

Unnamed: 0,ID,Education,Marital_Status,Income,Kidhome,Teenhome,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,age,RFM_Score,Customer_segment
0,5524,Graduation,Single,58138.0,0,0,7,0,0,0,0,0,0,1,57,4.10,High value Customer
1,2174,Graduation,Single,46344.0,1,1,5,0,0,0,0,0,0,0,60,1.12,Lost Customers
2,4141,Graduation,Together,71613.0,0,0,4,0,0,0,0,0,0,0,49,3.74,Medium Value Customer
3,6182,Graduation,Together,26646.0,1,0,6,0,0,0,0,0,0,0,30,1.70,Low Value Customers
4,5324,PhD,Married,58293.0,1,0,5,0,0,0,0,0,0,0,33,2.86,Low Value Customers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,Graduation,Married,61223.0,0,1,5,0,0,0,0,0,0,0,47,3.06,Medium Value Customer
2236,4001,PhD,Together,64014.0,2,1,7,0,0,0,1,0,0,0,68,3.69,Medium Value Customer
2237,7270,Graduation,Divorced,56981.0,0,0,6,0,1,0,0,0,0,0,33,2.89,Low Value Customers
2238,8235,Master,Together,69245.0,0,1,3,0,0,0,0,0,0,0,58,4.23,High value Customer


In [23]:
df['Childern_count'] = df['Kidhome']+df['Teenhome']

In [24]:
df = df[['ID','Education', 'Marital_Status','age','Income','Childern_count','NumWebVisitsMonth','Complain','RFM_Score','Customer_segment',
        'AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','Response']]

In [26]:
# The main objective is to train a predictive model which allows the company to maximize the profit of the next marketing campaign

In [27]:
df.isna().sum()

ID                    0
Education             0
Marital_Status        0
age                   0
Income               24
Childern_count        0
NumWebVisitsMonth     0
Complain              0
RFM_Score             0
Customer_segment      0
AcceptedCmp1          0
AcceptedCmp2          0
AcceptedCmp3          0
AcceptedCmp4          0
AcceptedCmp5          0
Response              0
dtype: int64

In [51]:
df['Education'] = df['Education'].replace('Basic','Others')
df['Education'] = df['Education'].replace('2n Cycle','Others')

In [52]:
df['Marital_Status'] = df['Marital_Status'].replace("Together","Married")
df['Marital_Status'] = df['Marital_Status'].replace("Alone","Single")
df['Marital_Status'] = df['Marital_Status'].replace(['Absurd','YOLO'],"Others")

In [53]:
df['Education'].unique()

array(['Graduation', 'PhD', 'Master', 'Others'], dtype=object)

In [54]:
df['Marital_Status'].unique()

array(['Single', 'Married', 'Divorced', 'Widow', 'Others'], dtype=object)

In [80]:
df.rename(columns={'AcceptedCmp1':'Campaign_1_Status','AcceptedCmp2':'Campaign_2_Status','AcceptedCmp3':'Campaign_3_Status',
                  'AcceptedCmp4':'Campaign_4_Status','AcceptedCmp5':'Campaign_5_Status'},inplace=True)

In [81]:
df.to_csv("Marketing_campaign_edited.csv",index=False)

In [82]:
def data_preparation(data):
    data =  data.drop(['Customer_segment','ID'],axis=1)
    num_col= data.select_dtypes(['int','float']).columns
    cat_col= data.select_dtypes(['object']).columns
    Label_encoder = LabelEncoder()
    data[cat_col[0]]= Label_encoder.fit_transform(data[cat_col[0]])
    data[cat_col[1]]= Label_encoder.fit_transform(data[cat_col[1]])
    Imputer = IterativeImputer()
    data = pd.DataFrame(Imputer.fit_transform(data),columns=data.columns)
    return data

In [83]:
model_data = data_preparation(df)

In [64]:
# 0:Graduation
# 1:Master
# 2:Others
# 3:PhD

In [65]:
# 0:Divorced
# 1:Married
# 2:Others
# 3:Single
# 4:Widow

In [84]:
model_data

Unnamed: 0,Education,Marital_Status,age,Income,Childern_count,NumWebVisitsMonth,Complain,RFM_Score,Campaign_1_Status,Campaign_2_Status,Campaign_3_Status,Campaign_4_Status,Campaign_5_Status,Response
0,0.0,3.0,57.0,58138.0,0.0,7.0,0.0,4.10,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,3.0,60.0,46344.0,2.0,5.0,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,49.0,71613.0,0.0,4.0,0.0,3.74,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,30.0,26646.0,1.0,6.0,0.0,1.70,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,1.0,33.0,58293.0,1.0,5.0,0.0,2.86,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,0.0,1.0,47.0,61223.0,1.0,5.0,0.0,3.06,0.0,0.0,0.0,0.0,0.0,0.0
2236,3.0,1.0,68.0,64014.0,3.0,7.0,0.0,3.69,1.0,0.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,33.0,56981.0,0.0,6.0,0.0,2.89,0.0,0.0,0.0,1.0,0.0,0.0
2238,1.0,1.0,58.0,69245.0,1.0,3.0,0.0,4.23,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
X = model_data.drop('Response',axis=1)
y = model_data['Response']

In [86]:
Rf_model = RandomForestClassifier(random_state=11,n_estimators=500)
Rf_model.fit(X,y)
model_data['Pred'] = Rf_model.predict(X)
print("Accuracy",accuracy_score(y,model_data['Pred']))
print("Confusion Matrix",confusion_matrix(y,model_data['Pred']))

Accuracy 0.9915178571428571
Confusion Matrix [[1902    4]
 [  15  319]]


In [87]:
model_data['Response'].value_counts()

0.0    1906
1.0     334
Name: Response, dtype: int64

In [88]:
import pickle
filename = "Campaign_Prediction_model.sav"
pickle.dump(Rf_model, open(filename, 'wb'))

In [114]:
a = df[df['Response']==0].sample(4)
b = df[df['Response']==1].sample(4)
test_df = pd.concat([a,b],axis=0)
test_df.drop(['ID','Customer_segment','Response'],axis=1,inplace=True)
test_df.to_csv("Campaign_prediction_test.csv",index=False)

In [112]:
test_df

Unnamed: 0,Education,Marital_Status,age,Income,Childern_count,NumWebVisitsMonth,Complain,RFM_Score,Campaign_1_Status,Campaign_2_Status,Campaign_3_Status,Campaign_4_Status,Campaign_5_Status
875,Graduation,Married,61,80812.0,0,2,0,2.69,1,0,1,0,1
598,Others,Married,36,26224.0,1,6,0,1.78,0,0,0,0,0
1840,PhD,Married,32,32313.0,1,9,0,1.94,0,0,0,0,0
924,Graduation,Married,28,83033.0,1,5,0,4.05,1,0,0,0,0
524,Graduation,Married,27,25545.0,1,6,0,1.99,0,0,0,0,0
1829,Master,Married,39,22669.0,1,9,0,2.02,0,0,0,0,0
1129,Others,Divorced,62,63998.0,0,4,0,4.23,0,0,0,0,0
1659,Graduation,Single,33,72066.0,0,2,0,2.8,1,0,0,0,1


In [113]:
X

Unnamed: 0,Education,Marital_Status,age,Income,Childern_count,NumWebVisitsMonth,Complain,RFM_Score,Campaign_1_Status,Campaign_2_Status,Campaign_3_Status,Campaign_4_Status,Campaign_5_Status
0,0.0,3.0,57.0,58138.0,0.0,7.0,0.0,4.10,0.0,0.0,0.0,0.0,0.0
1,0.0,3.0,60.0,46344.0,2.0,5.0,0.0,1.12,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,49.0,71613.0,0.0,4.0,0.0,3.74,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,30.0,26646.0,1.0,6.0,0.0,1.70,0.0,0.0,0.0,0.0,0.0
4,3.0,1.0,33.0,58293.0,1.0,5.0,0.0,2.86,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,0.0,1.0,47.0,61223.0,1.0,5.0,0.0,3.06,0.0,0.0,0.0,0.0,0.0
2236,3.0,1.0,68.0,64014.0,3.0,7.0,0.0,3.69,1.0,0.0,0.0,0.0,0.0
2237,0.0,0.0,33.0,56981.0,0.0,6.0,0.0,2.89,0.0,0.0,0.0,1.0,0.0
2238,1.0,1.0,58.0,69245.0,1.0,3.0,0.0,4.23,0.0,0.0,0.0,0.0,0.0
