In [363]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
# SettingWithCopyWarning is exposed from pandas.errors on current pandas
# releases; pandas.core.common only provides it on older ones. Try both so the
# cell survives a fresh kernel on either, and skip the filter if neither has it.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

warnings.simplefilter(action='ignore', category=FutureWarning)
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


# Data Preprocessing and Cleaning

In [364]:
# Read the tab-separated marketing campaign file into a DataFrame.
data = pd.read_csv(
    '../input/customer-personality-analysis/marketing_campaign.csv',
    sep='\t',
)


In [365]:
# Preview ten random rows; the fixed seed keeps the preview identical across
# re-runs so the surrounding commentary always matches what is displayed.
data.sample(10, random_state=42)

In [366]:
# Print column dtypes and non-null counts to spot missing values and wrongly typed columns.
data.info()

In [367]:
# Summary statistics of the numeric columns; useful for spotting implausible minima/maxima.
data.describe()

In [368]:
# Likely outliers spotted in the summary above: an age of about 128 and an
# income of 666666. Display the rows whose income exceeds 600000.
data[data['Income']>600000]

In [369]:
# Drop the extreme-income rows but keep rows whose Income is missing: a bare
# `Income < 600000` mask evaluates to False for NaN and would silently discard
# those rows before the median imputation performed later in the notebook.
# `.copy()` makes `data` an independent frame, so later column assignments and
# drops do not operate on a slice of the originally loaded frame.
data = data[data['Income'].isna() | (data['Income'] < 600000)].copy()

In [370]:
# List the column names that remain after the outlier filter.
data.columns

In [371]:
# (rows, columns) after the outlier filter.
data.shape

In [372]:
# Remove the Z_CostContact and Z_Revenue columns from the working frame.
data = data.drop(columns=['Z_CostContact', 'Z_Revenue'])

In [373]:
# Number of distinct values per column (helps tell categorical from continuous features).
data.nunique()

In [374]:
# Convert the Dt_Customer strings to datetime values.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the
# layout; confirm the raw strings are not day-first before relying on the dates.
data = data.assign(Dt_Customer=pd.to_datetime(data['Dt_Customer']))

In [375]:
# Count the missing values in each column.
print(data.isna().sum())

In [376]:
# Replace missing Income values with the median of the Income column.
data = data.fillna({'Income': data['Income'].median()})

In [377]:
# Derive Age as of 2021 from the birth year, then discard the raw Year_Birth column.
data = (
    data
    .assign(Age=2021 - data["Year_Birth"])
    .drop(columns=['Year_Birth'])
)

In [378]:
# Ages above 90 are unlikely but not impossible, so these rows are only
# displayed here for inspection.
data[data['Age'] > 90]

In [379]:
# Count plot of customer ages: one bar per distinct Age value.
plt.figure(figsize=(18, 9))
sns.countplot(x='Age', data=data)
plt.title("AGE OF THE Customers", size=20)
# Render the figure explicitly; a bare plt.plot() adds nothing to the axes and
# only echoes an empty list as the cell output.
plt.show()

In [380]:
# Replacing the age with the age groups


def age_category(age):
    """Map an age in years to an ordinal age-group code.

    Groups:
        0 -> 20 to 30 (inclusive)
        1 -> over 30, up to 40
        2 -> over 40, up to 50
        3 -> over 50, up to 60
        4 -> over 60, up to 70
        5 -> over 70

    Args:
        age: Age in years (int or float).

    Returns:
        The group code 0-5. Values below 20, and values that compare False
        against every bound (e.g. NaN), are returned unchanged; as a result,
        applying the function again to already-coded values leaves them as-is.
    """
    if 20 <= age <= 30:
        return 0
    if 30 < age <= 40:
        return 1
    if 40 < age <= 50:
        return 2
    if 50 < age <= 60:
        return 3
    if 60 < age <= 70:
        return 4
    # The last group starts right after 70. A `> 75` bound leaves ages 71-75
    # unmatched, so they would come back as raw ages mixed in with the codes.
    if age > 70:
        return 5
    return age

# Swap each raw age for its age-group code, then list the codes that occur.
data['Age'] = data['Age'].apply(age_category)
data['Age'].unique()

In [381]:
# Combine the kid and teen counts into a single Children feature and drop the two source columns.
data = (
    data
    .assign(Children=data['Kidhome'] + data['Teenhome'])
    .drop(columns=['Kidhome', 'Teenhome'])
)

In [382]:
# Frequency of each raw Education level, to decide how to regroup them.
data['Education'].value_counts()

In [383]:
# Regroup Education into three levels: Under_Graduated, Graduated and Post_Graduated.
data['Education'] = (
    data['Education']
    .replace(['PhD', 'Master'], 'Post_Graduated')
    .replace(['Basic', '2n Cycle'], 'Under_Graduated')
    .replace(['Graduation'], 'Graduated')
)
data['Education'].value_counts()

In [384]:
# Frequency of each raw Marital_Status value, to decide how to regroup them.
data['Marital_Status'].value_counts()

In [385]:
# Regroup Marital_Status into two values: Not_Single (Married, Together) and
# Single (every other listed status).
data['Marital_Status'] = (
    data['Marital_Status']
    .replace(['Married', 'Together'], 'Not_Single')
    .replace(['Single', 'Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO'],
             'Single')
)
data['Marital_Status'].value_counts()

In [386]:
# Box plot of Income to eyeball its spread and any remaining high-income points.
sns.boxplot(x='Income', data =data)

In [387]:
# Roll the per-category spend columns up into one Expense total, then remove
# every column whose name starts with 'Mnt'.
cols = [name for name in data.columns if str(name).startswith('Mnt')]
print(cols)
data['Expense'] = data.loc[:, ['MntWines',
                                'MntFruits',
                                'MntMeatProducts',
                                'MntFishProducts',
                                'MntSweetProducts',
                                'MntGoldProds']].sum(axis=1)
data = data.drop(columns=cols)
data['Expense']

In [388]:
# Check the frame after the spend columns were folded into Expense.
data.head()

In [389]:
# Sum every column whose name starts with 'Num' into Total_purchase, then drop
# those source columns.
# NOTE(review): the prefix match takes in any 'Num*' column, not only purchase
# counts; confirm that all matched columns really are purchase counts.
cols_purchase = [name for name in data.columns if str(name).startswith('Num')]
data['Total_purchase'] = data[cols_purchase].sum(axis=1)
data = data.drop(columns=cols_purchase)

In [390]:
# Drop the ID column, then snapshot the frame as train_data while it still
# holds the individual campaign-response columns used later as model targets.
data = data.drop(columns=['ID'])
train_data = data.copy()

In [391]:
# Add up all 'Accept*' columns plus 'Response' into a single Responses count,
# then drop the individual columns.
cols_response = [name for name in data.columns if str(name).startswith('Accept')]
cols_response.append('Response')
data['Responses'] = data[cols_response].sum(axis=1)
data = data.drop(columns=cols_response)

# if there is any responses it would be 1 otherwise it would be 0


def encode(num):
    """Binarise a count: return 1 when `num` is at least 1, otherwise 0."""
    return 1 if num >= 1 else 0


# Turn the response count into a 0/1 flag and show how many customers fall in each class.
data['Responses'] = data['Responses'].map(encode)
data['Responses'].value_counts()

In [392]:
# Check the frame after the feature engineering steps above.
data.head()

In [393]:
# Correlation heatmap of the numeric features.
# `data` still holds string columns (Education, Marital_Status) and the
# Dt_Customer datetime column at this point; calling corr() on the whole frame
# fails on pandas >= 2.0, which no longer drops non-numeric columns silently.
# Selecting the numeric columns explicitly works on old and new pandas alike.
plt.figure(figsize= (15,15))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

# Clustering Based on Income and Expenditure

In [394]:
# One-hot encode the string-valued columns of `data` (e.g. Education and Marital_Status).
df = pd.get_dummies(data)

In [395]:
# The Dt_Customer datetime column cannot be fed to the scaler, so remove it.
df = df.drop(columns=['Dt_Customer'])

In [396]:
# K-means is distance based, so bring every feature onto a comparable scale first.
from sklearn.preprocessing import RobustScaler
Scaled = RobustScaler().fit(df).transform(df)


In [397]:
# Wrap the scaled array back into a DataFrame, reusing df's column names.
# No index is passed, so the result gets a fresh default index.
Scaled = pd.DataFrame(Scaled, columns=df.columns)

In [398]:
# Check the scaled feature frame.
Scaled.head()

In [399]:
# Find a reasonable number of clusters for Income/Expense with the elbow method.
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# k-means++ seeds the initial centroids spread apart from one another rather
# than picking them uniformly at random, which usually leads to better and
# faster-converging solutions. The seed is set on the estimator itself: KMeans
# is the component that draws random numbers, so fixing it there is what makes
# the elbow curve reproducible across runs.
model = KMeans(init='k-means++', random_state=42)
visualizer = KElbowVisualizer(model, k=10)

visualizer.fit(Scaled[['Income',  'Expense']])
visualizer.show()

In [400]:
# Fit the final 4-cluster model on the scaled Income/Expense features and
# assign every customer to a cluster.
model = KMeans(n_clusters=4,
               init='k-means++',
               random_state=42).fit(Scaled[['Income', 'Expense']])
preds = model.predict(Scaled[['Income', 'Expense']])
# Take an explicit copy before adding the label column: writing a new column
# into a bare column selection of `Scaled` is a chained assignment, which
# triggers SettingWithCopyWarning.
dt_Kmeans = Scaled[['Income', 'Expense']].copy()
dt_Kmeans['Cluster'] = preds

In [401]:
# Scatter plot of the scaled Income and Expense values, coloured by assigned cluster.
plt.figure(figsize=(10, 7))
sns.scatterplot(data=dt_Kmeans, x='Income', y='Expense', hue='Cluster')
plt.xlabel('Income', fontsize=20, labelpad=20)
plt.ylabel('Total Expense', fontsize=20, labelpad=20)

# Q) What is the key business takeaway of the recommendation ?
    
> 1. **group 0: low spending & low income**
> 2. **group 1: high spending & high income**
> 3. **group 2: average income & average spending**
> 4. **group 3: high spending & low income**

> **The clusters are based on the income and expense of the customer, e.g. high-income and high-expense customers, or high-income and low-spending customers.**

> **We can use this to run targeted campaigns informed by domain knowledge. Example: higher-income customers tend to have higher expenses.**

In [402]:
import xgboost as xgb

In [403]:
# train_data is the snapshot taken before the campaign columns were collapsed,
# so the individual AcceptedCmp*/Response columns are still present here.
train_data.head()

In [404]:
# Remove the Dt_Customer column from the modelling frame.
train_data = train_data.drop(columns=['Dt_Customer'])

In [405]:
# One-hot encode the string-valued columns of train_data so every feature is numeric.
train_data = pd.get_dummies(train_data)

In [406]:
# Targets: the five campaign-acceptance flags plus the Response flag.
Y = train_data[['AcceptedCmp1',
                'AcceptedCmp2',
                'AcceptedCmp3',
                'AcceptedCmp4',
                'AcceptedCmp5',
                'Response']]

# Features: every remaining column of train_data.
X = train_data.drop(columns=list(Y.columns))

In [407]:
# One target Series per column of Y, in column order.
class_list = [Y.iloc[:, position] for position in range(Y.shape[1])]

In [408]:
import shap
from sklearn.model_selection import train_test_split

# Booster settings shared by every per-target model.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost runtime;
# confirm the environment provides one.
params = {'eta': 0.002,
          'max_depth': 5,
          'objective': 'binary:logistic',
          'subsample': 0.5,
          'tree_method': 'gpu_hist',
          'eval_metric': 'logloss'}

# Train one binary classifier per target column and keep its SHAP values,
# computed over the full feature matrix X, in target order.
shap_values_list = []
for i in range(len(class_list)):
    X_train, X_test, y_train, y_test = train_test_split(
        X, class_list[i], test_size=0.10, random_state=42)
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_test = xgb.DMatrix(X_test, label=y_test)
    # The held-out split is only used to report log-loss every 1000 rounds.
    model_train = xgb.train(params,
                            xgb_train,
                            num_boost_round=10000,
                            evals=[(xgb_test, "test")],
                            verbose_eval=1000)
    shap_values = shap.TreeExplainer(model_train).shap_values(X)
    shap_values_list.append(shap_values)

In [409]:
# One SHAP summary plot per target. shap_values_list was filled in the same
# order as Y's columns, so the two are paired up directly.
for target_name, target_shap_values in zip(Y.columns, shap_values_list):
    # Label each plot with the real target name (the targets are
    # AcceptedCmp1..AcceptedCmp5 and Response, not AcceptedCmp0..5).
    print("Shap values for {} ".format(target_name))
    # summary_plot expects the SHAP values first and the feature matrix second;
    # each target gets its own SHAP values rather than the array left over
    # from the last training iteration.
    shap.summary_plot(target_shap_values,
                      X,
                      feature_names=X.columns)

# Observation:
**Based on my model, we should focus on Recency (i.e. complaints), Expenses, and Income of customers when campaigning.
Some clusters show that there are high-paying customers who tend to have higher expenses. Based on all the findings, we can run the campaign more efficiently and save a lot of money.
If a customer complained in the last 2 years, there is less chance of offer acceptance, as they are already unhappy with the product or service.**

**This code is not production-ready and can be improved further.**

**It should have two functions**.
    
    1. Function-1 
    Should include entire pipeline, from data preprocessing to making final predictions.
    It should take in raw data as input.
    It should return predictions for your input. Here the input can be a single point or a set of points.
    def final_fun_1(X):
    .....
    .....
    ..... # will use the best model that you found out with your experiments
    return predictions made on X ( Raw Data)

    2. Function-2
    Should include entire pipeline, from data preprocessing to making final predictions.
    It should take in raw data as input along with its target values.
    It should return the metric value that you are judging your models on.
    def final_fun_2(X,Y):
    .....
    .....
    ..... # will use the best model that you found out with your experiments
    return final_metric computed on X ( Raw Data) and Y (target variable)
