# Applying Machine Learning Algorithms

In [1]:
# References:
# https://thecleverprogrammer.com/2021/02/08/customer-personality-analysis-with-python/
# https://towardsdatascience.com/understanding-consumer-behavior-with-the-market-basket-analysis-3d0c017e5613

In [2]:
# Uncomment to install the association-rule mining dependency if it is missing:
# %pip install mlxtend

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore") 
from sklearn.preprocessing import StandardScaler, normalize

In [4]:
from datetime import date
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## 1. Loading the data

In [5]:
# Read the marketing dataset from the CSV file in the working directory.
marketing_data = pd.read_csv(filepath_or_buffer="marketing_data.csv")

In [6]:
# Preview the first five rows of the loaded data.
marketing_data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Total_Expenses,Total_Purchases,Total_accepted_campaign,Total_kids_home
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,3,11,1,65,1617,25,1,0
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,3,11,0,68,27,6,0,2
2,4141,1965,Graduation,Couple,71613.0,0,0,2013-08-21,26,426,...,0,0,3,11,0,57,776,21,0,0
3,6182,1984,Graduation,Couple,26646.0,1,0,2014-02-10,26,11,...,0,0,3,11,0,38,53,8,0,1
4,5324,1981,PhD,Couple,58293.0,1,0,2014-01-19,94,173,...,0,0,3,11,0,41,422,19,0,1


In [7]:
# (rows, columns) of the loaded frame.
marketing_data.shape

(2236, 34)

In [8]:
# Confirm the loaded object is a pandas DataFrame.
type(marketing_data)

pandas.core.frame.DataFrame

## 2. Preprocessing the Data

### 2.1 Creating Age segment

In [9]:
# Age band edges and their names, consumed by pd.cut in the next cell.
# With pd.cut's default right-closed intervals the bands are
# (0, 30] Young, (30, 45] Adult, (45, 65] Mature, (65, 100] Senior.
cut_bins = [
    0,
    30,
    45,
    65,
    100,
]
cut_labels_Age = [
    'Young',
    'Adult',
    'Mature',
    'Senior',
]

In [10]:
# Assign every customer to a named age band based on the 'Age' column.
customer_ages = marketing_data['Age']
marketing_data['Age_group'] = pd.cut(x=customer_ages, bins=cut_bins, labels=cut_labels_Age)

In [11]:
# Spot-check the age bands assigned to the first five customers.
marketing_data['Age_group'].head()

0    Mature
1    Senior
2    Mature
3     Adult
4     Adult
Name: Age_group, dtype: category
Categories (4, object): ['Young' < 'Adult' < 'Mature' < 'Senior']

### 2.2 Creating Income Segment 

In [12]:
# Income quartile names, ordered from the lowest quartile to the highest.
cut_labels_Income = [
    'Low income',
    'Low to medium income',
    'Medium to high income',
    'High income',
]
# Split customers into four equally populated income bins and label them.
income_quartiles = pd.qcut(x=marketing_data['Income'], q=4, labels=cut_labels_Income)
marketing_data['Income_group'] = income_quartiles

In [13]:
# Spot-check the income quartiles assigned to the first five customers.
marketing_data['Income_group'].head()

0    Medium to high income
1     Low to medium income
2              High income
3               Low income
4    Medium to high income
Name: Income_group, dtype: category
Categories (4, object): ['Low income' < 'Low to medium income' < 'Medium to high income' < 'High income']

### 2.3 Creating Seniority segment

In [14]:
# Reference date against which each customer's tenure is measured.
last_date = date(2015, 10, 4)

# Dt_Customer is stored as ISO 'YYYY-MM-DD' text, so the explicit format is all
# that is needed (the former `dayfirst=True` contradicted it and had no effect).
enrolment_dates = pd.to_datetime(marketing_data['Dt_Customer'], format='%Y-%m-%d')

# Tenure in months, approximated as elapsed days / 30. The subtraction is done
# on the whole column at once instead of row by row through `apply`; the result
# is a float64 column, exactly as before.
marketing_data['Seniority'] = (pd.Timestamp(last_date) - enrolment_dates).dt.days / 30

In [15]:
# Spot-check the tenure (in ~30-day months) of the first five customers.
marketing_data['Seniority'].head()

0    37.500000
1    19.166667
2    25.800000
3    20.033333
4    20.766667
Name: Seniority, dtype: float64

In [16]:
# Tenure quartile names, ordered from the shortest tenure to the longest.
cut_labels_Seniority = [
    'New customers',
    'Discovering customers',
    'Experienced customers',
    'Old customers',
]
# Split customers into four equally populated tenure bins and label them.
seniority_quartiles = pd.qcut(x=marketing_data['Seniority'], q=4, labels=cut_labels_Seniority)
marketing_data['Seniority_group'] = seniority_quartiles

In [17]:
# Spot-check the tenure quartiles assigned to the first five customers.
marketing_data['Seniority_group'].head()

0            Old customers
1            New customers
2    Discovering customers
3            New customers
4            New customers
Name: Seniority_group, dtype: category
Categories (4, object): ['New customers' < 'Discovering customers' < 'Experienced customers' < 'Old customers']

### 2.4 Defining new segments according to the spending of customers on each product

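- Non consumer: no spending on the product
- Low consumer: bottom 25% of the product's buyers by amount spent
- Frequent consumer: middle 50% of the product's buyers
- Biggest consumer: top 25% of the product's buyers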

In [18]:
# Spending tier names shared by every product family, lowest tier first.
cut_labels = [
    'Low consumer',
    'Frequent consumer',
    'Biggest consumer',
]

In [19]:
# Segment column to create -> spending column it is derived from.
segment_sources = {
    'Wines_segment': 'MntWines',
    'Fruits_segment': 'MntFruits',
    'Meat_segment': 'MntMeatProducts',
    'Fish_segment': 'MntFishProducts',
    'Sweets_segment': 'MntSweetProducts',
    'Gold_segment': 'MntGoldProds',
}

for segment_col, amount_col in segment_sources.items():
    amounts = marketing_data[amount_col]
    # Rank only the customers who actually bought the product: bottom 25%,
    # middle 50% and top 25% of spenders receive the three tier labels.
    buyer_tiers = pd.qcut(
        amounts[amounts > 0],
        q=[0, .25, .75, 1],
        labels=cut_labels,
    ).astype("object")
    # Customers absent from `buyer_tiers` spent nothing on this product. Only
    # this segment column is filled: the previous whole-frame NaN replacement
    # would also have relabelled missing values in unrelated columns.
    marketing_data[segment_col] = buyer_tiers.reindex(
        marketing_data.index, fill_value="Non consumer"
    )

In [20]:
# Prefix each tier label with its product family (e.g. 'Wines_Low consumer')
# so that labels from different products stay distinct.
for family in ('Wines', 'Fruits', 'Meat', 'Fish', 'Sweets', 'Gold'):
    segment_col = family + '_segment'
    marketing_data[segment_col] = [
        family + '_' + str(label) for label in marketing_data[segment_col]
    ]

In [21]:
# Raw inputs and intermediate numeric features that have been replaced by the
# categorical segments built above.
columns_to_discard = [
    'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
    'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
    'Age', 'Total_Expenses', 'Total_Purchases', 'Total_accepted_campaign',
    'Total_kids_home', 'Seniority',
]
# Remove them from the frame in place, leaving the ID and the segment columns.
marketing_data.drop(columns=columns_to_discard, inplace=True)


In [22]:
# Cast every remaining column to the generic object dtype.
# NOTE(review): presumably so categorical labels behave as plain values when the
# rows are turned into item lists below - confirm.
marketing_data = marketing_data.astype(dtype=object)

In [23]:
# Preview the first five rows after segmentation and column removal.
marketing_data.head()

Unnamed: 0,ID,Education,Marital_Status,Age_group,Income_group,Seniority_group,Wines_segment,Fruits_segment,Meat_segment,Fish_segment,Sweets_segment,Gold_segment
0,5524,Graduation,Single,Mature,Medium to high income,Old customers,Wines_Biggest consumer,Fruits_Biggest consumer,Meat_Biggest consumer,Fish_Biggest consumer,Sweets_Biggest consumer,Gold_Biggest consumer
1,2174,Graduation,Single,Senior,Low to medium income,New customers,Wines_Low consumer,Fruits_Low consumer,Meat_Low consumer,Fish_Low consumer,Sweets_Low consumer,Gold_Low consumer
2,4141,Graduation,Couple,Mature,High income,Discovering customers,Wines_Frequent consumer,Fruits_Biggest consumer,Meat_Frequent consumer,Fish_Biggest consumer,Sweets_Frequent consumer,Gold_Frequent consumer
3,6182,Graduation,Couple,Adult,Low income,New customers,Wines_Low consumer,Fruits_Low consumer,Meat_Frequent consumer,Fish_Frequent consumer,Sweets_Low consumer,Gold_Low consumer
4,5324,PhD,Couple,Adult,Medium to high income,New customers,Wines_Frequent consumer,Fruits_Frequent consumer,Meat_Frequent consumer,Fish_Frequent consumer,Sweets_Frequent consumer,Gold_Frequent consumer


In [24]:
# (rows, columns) of the frame after segmentation and column removal.
marketing_data.shape

(2236, 12)

## 3. Applying Apriori Algorithm

In [25]:
# Capture the remaining column names as a plain list and display it.
list_of_cols = marketing_data.columns.tolist()
list_of_cols

['ID',
 'Education',
 'Marital_Status',
 'Age_group',
 'Income_group',
 'Seniority_group',
 'Wines_segment',
 'Fruits_segment',
 'Meat_segment',
 'Fish_segment',
 'Sweets_segment',
 'Gold_segment']

In [26]:
# Build one "transaction" per customer: the ID maps to the list of that row's
# values in every column after the first.
#
# The columns are taken with an explicit positional slice (.iloc) because the
# previous `row[cols]` lookup used an integer key on a label-indexed Series,
# which relies on pandas' deprecated positional fallback. Building the mapping
# from whole columns also avoids a Python-level loop over `iterrows()`.
# If an ID occurs more than once, the last row wins, as it did with the loop.
item_values = marketing_data.iloc[:, 1:len(list_of_cols)].values.tolist()
item_dict = dict(zip(marketing_data['ID'], item_values))

In [27]:
# Gather the per-customer item lists (the dictionary values, in insertion
# order) into the list of transactions to encode.
itemsets = [items for items in item_dict.values()]

In [28]:
# Inspect the first transaction.
itemsets[0]

['Graduation',
 'Single',
 'Mature',
 'Medium to high income',
 'Old customers',
 'Wines_Biggest consumer',
 'Fruits_Biggest consumer',
 'Meat_Biggest consumer',
 'Fish_Biggest consumer',
 'Sweets_Biggest consumer',
 'Gold_Biggest consumer']

In [29]:
# Number of transactions (one per distinct ID key in item_dict).
len(itemsets)

2236

In [30]:
# One-hot encode the list of itemsets: first learn the set of distinct items,
# then turn every transaction into a row of True/False flags.
t_encoder = TransactionEncoder()
t_encoder.fit(itemsets)
encoded_itemset = t_encoder.transform(itemsets)

In [31]:
# Wrap the encoded matrix in a DataFrame with one column per item name,
# then preview its first five rows.
item_df = pd.DataFrame(data=encoded_itemset, columns=t_encoder.columns_)
item_df.head()

Unnamed: 0,2n Cycle,Adult,Basic,Couple,Discovering customers,Divorced,Experienced customers,Fish_Biggest consumer,Fish_Frequent consumer,Fish_Low consumer,...,Sweets_Biggest consumer,Sweets_Frequent consumer,Sweets_Low consumer,Sweets_Non consumer,Widow,Wines_Biggest consumer,Wines_Frequent consumer,Wines_Low consumer,Wines_Non consumer,Young
0,False,False,False,False,False,False,False,True,False,False,...,True,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,False
2,False,False,False,True,True,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,False
3,False,True,False,True,False,False,False,False,True,False,...,False,False,True,False,False,False,False,True,False,False
4,False,True,False,True,False,False,False,False,True,False,...,False,True,False,False,False,False,True,False,False,False


In [32]:
# Run the Apriori algorithm, keeping the itemsets whose support meets the 0.1
# minimum, and report them by item name rather than by column index.
itemsets_apriori = apriori(
    item_df,
    min_support=0.1,
    use_colnames=True,
)
# Show the first ten frequent itemsets.
itemsets_apriori.head(n=10)

Unnamed: 0,support,itemsets
0,0.25805,(Adult)
1,0.644902,(Couple)
2,0.251342,(Discovering customers)
3,0.103309,(Divorced)
4,0.250894,(Experienced customers)
5,0.206172,(Fish_Biggest consumer)
6,0.404293,(Fish_Frequent consumer)
7,0.2178,(Fish_Low consumer)
8,0.171735,(Fish_Non consumer)
9,0.202147,(Fruits_Biggest consumer)


In [33]:
# Derive association rules from the frequent itemsets, keeping those whose
# lift is at least 1.
asso_rules = association_rules(
    itemsets_apriori,
    metric='lift',
    min_threshold=1,
)
asso_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Couple),(Adult),0.644902,0.258050,0.170841,0.264910,1.026583,0.004424,1.009332
1,(Adult),(Couple),0.258050,0.644902,0.170841,0.662045,1.026583,0.004424,1.050727
2,(Adult),(Fish_Frequent consumer),0.258050,0.404293,0.106440,0.412478,1.020245,0.002112,1.013931
3,(Fish_Frequent consumer),(Adult),0.404293,0.258050,0.106440,0.263274,1.020245,0.002112,1.007091
4,(Fruits_Frequent consumer),(Adult),0.394902,0.258050,0.101968,0.258211,1.000622,0.000063,1.000216
...,...,...,...,...,...,...,...,...,...
1039,"(Sweets_Frequent consumer, Gold_Frequent consu...","(Meat_Frequent consumer, Wines_Frequent consumer)",0.231216,0.323792,0.100626,0.435203,1.344080,0.025760,1.197258
1040,(Meat_Frequent consumer),"(Wines_Frequent consumer, Sweets_Frequent cons...",0.487030,0.120304,0.100626,0.206612,1.717411,0.042034,1.108783
1041,(Wines_Frequent consumer),"(Meat_Frequent consumer, Sweets_Frequent consu...",0.497317,0.152952,0.100626,0.202338,1.322889,0.024561,1.061914
1042,(Sweets_Frequent consumer),"(Meat_Frequent consumer, Wines_Frequent consum...",0.402952,0.184258,0.100626,0.249723,1.355290,0.026379,1.087254


In [34]:
# Find the rules that lead to a customer being a biggest consumer of wines

In [35]:
# Product family and spending tier whose drivers we want to inspect.
product = 'Wines'
segment = 'Biggest consumer'

# Textual form of the one-item set; no longer used for matching but left
# defined in case a later cell still reads it.
target = '{\'%s_%s\'}' % (product, segment)

# Item label exactly as it appears in the encoded transactions.
target_item = '%s_%s' % (product, segment)
wanted_consequent = frozenset([target_item])

# Keep the rules whose consequent is exactly the target item. Comparing the
# frozensets directly selects the same single-item consequents as the earlier
# substring search on their string form, without depending on how a frozenset
# is printed or on the pattern being interpreted as a regular expression.
is_target = asso_rules['consequents'].map(
    lambda consequent: consequent == wanted_consequent
).astype(bool)

# Most reliable rules first.
results = asso_rules[is_target].sort_values(by='confidence', ascending=False)
results.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
770,"(Meat_Biggest consumer, High income)",(Wines_Biggest consumer),0.189624,0.248211,0.119857,0.632075,2.546524,0.07279,2.043324
240,(High income),(Wines_Biggest consumer),0.25,0.248211,0.150268,0.601073,2.421622,0.088216,1.884529
272,(Meat_Biggest consumer),(Wines_Biggest consumer),0.249106,0.248211,0.144454,0.579892,2.336287,0.082624,1.789515
301,(Sweets_Biggest consumer),(Wines_Biggest consumer),0.199911,0.248211,0.101521,0.50783,2.04596,0.051901,1.527498
134,(Fruits_Biggest consumer),(Wines_Biggest consumer),0.202147,0.248211,0.101073,0.5,2.014414,0.050898,1.503578
