**Imports**

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sys
import plotly.express as px
import plotly as py

from collections import Counter

!pip install termcolor
from termcolor import colored

import warnings
warnings.filterwarnings("ignore")
py.offline.init_notebook_mode(connected = True)

import datetime as dt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score

**Read the CSV file**

In [28]:
# Load the raw orders export into a DataFrame.
# NOTE(review): the absolute /content/ path looks session-specific - confirm it before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/Assessment exercise dataset - orders.csv'
)

**Data Preparation**

In [29]:
# Restrict the orders to the Breakfast cuisine
breakfast_data = df[df['cuisine'] == 'Breakfast']

# Breakfast orders per city (count of order_id), stored as Count_of_orders
# and sorted from the busiest city downwards
orders_per_city = (
    breakfast_data
    .groupby('city')
    .agg({'order_id': 'count'})
    .rename(columns={'order_id': 'Count_of_orders'})
    .sort_values(by='Count_of_orders', ascending=False)
)

# Only cities with more than 1000 Breakfast orders are kept
orders_per_city = orders_per_city[orders_per_city['Count_of_orders'] > 1000]

# The inner join on city keeps just the Breakfast orders placed in those cities;
# the helper column Count_of_orders is dropped again because it was only
# needed for the filter.
final_data = (
    pd.merge(breakfast_data, orders_per_city, on='city', how='inner')
    .drop(['Count_of_orders'], axis=1)
)

In [None]:
# Descriptives
def summary(final_data):
    """Show a quick descriptive overview of a DataFrame.

    Renders, in order: the first rows, the ``info()`` report (dtypes and
    non-null counts) and ``describe()`` with the 1%, 25%, 50%, 75% and 99%
    percentiles. The three parts are separated by a line of 100 dashes.

    Parameters
    ----------
    final_data : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    separator = '-' * 100

    display(final_data.head())
    print(separator)
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() would render a stray "None" underneath the report.
    final_data.info()
    print(separator)
    display(final_data.describe([0.01, 0.25, 0.50, 0.75, 0.99]))
# Render the descriptive overview for the prepared dataset
summary(final_data)

In [None]:
# Only the LAST expression of a notebook cell is rendered automatically, so
# every check below is printed/displayed explicitly; as bare expressions in
# the middle of the cell they would be evaluated and silently discarded.
# The values in the trailing comments are the ones recorded by the author.

# Check for Missing Values
print('Missing values per column:')
display(final_data.isnull().sum())  # 0 nulls

# Check for the min value of column amount
print('Min amount:', final_data['amount'].min())  # 0.4

# Check for wrong values (amounts that are zero or negative)
print('Rows with amount <= 0:')
display(final_data.loc[final_data['amount'] <= 0])

# Check for the max value of column amount
print('Max amount:', final_data['amount'].max())  # 150.0

# Shape & info of the dataset
print('Shape:', final_data.shape)  # (203389 rows, 7 columns)
final_data.info()

# Unique values for each column
def unique_counts(final_data):
    """Print the number of distinct values found in each column.

    One line is printed per column, in column order, formatted as
    ``<column> :  <count>``.
    """
    for column_name in final_data.columns:
        print(column_name, ": ", final_data[column_name].nunique())
# Print the distinct-value count of every column
unique_counts(final_data)

# Rows that exactly repeat an earlier row (all columns equal)
duplicate = final_data.loc[final_data.duplicated()]

print("Duplicate Rows :")  # 0 duplicates

# Left as the cell's last expression so the (expected empty) frame is rendered
duplicate

**Create Frequency Monetary (FM) Table**

In [32]:
# One row per user_id  #  -> 54496 users
#   Frequency      = number of orders (count of order_id)
#   Monetary_Value = total spend (sum of amount)
Frequency_Monetary_Table = final_data.groupby('user_id').agg(
    Frequency=('order_id', 'count'),
    Monetary_Value=('amount', 'sum'),
)

**Manage Skewness and Scaling**

In [None]:
# Inspect the raw distributions: the next steps aim for variables that are not
# skewed and share the same mean and variance.
# A 2-row grid is used because only two panels are drawn (a 3-row grid left
# the bottom third of the figure empty).
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot; it is kept so the figure matches the rest of the notebook.

# Plot Frequency distribution
sns.distplot(Frequency_Monetary_Table['Frequency'], ax=axes[0])
axes[0].set_title('Frequency distribution')

# Plot Monetary_Value distribution
sns.distplot(Frequency_Monetary_Table['Monetary_Value'], ax=axes[1])
axes[1].set_title('Monetary_Value distribution')

# Show the plot
plt.tight_layout()
plt.show()

In [34]:
# As we can see from above, we have to transform the data, so it has a more symmetrical form 

# There are some methods that we can use to manage the skewness:

# 1. log transformation
# 2. square root transformation
# 3. Box-Cox transformation -> Note: it can only be applied when every value of the variable is strictly positive.

def analyze_skewness(x, data=None):
    """Plot and report how three transforms change the skewness of a column.

    Draws a 2x2 grid with the distribution of the original values and of
    their log, square-root and Box-Cox transforms, then prints the skew
    coefficient before and after each transform (rounded to 2 decimals).

    Parameters
    ----------
    x : str
        Name of the column to analyse.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level
        ``Frequency_Monetary_Table`` so existing calls keep working.

    Raises
    ------
    ValueError
        If the column contains zero or negative values: the log would produce
        -inf/NaN and Box-Cox is only defined for positive data.
    """
    frame = Frequency_Monetary_Table if data is None else data
    original = frame[x]

    if (original <= 0).any():
        raise ValueError(
            "Column '{}' must contain only positive values for the log and "
            "Box-Cox transforms".format(x)
        )

    # Compute every transform exactly once and reuse it for both the plots and
    # the report; Box-Cox in particular fits its lambda by maximising a
    # log-likelihood, so repeating the call repeats that optimisation.
    log_values = np.log(original)
    sqrt_values = np.sqrt(original)
    boxcox_values = stats.boxcox(original)[0]

    fig, ax = plt.subplots(2, 2, figsize=(8,8))
    sns.distplot(original, ax=ax[0,0])
    ax[0,0].set_title('Original')
    sns.distplot(log_values, ax=ax[0,1])
    ax[0,1].set_title('Log')
    sns.distplot(sqrt_values, ax=ax[1,0])
    ax[1,0].set_title('Square root')
    sns.distplot(boxcox_values, ax=ax[1,1])
    ax[1,1].set_title('Box-Cox')
    plt.tight_layout()
    plt.show()

    original_skew = np.round(original.skew(), 2)
    log_skew = np.round(log_values.skew(), 2)
    sqrt_skew = np.round(sqrt_values.skew(), 2)
    # boxcox returns a plain array, so wrap it to reuse pandas' skew estimator
    boxcox_skew = np.round(pd.Series(boxcox_values).skew(), 2)

    print('Log Transform : The skew coefficient of', original_skew, 'to', log_skew)
    print('Square Root Transform : The skew coefficient of', original_skew, 'to', sqrt_skew)
    print('Box-Cox Transform : The skew coefficient of', original_skew, 'to', boxcox_skew)

In [None]:
# Compare the log, square-root and Box-Cox transforms on the Frequency column
analyze_skewness('Frequency')

In [None]:
# Compare the log, square-root and Box-Cox transforms on the Monetary_Value column
analyze_skewness('Monetary_Value')

In [None]:
# Box-Cox-transformed copies of both variables ([0] is the transformed array).
# The new frame gets a fresh positional index; its rows stay in the same order
# as Frequency_Monetary_Table.
users_fix = pd.DataFrame({
    "Frequency": stats.boxcox(Frequency_Monetary_Table['Frequency'])[0],
    "Monetary_Value": stats.boxcox(Frequency_Monetary_Table['Monetary_Value'])[0],
})
users_fix

In [None]:
# The two variables do not share the same mean and variance, so standardize them
scaler = StandardScaler()

# Fit the scaler and transform the data in a single step
users_normalized = scaler.fit_transform(users_fix)

# Check the result: per-column mean should print as 0 and standard deviation as 1
print(users_normalized.mean(axis=0).round(2))
print(users_normalized.std(axis=0).round(2))

**K-MEANS Clustering**

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the SSE (inertia) of each fit
plt.figure(figsize=(12,8))

K = range(1, 11)
errors = []

for k in K:
    # A fixed random_state (instead of None) makes the results reproducible
    kmeans = KMeans(n_clusters=k, random_state=42).fit(users_normalized)
    errors.append(kmeans.inertia_)

plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
sns.pointplot(x=list(K), y=errors)
plt.show()

In [None]:
# Final model: 2 clusters, k-means++ initialisation, fixed seed
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42).fit(users_normalized)

# The model is evaluated with the silhouette score (silhouette coefficient),
# a measure of how good a clustering is. It ranges from -1 to 1:
#
#    1 -> clusters are well apart from each other and clearly distinguished
#    0 -> clusters are indifferent: the distance between them is not significant
#   -1 -> points have been assigned to the wrong clusters
#
# The higher the score, the better the model.
print(silhouette_score(users_normalized, labels=kmeans.labels_, metric='euclidean'))

# Recorded silhouette scores:
#   2 clusters --> 0.5706358459718361
#   3 clusters --> 0.5319818057921492

In [41]:
# Assign clusters to Frequency_Monetary_Table
# The labels are assigned by position, not by user_id: this relies on the rows
# fed to kmeans.fit (users_normalized, built from users_fix) still being in the
# same order as the rows of Frequency_Monetary_Table.
Frequency_Monetary_Table['Cluster'] = kmeans.labels_

**Interpret Cluster Result**

In [42]:
# Create a column with a readable name for each cluster label
# (cluster 0 -> 'Group B', cluster 1 -> 'Group A'; any other label stays NaN).
# NOTE(review): the label numbers come from the fitted model - re-check this
# mapping whenever the clustering is re-run on different data or settings.
Frequency_Monetary_Table['Customer Segment'] = Frequency_Monetary_Table['Cluster'].map(
    {0: 'Group B', 1: 'Group A'}
)

In [None]:
# Columns inspected in the plots below
list1 = ['Frequency', 'Monetary_Value']

# One boxplot per column, each preceded by the column name
for column in list1:
    print(f'{column}: ')
    ax = sns.boxplot(x=Frequency_Monetary_Table[column])
    plt.show()

In [44]:
# Per customer segment: mean Frequency, mean Monetary_Value and number of users,
# rounded to one decimal
MMM = (
    Frequency_Monetary_Table
    .groupby('Customer Segment')
    .agg({'Frequency': 'mean', 'Monetary_Value': ['mean', 'count']})
    .round(1)
)

In [45]:
# Per cluster label: median Frequency, median Monetary_Value and number of users,
# rounded to one decimal
# NOTE(review): grouped by 'Cluster' while MMM groups by 'Customer Segment' - confirm this is intended.
MMM_median = (
    Frequency_Monetary_Table
    .groupby('Cluster')
    .agg({'Frequency': 'median', 'Monetary_Value': ['median', 'count']})
    .round(1)
)

In [None]:
# Bar chart of the per-segment mean of each metric, to compare the segments' traits
avg_df = Frequency_Monetary_Table.groupby(['Customer Segment'], as_index=False).mean()

for metric in list1:
    sns.set_style(style='whitegrid')
    sns.barplot(
        x='Customer Segment',
        y=metric,
        data=avg_df,
        palette=['royalblue', 'red'],
    )
    plt.show()