<div style="text-align:center">
    <img src="../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 12: Market basket analysis project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Load the customer records from the CSV file (path is relative to the working directory).
csv_path = "Mall_Customers.csv"
df = pd.read_csv(csv_path)

#### EDA

In [None]:
# Peek at the first rows to see the column names and some sample values.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count overview.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

#### Storytelling - Visualization

Gender distribution

In [None]:
# Share of customers per gender, drawn as a pie chart.
#
# value_counts() orders categories by frequency, so the wedge labels are taken
# from its index instead of a hard-coded ["Female", "Male"] list; a fixed list
# silently mislabels the wedges whenever the frequency order differs.
size = df["Gender"].value_counts()
labels = size.index.tolist()
colors = ["mediumorchid", "cyan"]
explode = [0, 0.1]  # pull the second wedge slightly out of the pie

plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Gender', fontsize=20)
plt.axis('off')
plt.legend()
plt.show()

Age distribution

In [None]:
# Number of customers at each age.
plt.rcParams['figure.figsize'] = (25, 8)
# Name the column through x=/data= rather than passing the Series positionally:
# recent seaborn releases interpret the first positional argument as `data`,
# and the resulting warning is hidden by the global warnings filter above.
sns.countplot(x='Age', data=df, palette='hls')
plt.show()

Spending Score distribution 

In [None]:
# Number of customers at each spending-score value.
plt.rcParams['figure.figsize'] = (35, 14)
# Name the column through x=/data= rather than passing the Series positionally:
# recent seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='Spending Score (1-100)', data=df, palette='magma')
# Rotate the tick labels only after the bars exist, so the rotation is applied
# to the labels seaborn actually drew.
plt.xticks(rotation=90)
plt.show()

Age vs Annual Income

In [None]:
# Annual income per age, on a wide figure with vertical tick labels.
plt.figure(figsize=(25, 10))
plt.xticks(rotation=90)
sns.barplot(
    data=df,
    x='Age',
    y='Annual Income (k$)',
    palette='rocket',
);

Age vs Spending Score

In [None]:
# Spending score per age.
# The plotting context has to be set BEFORE drawing: it only changes the
# defaults used by figures created afterwards, so setting it last left this
# cell's own figure dependent on whatever an earlier cell had configured.
sns.set_context("poster")
ax = sns.barplot(x="Age", y="Spending Score (1-100)", data=df, palette="flare")

Gender vs Spending Score

In [None]:
# Spending score per gender.
# Set the plotting context before drawing so this cell renders the same way
# regardless of which cells ran before it.
sns.set_context("poster")
ax = sns.barplot(x="Gender", y="Spending Score (1-100)", data=df, palette="flare")

Gender vs Annual Income

In [None]:
# Annual income per gender.
# Set the plotting context before drawing so this cell renders the same way
# regardless of which cells ran before it.
sns.set_context("poster")
ax = sns.barplot(x="Gender", y="Annual Income (k$)", data=df, palette="flare")

Annual Income vs Spending Score

In [None]:
# Spending score per annual-income value.
# Figure size and plotting context are configured first: both only influence
# figures created afterwards, so setting them after the barplot (as before)
# never affected this plot. sns.set() also resets the context, hence the
# set_context call has to follow it.
sns.set(rc={'figure.figsize': (30, 7)})
sns.set_context("poster")
ax = sns.barplot(x="Annual Income (k$)", y="Spending Score (1-100)", data=df, palette="hls")
# Many distinct income values on the x axis: turn the labels vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);

Annual Income vs Age vs Spending Score

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

# One coordinate per feature: income on x, age on y, spending score on z.
x = df['Annual Income (k$)']
y = df['Age']
z = df['Spending Score (1-100)']

# Build the 3D axes explicitly and attach them to a fresh square figure.
fig = plt.figure(figsize=(6, 6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# Turn seaborn's 256-step "husl" palette into a matplotlib colormap.
husl_hex = sns.color_palette("husl", 256).as_hex()
cmap = ListedColormap(husl_hex)

# Scatter the customers; each point is coloured by its x value (annual income).
sc = ax.scatter(x, y, z, c=x, cmap=cmap, s=40, marker='o', alpha=1)
ax.set(
    xlabel='Annual Income (k$)',
    ylabel='Age',
    zlabel='Spending Score (1-100)',
)

# Legend built from the scatter's colour levels, anchored outside the axes.
legend_handles, legend_labels = sc.legend_elements()
plt.legend(legend_handles, legend_labels, bbox_to_anchor=(1.05, 1), loc=2)

Correlation Heatmap

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
plt.rcParams['figure.figsize'] = (15, 8)
# Restrict to numeric columns before correlating: the frame also holds the
# string-valued "Gender" column, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions dropped such columns silently).
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='hot', annot=True)
plt.show()

#### Train and test (Clustering)

In [None]:
# Feature matrix for clustering: annual income and spending score.
# Columns are selected by name instead of by position ([3, 4]) so the
# selection keeps working if the CSV's column order changes.
# NOTE(review): assumes positions 3 and 4 were exactly these two columns,
# as the axis labels of the cluster plot suggest - confirm against the CSV.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
x = df[feature_columns].values
print(x.shape)

##### Elbow method

In [None]:
from sklearn.cluster import KMeans

# Candidate numbers of clusters to evaluate: 1 through 10.
cluster_counts = range(1, 11)

# Settings shared by every K-means fit; the fixed random_state keeps runs repeatable.
kmeans_settings = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)

# Inertia (within-cluster sum of squares) of the fitted model for each candidate.
wcss = []
for i in cluster_counts:
    km = KMeans(n_clusters=i, **kmeans_settings).fit(x)
    wcss.append(km.inertia_)

# Inertia against cluster count; the "elbow" of this curve suggests a good k.
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.show()

##### K-means

In [None]:
# Fit the final 5-cluster model and keep the cluster id assigned to every row of x.
model_kn = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_means = model_kn.fit(x).labels_

##### Visualise the clusters

In [None]:
# Scatter plot of the clustered customers plus the cluster centroids.

# Apply the style before anything is drawn; switching styles after the
# scatters were plotted only affected the elements created afterwards.
plt.style.use('fivethirtyeight')

# (colour, legend label) for cluster ids 0..4, in that order.
# NOTE(review): the segment names are tied to the numeric ids produced by the
# fitted model - verify they still describe the right groups if the model,
# data or library version changes.
cluster_styles = [
    ('red', 'miser'),
    ('green', 'general'),
    ('cyan', 'target'),
    ('magenta', 'spendthrift'),
    ('orange', 'careful'),
]
for cluster_id, (color, segment) in enumerate(cluster_styles):
    members = x[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=segment)

centers = model_kn.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=50, c='black', label='centroid')

plt.title('K Means Clustering', fontsize=20)
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles visibility and would
# switch the grid OFF when the active style already enables it.
plt.grid(True)
plt.show()