#Connecting to drive

In [None]:
# Mount Google Drive inside the Colab VM so the project files are reachable.
from google.colab import drive as gdrive

gdrive.mount('/content/drive/')
# Keep the conventional name available for any later cell that refers to it.
drive = gdrive

#Going to correct path

In [None]:
%cd drive
%cd My Drive
%cd Customer_Retail_Project
%ls

#Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder 
from sklearn.compose import ColumnTransformer 
import datetime as dt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

%matplotlib inline



#Converting CSV to DF

In [None]:
# Read the raw transactions; the file is decoded as ISO-8859-1.
csv_name = 'OnlineRetail.csv'
csv_encoding = "ISO-8859-1"
data = pd.read_csv(csv_name, encoding=csv_encoding)
data.head()

# Dropping Null values from the main DF
 

In [None]:
# Drop every row that is missing a product description or a customer id.
# One call with both columns is equivalent to two sequential dropna() calls.
data.dropna(axis=0, subset=['Description', 'CustomerID'], inplace=True)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()
print("----------------------------------------------------------------")
print("Null values in each column")
print(data.isnull().sum())

# Separating categorical and numerical data

In [None]:
# Renumber rows after the dropna step. The old labels are kept as an `index`
# column; the guard makes the cell safe to re-run (a second reset_index would
# fail because `index` already exists).
if 'index' not in data.columns:
    data.reset_index(inplace=True)

# NOTE: the original cell also called data.drop(['index', 'Country'], axis=1)
# but discarded the result, so nothing was ever dropped. The no-op is removed
# rather than "fixed": later cells drop `Country` themselves and select the
# RFM features by column position, so both columns have to stay here.

# `np.object` was removed from NumPy (1.24+); the builtin `object` is the
# same dtype. `.copy()` gives independent frames so the column assignments in
# later cells do not write into a slice of `data`.
is_object_col = data.dtypes == object
cat_data = data.loc[:, is_object_col].copy()
num_data = data.loc[:, ~is_object_col].copy()

#Creating a Pair-Plot 

In [None]:
# Draw seaborn's pair plot over the columns of `data`.
# NOTE(review): this runs on the whole frame, including the `index` column
# added by reset_index — presumably slow on the full table; consider sampling.
sns.pairplot(data)

# Correlation Matrix along with a heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
print("correlation matrix")
plt.figure(figsize=(20,6))
corr_matrix = num_data.corr()
axis_labels = num_data.columns
sns.heatmap(
    corr_matrix,
    annot=True,
    square=True,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)

# List and number of unique values

In [None]:
# For each categorical column, show its distinct values and how many there are.
for col_name, column in cat_data.items():
    distinct = column.unique()
    print("column name > ", col_name)
    print(distinct)
    print("Number of unique values > ", len(distinct))
    print("--------------------------------")

# Converting InvoiceNo column

In [None]:
# Make InvoiceNo numeric. Every 'C' in an invoice number is replaced by '3'
# and every 'A' by '1', so the strings can be cast to float. Per the original
# author's note, '3' was chosen because no plain invoice number starts with 3.
invoice_text = cat_data['InvoiceNo'].str.replace('C','3').str.replace('A','1')
cat_data['InvoiceNo'] = invoice_text
print(list(invoice_text.unique()))

cat_data["InvoiceNo"] = invoice_text.astype(np.float64)
invoice_numeric = cat_data["InvoiceNo"]
print(invoice_numeric.isnull().sum(), invoice_numeric.dtype)




# Converting InvoiceDate

In [None]:
# Parse the invoice timestamps so date arithmetic works on them later.
invoice_dates = pd.to_datetime(cat_data["InvoiceDate"])
cat_data["InvoiceDate"] = invoice_dates
cat_data.info()

# merging cat_data and num_data

In [None]:
# Put the converted categorical columns and the numeric columns side by side.
frames_to_join = [cat_data, num_data]
new_data = pd.concat(frames_to_join, axis="columns")
print(new_data.head())
print(new_data.columns)

# Recency, Frequency and Monetary column generation

In [None]:
# Reference date used to measure recency.
NOW = dt.datetime(2011,12,10)

# Monetary value is what the customer actually spent, i.e. quantity x unit
# price per line. Summing UnitPrice alone (as before) ignores how many units
# were bought. Assumes the standard Online Retail `Quantity` column — TODO
# confirm against the CSV header.
line_items = new_data.assign(TotalPrice=new_data['Quantity'] * new_data['UnitPrice'])

# One row per customer, in this column order (the next cell renames by position):
#   InvoiceDate -> days between NOW and the customer's latest invoice (Recency)
#   InvoiceNo   -> number of invoice lines for the customer (Frequency)
#   TotalPrice  -> total spend (Monetary)
rfmTable = line_items.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (NOW - x.max()).days,
    'InvoiceNo': 'size',
    'TotalPrice': 'sum',
})
print(rfmTable.head(),rfmTable.shape)

In [None]:
# Turn the CustomerID index back into a column and give the aggregates
# their RFM names.
rfm_column_names = ['CustomerID','Recency', 'Frequency', 'Monetary']
rfmTable = rfmTable.reset_index()
rfmTable.columns = rfm_column_names
print(rfmTable.head())

# Converting the new_data df to match rfmTable

In [None]:
# Keep the first transaction row of every customer, ordered by CustomerID,
# so the frame has one row per customer like rfmTable.
left = (
    new_data
    .drop_duplicates(subset=['CustomerID'])
    .sort_values(by='CustomerID')
)
left.head()

#Generating the final dataframe for modelling

In [None]:
# Attach the RFM aggregates to the per-customer rows.
print(left.shape,rfmTable.shape)
merged = pd.merge(left=left, right=rfmTable, how='inner', on='CustomerID')

# Free-text and country columns are not used for clustering. A non-inplace
# drop keeps `merged` intact, so the cell gives the same result when re-run.
final_df = merged.drop(labels=['Description','Country'], axis=1)

print(final_df.shape)

print(final_df.head())
# info() prints its own report and returns None; print(final_df.info())
# therefore emitted an extra "None" line.
final_df.info()

In [None]:
# Rich-display preview of the first rows of the merged modelling frame.
final_df.head()

# Train Test split

In [None]:
# 80/20 split by row position. The rows follow the order of the merged frame;
# no shuffling is applied.
divide = int(final_df.shape[0]*0.8)

# Select the model inputs by name instead of the positional slice `7:`,
# which silently picks other columns if the frame layout ever changes.
rfm_features = final_df[['Recency', 'Frequency', 'Monetary']]

# Train = rows [0, divide), test = rows [divide, end). The previous
# `(divide+1):` start dropped row `divide` from both sets.
# .copy() makes the sets independent frames so a `Group` column can be added
# later without writing into a slice of final_df.
X_train = rfm_features.iloc[:divide].copy()
X_test = rfm_features.iloc[divide:].copy()

In [None]:
# Elbow curve: within-cluster sum of squared distances (inertia) for each k.

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8,9,10,11,12]

# Fit on the three RFM columns only, so a `Group` label column added to
# X_train by a later cell cannot leak into the features when this is re-run.
elbow_features = X_train[['Recency', 'Frequency', 'Monetary']]

for num_clusters in range_n_clusters:
    # Fixed seed: KMeans initialisation is random, and without it the curve
    # changes between runs.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(elbow_features)

    ssd.append(kmeans.inertia_)

# Plot against the actual k values. plt.plot(ssd) alone put list positions
# 0..10 on the x-axis, so the elbow was read two clusters too low.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xticks(range_n_clusters)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Sum of squared distances (inertia)")
plt.title("Elbow curve")
plt.show()

# Selecting clusters = 5 from the graph above

In [None]:
# Final model with k = 5.
# `algorithm='auto'` is omitted: it was the default in older scikit-learn,
# was deprecated in 1.1 and is rejected by later releases.
# random_state pins the random initialisation so group numbers are repeatable.
kmeans = KMeans(n_clusters=5, max_iter=50, random_state=42)

# Fit on the RFM columns only; fitting on the whole of X_train would include
# the `Group` column whenever this cell is run a second time.
kmeans.fit(X_train[['Recency', 'Frequency', 'Monetary']])

# assign() returns a new frame, so no write goes into a slice of another frame.
X_train = X_train.assign(Group=kmeans.labels_)

#Silhouette score calculation


In [None]:
# Mean silhouette coefficient of the training clustering: the first three
# columns are the inputs, the last column holds the assigned group.
train_inputs = X_train.iloc[:, 0:3]
train_groups = X_train.iloc[:, -1]
silhouette_avg = silhouette_score(train_inputs, train_groups)
print("for n = {0} , silhouette score is {1}" .format(5,silhouette_avg))

# Box plots to visualize Group Distribution


In [None]:
# One box plot per RFM measure, split by cluster.
box_panels = [
    ('Recency', "Group Vs Recent Purchases"),
    ('Frequency', "Group Vs Frequency of Purchases"),
    ('Monetary', "Group Vs Spending of total purchases"),
]
for measure, heading in box_panels:
    plt.figure(figsize=(20,6))
    sns.boxplot(x='Group', y=measure, data=X_train)
    plt.title(heading)
    plt.show()

#Strip  plots to visualize Total distribution

In [None]:
# One strip plot per RFM measure, showing every customer within its cluster.
strip_panels = [
    ('Recency', "Group Vs Recent Purchases"),
    ('Frequency', "Group Vs Frequency of Purchases"),
    ('Monetary', "Group Vs Spending of total purchases"),
]
for measure, heading in strip_panels:
    plt.figure(figsize=(20,6))
    sns.stripplot(x='Group', y=measure, data=X_train)
    plt.title(heading)
    plt.show()

#Predicting for the test data

In [None]:
# Assign each held-out customer to the nearest fitted cluster.
# Only the RFM columns are passed to predict(): once `Group` exists on X_test,
# handing over the whole frame would fail on the extra column when re-run.
test_groups = kmeans.predict(X_test[['Recency', 'Frequency', 'Monetary']])
# assign() builds a new frame instead of writing into a slice of another frame.
X_test = X_test.assign(Group=test_groups)

In [None]:
# Preview the first rows of X_test, which now include the predicted `Group`.
X_test.head()

#Inferences 

---



1. Group 0 can be ignored, as its customers don't add much monetary value or frequency.
2. Group 1 has made most of its purchases recently, but its overall frequency is still low.
3. Group 2 buys frequently and in volume, with average monetary value.
4. Group 3 has not bought anything recently, but its customers purchase frequently and contribute to the monetary value.
5. Group 4 has made frequent, high-value purchases recently.
6. More focus should be given to the customers in groups 2 and 4.


