# Product Clustering for Recommendation based on Customer Insights, Profiling and Customer Satisfaction

## Data Preprocessing and Feature Extraction for Clustering Products

In [None]:
%%bash
# Install the `stemming` package, which provides the Porter2 stemmer (`stemming.porter2.stem`) imported by this notebook.
pip install stemming

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import string
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from stemming.porter2 import stem
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Open the file to read Amazon Product Metadata

In [None]:
# Open the raw Amazon metadata dump as text; bytes that are not valid UTF-8 are dropped (errors='ignore').
# NOTE(review): the file handle is bound to `df`, a name that is rebound to a DataFrame further down.
df = open ('/content/drive/MyDrive/BAJAJ/GNN_datasets/amazon-meta.txt', 'r', encoding='utf-8', errors= 'ignore')

### Initialize a nested product dictionary that will hold cleaned up amazon product data. 

In [None]:
# Maps ASIN -> dict of cleaned per-product fields; populated by the parsing loop.
amazonProducts= {}

### Read the data from the Amazon meta information file and fill the nested `amazonProducts` dictionary with product features and features extracted from user reviews

In [None]:
import itertools

# Parse the metadata dump line by line into `amazonProducts` (one entry per ASIN).
# A record's fields are collected in the module-level variables below and written
# out whenever a blank line (the record separator) is reached.

# Hoisted out of the loop: both are identical for every product, so building them
# once avoids re-reading the stop-word list and recompiling the regex per record.
_STOPWORDS = frozenset(stopwords.words("english"))
_DIGITS_AND_PUNCT = re.compile('[%s]' % re.escape(string.digits+string.punctuation))

# Blank state for one record. Copurchased, DegreeCentrality and ClusteringCoeff are
# not filled in by this cell; they are kept so the module-level names stay defined.
_EMPTY_RECORD = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0, 0, 0)

(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff,helpf,votes) = _EMPTY_RECORD
try:
    # The extra "" acts as a final record separator, so the last product is stored
    # even when the file does not end with a blank line.
    for line in itertools.chain(df, [""]):
        line = line.strip()
        if line.startswith("Id"):
            Id = line[3:].strip()
            counter = 0
        elif line.startswith("ASIN"):
            ASIN = line[5:].strip()
        elif line.startswith("group"):
            Group = line[6:].strip()
        elif line.startswith("title"):
            # Collapse runs of whitespace inside the title to single spaces.
            Title = ' '.join(line[6:].split())
        elif line.startswith("salesrank"):
            SalesRank = line[10:].strip()
        elif line.startswith("reviews"):
            ls = line.split()
            TotalReviews = ls[2].strip()
            AvgRating = ls[7].strip()
        elif line.startswith(("19", "20")):
            # Individual review line (presumably starting with the review year):
            # accumulate the total and helpful vote counts for the current product.
            ls = line.split()
            votes = votes + int(ls[6].strip())
            helpf = helpf + int(ls[8].strip())
        elif line.startswith("categories"):
            # The number after "categories" says how many category lines follow;
            # consume them here so the outer loop never sees them.
            ls = line.split()
            raw_categories = ' '.join(df.readline().lower() for _ in range(int(ls[1].strip())))
            category_words = set(_DIGITS_AND_PUNCT.sub(' ', raw_categories).split()) - _STOPWORDS
            Categories = ' '.join(stem(word) for word in category_words)
        elif line == "":
            # Record separator: store the product only once every field has been
            # converted, so a failed conversion cannot leave a half-filled entry.
            if ASIN != "":
                amazonProducts[ASIN] = {
                    'Id': Id,
                    'Title': Title,
                    # Different words can share a stem; de-duplicate and sort so the
                    # stored string is reproducible from run to run.
                    'Categories': ' '.join(sorted(set(Categories.split()))),
                    'Group': Group,
                    'SalesRank': int(SalesRank),
                    'TotalReviews': int(TotalReviews),
                    'AvgRating': float(AvgRating),
                    'Helpful': int(helpf),
                    'Votes': int(votes),
                }
            (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff,helpf,votes) = _EMPTY_RECORD
finally:
    # Actually close the file (the bare attribute access `df.close` never did).
    df.close()

### Converting Dictionary to DataFrame

In [None]:
# from_dict puts each ASIN (outer key) in its own column; transposing turns that
# into one row per product with the metadata fields as columns.
df = pd.DataFrame.from_dict(amazonProducts).transpose()

#### Unique Groups of Products Available

In [None]:
# Distinct product group names present in the data.
df['Group'].unique()

#### Checking for null data items

In [None]:
# Number of missing values in each column.
df.isna().sum()

#### Checking Distribution of Various Groups of Products

In [None]:
# Count how many products fall into each group.
df['Group'].value_counts()

#### **From the counts above we can conclude that product groups other than Book, Music, Video and DVD are insignificant in number, so we ignore them**

In [None]:
# Keep only the four dominant product groups. `.copy()` makes the result an
# independent frame, so adding columns to `df` afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['Group'].isin(['Book', 'Music', 'Video', 'DVD'])].copy()

#### **Finding Number of Non-Helpful Votes**

In [None]:
# Votes that were cast but not marked helpful: total votes minus helpful votes.
df['NonHelpfulVotes'] = df['Votes'].sub(df['Helpful'])

In [None]:
# The ASIN is currently the index; expose it as a regular column so it survives re-indexing.
df['ASIN'] = df.index
# Re-index by product Id and coerce the helpful-vote counts to integers in one chained step.
df1 = df.set_index('Id').astype({'Helpful': int})

#### Checking the distribution of helpful votes to decide how many bins to group products into

In [None]:
# Summary statistics of the helpful-vote counts, used to choose the bin edges for the clusters.
df1['Helpful'].describe()

In [None]:
# Plot 4-bin histograms of df1's columns (DataFrame.hist only draws numeric columns).
df1.hist(bins=4)

In [None]:
# Move 'Id' back from the index into a regular column, leaving a default integer index.
df1=df1.reset_index()

#### Summary statistics of helpful votes over consecutive samples of 100000 datapoints, to compare the distribution across samples

In [None]:
# Describe 'Helpful' over consecutive 100,000-row windows to see how its
# distribution varies across the dataset.
sample_rows = 100000
last_sample_start = 400000
for sample_start in range(0, last_sample_start, sample_rows):
    print(df1[sample_start:sample_start + sample_rows]['Helpful'].describe())

# Everything from row 400000 onwards forms the final sample. It is copied so the
# dtype conversion below writes to an independent frame rather than a slice of df1.
df2 = df1[last_sample_start:].copy()
print(df2['Helpful'].describe())
df2['AvgRating'] = df2['AvgRating'].astype(float)
print(df2.groupby('AvgRating').count())
# 'label_code' is only created by a later cell; skip this breakdown instead of
# raising KeyError when the notebook is run top to bottom for the first time.
if 'label_code' in df2.columns:
    print(df2.groupby('label_code').count())

#### **Categorizing products into 3 customer-insight buckets, based on the distribution of helpful votes**

In [None]:
# Bucket products by helpful-vote count: (-1, 8] -> '0', (8, 35] -> '1', above 35 -> '2'.
# The top bin is open-ended, so counts larger than the previously hard-coded 27355
# still receive a label instead of NaN.
df1['label_code'] = pd.cut(x=df1['Helpful'], bins=[-1, 8, 35, np.inf], labels=['0', '1', '2'])

#### One Hot Encoding the Groups of Products

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the product group into one indicator column per group.
encoder = OneHotEncoder(handle_unknown='ignore')
group_indicators = encoder.fit_transform(df1[['Group']]).toarray()
# `join` aligns on index labels, so give the encoded frame df1's own index; the
# rows then line up even if df1 does not carry a default 0..n-1 index.
encoder_df = pd.DataFrame(group_indicators, index=df1.index)
df1 = df1.join(encoder_df)

In [None]:
# The raw 'Group' column is no longer needed once it has been one-hot encoded.
df1 = df1.drop(columns='Group')

In [None]:
# pd.cut assigned the string labels '0', '1', '2'; convert them to integers.
df1['label_code']=df1['label_code'].astype(int)

### Saving the dataframe for Future Use 

In [None]:
# Persist the processed frame to Drive as CSV (to_csv also writes the index as the first column by default).
df1.to_csv("/content/drive/MyDrive/BAJAJ/GNN_datasets/customer_insights_recommendations.csv")

## Recommendation based on Customer Insights Cluster and Category Similarity

### To compare two products, we compute the Jaccard similarity between the sets of category words they belong to

In [None]:
def jaccard(a, b):
    """Return the Jaccard similarity between two collections of words.

    Each argument may be a whitespace-separated string or an iterable of
    words (set, frozenset, list, tuple). Any other value is converted with
    ``str()`` and split on whitespace.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, or ``0.0`` when both inputs
        are empty (instead of dividing by zero).
    """
    def _as_word_set(value):
        # Strings are tokenised; containers are taken as already tokenised.
        if isinstance(value, str):
            return set(value.split())
        if isinstance(value, (set, frozenset, list, tuple)):
            return set(value)
        return set(str(value).split())

    words_a = _as_word_set(a)
    words_b = _as_word_set(b)
    union_size = len(words_a | words_b)
    if union_size == 0:
        return 0.0
    return len(words_a & words_b) / union_size

In [None]:
def prod_label_recomm(x_counter, top_n=5, min_rating=4.5):
    """Recommend products from the same customer-insight cluster as a product.

    Candidates are the rows of the module-level ``df1`` that share the query
    product's ``label_code`` and have ``AvgRating >= min_rating``. They are
    ranked by the Jaccard similarity between their category words and those
    of the query product.

    Args:
        x_counter: Identifier of the query product. It is matched against the
            'Id' column first and, if nothing matches, against 'ASIN'.
        top_n: Maximum number of recommendations to return.
        min_rating: Minimum average rating a candidate must have.

    Returns:
        Up to ``top_n`` identifiers, most similar first, taken from the same
        column ('Id' or 'ASIN') that matched ``x_counter``.

    Raises:
        KeyError: If no product matches ``x_counter``.
    """
    # Locate the query product, accepting either kind of identifier.
    id_column = 'Id'
    query = df1.loc[df1['Id'] == x_counter]
    if query.empty and 'ASIN' in df1.columns:
        id_column = 'ASIN'
        query = df1.loc[df1['ASIN'] == x_counter]
    if query.empty:
        raise KeyError(f"no product with Id or ASIN {x_counter!r}")

    # Read scalar values from the matched row; str() of a whole Series would
    # also tokenise its index, name and dtype footer.
    product = query.iloc[0]
    label = product['label_code']
    query_words = set(str(product['Categories']).split())

    # Exclude the query product explicitly rather than assuming it ranks first:
    # it is absent from the candidates whenever its own rating is below min_rating.
    same_cluster = df1['label_code'] == label
    well_rated = df1['AvgRating'] >= min_rating
    not_query = df1[id_column] != x_counter
    candidates = df1.loc[same_cluster & well_rated & not_query]

    scores = candidates['Categories'].apply(lambda cats: jaccard(cats, query_words))
    # assign() returns a new frame, so no column is written onto a slice of df1.
    ranked = candidates.assign(score_cat_inter=scores).sort_values(
        'score_cat_inter', ascending=False, kind='stable')
    return ranked[id_column].head(top_n).tolist()


### Testing the recommendations for a given product

In [None]:
# Fetch recommendations for a single product.
# NOTE(review): the argument looks like an ASIN — confirm it matches the column prod_label_recomm looks up.
X = prod_label_recomm('B00000AU3R')