# Customer Review Analysis for a Leading Women's Clothing E-Commerce Company

#

### Importing packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_classif
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from wordcloud import STOPWORDS, WordCloud
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from textblob import TextBlob
import re as re
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import PorterStemmer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models


In [None]:
#pip install --upgrade openpyxl
#!pip install --upgrade bottleneck
#!pip install pyarrow

In [None]:
clothing = pd.read_excel("C:/Users/navee/OneDrive/Desktop/Data_Science_360/Case_studies/Completed/Machine_learning/NLP_E_Commerce_Platform_case_study/Womens_Clothing_Reviews_Data.xlsx")


clothing.head()

# 

# 1. Performing exploratory analysis on the data to understand the patterns

# 

In [None]:
# Getting data information to understand the data

clothing.info()

'''

Column names in the data do not follow a consistent naming convention (some contain spaces). We need to correct this.

Columns "Category", "SubCategory2" and "Subcategory1" have some missing values. 
Records with missing values in these columns can be dropped for the analysis, as we still have a large amount of data.


Columns "Review Title" and "Review Text" also have some missing data.
We need to drop the records that are null in both "Review Title" and "Review Text".
Then we need to merge these two columns to create a new column "Review".
After that we can drop the two original columns ("Review Title" and "Review Text").

We need to drop duplicate records if any are present.

'''

In [None]:
#### Checking for duplicate records (counting fully duplicated rows)


sum(clothing.duplicated())

# No duplicate record found

In [None]:
#### Correcting the columns names

clothing.columns = clothing.columns.str.replace(' ', '_')

clothing.columns

In [None]:
#### Dropping nulls in the columns "Category", "SubCategory2" and "SubCategory1"


clothing.dropna(subset=["Category", "SubCategory2", "Subcategory1"], how='any', inplace=True)

In [None]:
#### Dropping rows where both 'Review_Title' and 'Review_Text' are null.


clothing.dropna(subset=['Review_Title', 'Review_Text'], how='all', inplace=True)

In [None]:
#### Merging the data in the columns 'Review_Title' and 'Review_Text' to create a new column named "Review"

clothing["Review"] = clothing['Review_Title'].fillna("") + " " + clothing['Review_Text'].fillna("")


clothing.head()

In [None]:
#### Removing 'Review_Title' and 'Review_Text': their content now lives in the merged "Review" column

merged_source_columns = ["Review_Title", "Review_Text"]

# `columns=` already says what to drop, so no axis argument is needed.
clothing.drop(columns=merged_source_columns, inplace=True)


clothing.head()

In [None]:
clothing.nunique()


#### We don't need column "Product_ID" as cardinality is very high 

#### We will treat column "Customer_Age" as continous variable as this column have higher cardinality

In [None]:
#### Dropping column "Product_ID"


clothing.drop(columns="Product_ID",inplace=True)

In [None]:
#### Creating two columns "cust_age_range" and "cust_age_category" to create age categories (deciles) for the customers


clothing["cust_age_range"] = pd.qcut(clothing.Customer_Age, 10)
clothing["cust_age_category"] = pd.qcut(clothing.Customer_Age, 10, labels=range(1,11))


clothing.head()

In [None]:
# Total number of avilable sub-categories

print("Total number of sub categories : ", clothing[['Category', "Subcategory1", 
                                                     "SubCategory2"]].drop_duplicates()["SubCategory2"].count())


clothing[['Category', "Subcategory1", "SubCategory2"]].sort_values(by=['Category', "Subcategory1", 
                                                                       "SubCategory2"]).drop_duplicates()

In [None]:
#UDF  
def continuous_var_summary(x):
    """Profile a numeric pandas Series as a single labelled Series.

    Parameters
    ----------
    x : pandas.Series
        The column to summarise.

    Returns
    -------
    pandas.Series
        Entries, in order: dtype, cardinality, total / non-null / missing
        counts, percentage missing, sum, mean, std, var, the lower and upper
        1.5*IQR fences, and min, selected percentiles (p1..p99) and max.
    """
    # Row count and missing values
    total_rows = x.shape[0]
    missing_rows = x.isna().sum()
    missing_pct = missing_rows * 100 / total_rows

    # Outlier fences from the inter-quartile range
    p25 = x.quantile(0.25)
    p75 = x.quantile(0.75)
    spread = p75 - p25
    lower_fence = p25 - 1.5 * spread
    upper_fence = p75 + 1.5 * spread

    # (label, value) pairs in the order they appear in the result
    entries = [
        ('dtype', x.dtype),
        ('cardinality', x.nunique()),
        ('n_tot', total_rows),
        ('n', x.count()),
        ('nmiss', missing_rows),
        ('perc_miss', missing_pct),
        ('sum', x.sum()),
        ('mean', x.mean()),
        ('std', x.std()),
        ('var', x.var()),
        ('lc_iqr', lower_fence),
        ('uc_iqr', upper_fence),
        ('min', x.min()),
        ('p1', x.quantile(0.01)),
        ('p5', x.quantile(0.05)),
        ('p10', x.quantile(0.10)),
        ('p25', p25),
        ('p50', x.quantile(0.5)),
        ('p75', p75),
        ('p90', x.quantile(0.90)),
        ('p95', x.quantile(0.95)),
        ('p99', x.quantile(0.99)),
        ('max', x.max()),
    ]

    labels = [label for label, _ in entries]
    values = [value for _, value in entries]
    return pd.Series(values, index=labels)

In [None]:
#### For continous variable

# As we are treating column "Customer_Age" as continous variable


continuous_var_summary(clothing.loc[:,'Customer_Age'])

In [None]:
#### For categorical variable

# Cast the two coded columns to object so that describe() profiles them as
# categories (count / unique / top / freq) instead of as numbers.
# astype() with a column mapping is used rather than assigning through
# .loc[:, cols]: on pandas >= 2.0 a .loc assignment writes into the existing
# columns in place and can leave their numeric dtype unchanged.
clothing = clothing.astype({'Rating': 'object', 'Recommend_Flag': 'object'})



clothing[['Category', 'Subcategory1', 'SubCategory2', 'Location', 
          'Channel', 'Review', 'Rating', 'Recommend_Flag']].describe()


'''

Some of the reviews are duplicates.

Most of the ratings are 5 stars.

Website is the most used channel for the shopping.

Gurgaon is the location where most of the sales are taking place.

'''

In [None]:
#### Duplicate reviews


clothing[clothing.duplicated(subset=['Review'], keep=False)].sort_values(by='Review')

In [None]:
#### Displaying the Count of 'Recommend_Flag' Values


chart_Recommend_Flag = pd.DataFrame(clothing.Recommend_Flag.value_counts()).reset_index()
chart_Recommend_Flag.columns = ['Recommend_Flag', 'count_of_Recommend_Flag']

chart_Recommend_Flag

In [None]:
# Visualization 

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Recommend_Flag.count_of_Recommend_Flag, x=chart_Recommend_Flag.Recommend_Flag, 
            data=chart_Recommend_Flag, palette='gist_earth', width=0.2)


plt.xlabel("Recommend Flag", fontsize=12)
plt.ylabel("Count of Recommend Flag ", fontsize=12)
plt.title("Count of Recommend Flag Values", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Displaying the Count of 'Rating' Values


chart_Rating = pd.DataFrame(clothing.Rating.value_counts()).reset_index()
chart_Rating.columns = ['Rating', 'count_of_Rating' ]
chart_Rating

In [None]:
# Visualization 

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Rating.count_of_Rating, x=chart_Rating.Rating, 
            data=chart_Rating, palette='gist_earth', width=0.3)


plt.xlabel("Rating", fontsize=12)
plt.ylabel("Count of Rating ", fontsize=12)
plt.title("Count of Rating Values", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Displaying the Count of 'Channel' Values


chart_Channel = pd.DataFrame(clothing.Channel.value_counts()).reset_index()
chart_Channel.columns  = ['Channel', 'count_of_Channel']
chart_Channel

In [None]:
# Visualization 

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Channel.count_of_Channel, x=chart_Channel.Channel, 
            data=chart_Channel, palette='gist_earth', width=0.2)


plt.xlabel("Channel", fontsize=12)
plt.ylabel("Count of Channel ", fontsize=12)
plt.title("Count of Channel Values", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Displaying the Count of 'Location' Values


chart_Location = pd.DataFrame(clothing.Location.value_counts()).reset_index()
chart_Location.columns = ['Location', 'count_of_Location']
chart_Location

In [None]:
# Visualization: number of reviews per location

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Location.count_of_Location, x=chart_Location.Location, 
            data=chart_Location, palette='gist_earth', width=0.3)


# Labels name the plotted column ("Location"), not the helper DataFrame variable.
plt.xlabel("Location", fontsize=12)
plt.ylabel("Count of Location ", fontsize=12)
plt.title("Count of Location Values", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Proportion of "Rating" 


chart_Rating_proportion = pd.DataFrame(clothing.Rating.value_counts()/clothing.Rating.count()*100).reset_index()
chart_Rating_proportion.columns = ['Rating', 'percentage_of_Rating']
chart_Rating_proportion

# Data is unbalanced 

In [None]:
# Visualization 

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Rating_proportion.percentage_of_Rating, x=chart_Rating_proportion.Rating, 
            data=chart_Rating_proportion, palette='gist_earth', width=0.3)


plt.xlabel("Rating", fontsize=12)
plt.ylabel("Percentage", fontsize=12)
plt.title("Breakdown of Clothing Ratings by Percentage", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Proportion of Recommend_Flag 


chart_Recommend_Flag_proportion = pd.DataFrame(clothing.Recommend_Flag.value_counts()/clothing.
                                               Recommend_Flag.count()*100).reset_index()
chart_Recommend_Flag_proportion.columns = ['Recommend_Flag', 'percentage_of_Recommend_Flag']
chart_Recommend_Flag_proportion
# Data is unbalanced

In [None]:
# Visualization 

plt.figure(figsize=(10, 5))

sns.barplot(y=chart_Recommend_Flag_proportion.percentage_of_Recommend_Flag, x=chart_Recommend_Flag_proportion.Recommend_Flag, 
            data=chart_Recommend_Flag_proportion, palette='gist_earth', width=0.2)


plt.xlabel("Recommend Flag", fontsize=12)
plt.ylabel("Percentage ", fontsize=12)
plt.title("Breakdown of Clothing Recommend Flag by Percentage", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
# Visualization of Breakdown of Customer age by Percentage

plt.figure(figsize=(12, 5))
# histplot replaces the deprecated distplot. stat="percent" makes the bar heights
# match the "Percentage" axis label, and kde=True keeps the smoothed curve.
sns.histplot(clothing.Customer_Age, stat="percent", kde=True, color='r')

plt.xlabel("Customer Age", fontsize=12)
plt.ylabel("Percentage ", fontsize=12)
plt.title("Breakdown of Customer age by Percentage", fontsize=15, pad=18)

plt.grid(alpha=0.7,linestyle='--', axis='x')
plt.tight_layout()
plt.show()

In [None]:
#### Visualization of percentage of count of Recommend_Flag per Rating

# Row-normalised cross-tabulation: for every Rating, the share of reviews that
# carry each Recommend_Flag value. The shares are scaled to 0-100 so they really
# are percentages, and Rating/Recommend_Flag combinations that never occur show
# up as 0 instead of NaN.
chart_1 = pd.crosstab(clothing['Rating'], clothing['Recommend_Flag'], normalize='index') * 100


chart_1

In [None]:
chart_1.plot(kind='bar', stacked=True, width=0.4, figsize=(12,5), colormap='Pastel1')
plt.xlabel('Rating')
plt.ylabel('Percentage')
plt.title('100% Stacked Column Chart of Recommend_Flag Count by Rating', pad=20)

In [None]:
#### Visualization of percentage of count of Rating per Recommend_Flag

# Row-normalised cross-tabulation: for every Recommend_Flag value, the share of
# reviews that received each Rating. The shares are scaled to 0-100 so they
# really are percentages, and combinations that never occur show up as 0
# instead of NaN.
chart_2 = pd.crosstab(clothing['Recommend_Flag'], clothing['Rating'], normalize='index') * 100

chart_2

In [None]:
chart_2.plot(kind='bar', stacked=True, width=0.3, figsize=(12,5), colormap='Pastel1')
plt.xlabel('Recommend_Flag')
plt.ylabel('Percentage')
plt.title('100% Stacked Column Chart of Rating Count by Recommend_Flag', pad=20)

In [None]:
#### Relationship between coluumns 'Recommend_Flag' and 'Rating'.

heatmap_data = clothing[['Recommend_Flag', 
                         'Rating']].pivot_table(index='Recommend_Flag', 
                                                columns='Rating', aggfunc=len, fill_value=0)


In [None]:
# Visualization of Relationship between Rating and Recommend_Flag 

plt.figure(figsize=(12, 5))

sns.heatmap(heatmap_data, annot=True, cmap='Pastel1', fmt='d', linewidths=.9)

plt.title('Heatmap: Relationship between Rating and Recommend_Flag', pad=18)
plt.xlabel('Rating')
plt.ylabel('Recommend_Flag')
plt.show()

## Text-EDA

In [None]:
#### Number of purely alphabetic words in each Review:
### The apostrophe is part of the token pattern, so contractions like wasn't or it's count as one word

# Split on every run of non-letter / non-apostrophe characters and count the
# pieces that are longer than one character.
clothing.Review.apply(
    lambda review: len([tok for tok in re.split(r"[^a-zA-Z']+", review) if len(tok) > 1])
)

In [None]:
#### Count of alphanumeric words in each Review:


clothing['Review'].apply( lambda x: len(re.findall(r"([a-zA-Z]+[0-9]+|[0-9]+[a-zA-Z]+)", x))  ).head(10)

In [None]:
#### Count of numbers in each Review:


clothing['Review'].apply( lambda x: len(re.findall(r"\d+", x))  ).head(10)

In [None]:
#### Count of characters which are neither alphabetic nor numeric (like symbols and whitespaces) in each Review:

# findall returns one match per non-alphanumeric character, so len() is the
# character count. (Splitting on r"\w+" would instead count the gaps between
# words, which is roughly the word count, not the number of characters.)
clothing["Review"].apply( lambda x: len(re.findall(r"[^a-zA-Z0-9]", x)) )

In [None]:
#### Number of distinct alphabetic words in each review:

# Same tokenisation as the plain word count (split on non-letter / non-apostrophe
# runs, keep tokens longer than one character); a set removes repeated words.
clothing.Review.apply(
    lambda review: len({tok for tok in re.split(r"[^a-zA-Z']+", review) if len(tok) > 1})
)



In [None]:
#### Characters count in each Review


clothing['Review'].apply( lambda x: len(x) )

In [None]:
#### punctuation count of each review:


clothing['Review'].apply( lambda x: len([w for w in str(x) if w in string.punctuation]))

In [None]:
#### Stopwords in each review:

# Union of the scikit-learn and wordcloud stop-word collections. It stays a list
# because it is concatenated with extra words when the TF-IDF vectorizer is built.
stop_words = list(set(list(ENGLISH_STOP_WORDS) + list(STOPWORDS)))

# Set copy for constant-time membership tests.
stop_word_lookup = set(stop_words)


# Tokens are compared in lower case so that capitalised stop words ("The", "It")
# are found as well; the returned tokens keep their original casing.
clothing["Review"].apply( lambda x: [w for w in re.split(r"[^a-zA-Z']+", x) if w.lower() in stop_word_lookup] )

# 

# 2. Perform text mining tasks to understand which words are used most frequently for positive and negative sentiment. Create word clouds for the positive & negative reviews separately.

# 

In [None]:
#### Creating a new column named "polarity" that holds the TextBlob polarity score of each review.
### If the polarity is 0 or below, the review is treated as "Negative_review".
### If the polarity is above 0, the review is treated as "Positive_review".


clothing["polarity"] = clothing["Review"].apply( lambda x: TextBlob(str(x)).polarity )


clothing["polarity"]

In [None]:
clothing["sentiment"] = np.where(clothing["polarity"] > 0, 'Positive_review', 'Negative_review')

clothing.head()

In [None]:
#### Seprating positive and negative reviews


pos_rev = clothing.loc[clothing["sentiment"] == "Positive_review", ['Review', 'sentiment']]

neg_rev = clothing.loc[clothing["sentiment"] == "Negative_review", ['Review', 'sentiment']]

In [None]:
#### Tokenization

# Turn every review into a list of word tokens: split on any run of characters
# that is not a letter or an apostrophe, then discard pieces of length 0 or 1.
for sentiment_frame in (pos_rev, neg_rev):
    sentiment_frame["Review"] = sentiment_frame["Review"].apply(
        lambda text: [tok for tok in re.split(r"[^a-zA-Z']+", text) if len(tok) > 1]
    )


print(pos_rev["Review"])
print(" ")
print(neg_rev["Review"])

In [None]:
#### Lemmatization (to find root words)

# One lemmatizer instance is shared by all tokens instead of constructing a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()


pos_rev["Review"] = pos_rev["Review"].apply( lambda x: [lemmatizer.lemmatize(w) for w in x] )


neg_rev["Review"] = neg_rev["Review"].apply( lambda x: [lemmatizer.lemmatize(w) for w in x] )

In [None]:
#### Joining the words back together to create a sentence


pos_rev["Review"] = pos_rev["Review"].apply( lambda x: ' '.join(x) )


neg_rev["Review"] = neg_rev["Review"].apply( lambda x: ' '.join(x) )

In [None]:
#### Vectorization


vectorization = CountVectorizer(stop_words=(list(set(list(ENGLISH_STOP_WORDS) + list(STOPWORDS) +["wa", "dd", "ha", "ve", "ll", "tt"]))))

In [None]:
#### Fitting the data

# fit() returns the vectorizer itself. Fitting the single `vectorization` object
# twice would therefore leave both names pointing at one model whose vocabulary
# comes from the negative reviews only. Each sentiment gets its own vectorizer,
# built from the same settings (including the stop-word list).
vectorization_pos = CountVectorizer(**vectorization.get_params()).fit(pos_rev["Review"])


vectorization_neg = CountVectorizer(**vectorization.get_params()).fit(neg_rev["Review"])

In [None]:
pos_col = list(vectorization_pos.get_feature_names_out())
neg_col = list(vectorization_neg.get_feature_names_out())

In [None]:
print("neg_col : ")
print(" ")
print(neg_col)
print(" ")
print(" ")
print(" ")
print("pos_col : ")
print(" ")
print(pos_col)

In [None]:
#### Transforming the data


vectorization_pos = vectorization_pos.transform(pos_rev["Review"]).todense()
vectorization_neg = vectorization_neg.transform(neg_rev["Review"]).todense()

In [None]:
#### Finding the frequency of meaningful words other than stopwords.
### Extracting only the top 300 meaningful words


pos_word_freq = pd.DataFrame(pd.DataFrame(
    vectorization_pos, columns=pos_col).sum(axis=0), 
                             columns=['freq']).reset_index().sort_values(by='freq', 
                                                                      ascending=False).head(300)



neg_word_freq = pd.DataFrame(pd.DataFrame(
    vectorization_neg, columns=neg_col).sum(axis=0), 
                             columns=['freq']).reset_index().sort_values(by='freq', 
                                                                      ascending=False).head(300)

In [None]:
#### Correcting column names


pos_word_freq.columns = ['word', 'freq']
neg_word_freq.columns = ['word', 'freq']

#### Most frequent words for positive reviews are : 

In [None]:
print(list(pos_word_freq.word))

#### Most frequent words for negative reviews are :

In [None]:
print(list(neg_word_freq.word))

#### Word Cloud

In [None]:
#### Creating list of positive words and their frequencies


words_pos = list(pos_word_freq.word)
freq_pos = list(pos_word_freq.freq)



#### Creating list of negative words and their frequencies


words_neg = list(neg_word_freq.word)
freq_neg = list(neg_word_freq.freq)



print("words_pos : ", words_pos)
print(" ")
print("freq_pos : ", freq_pos)
print(" ")
print("words_neg : ", words_neg)
print(" ")
print("freq_neg : ", freq_neg)

In [None]:
#### Creating dictionary using "words_pos" as "key" and "freq_pos" as its value.

pos_dict = dict(zip(words_pos, freq_pos))



#### Creating dictionary using "words_neg" as "key" and "freq_neg" as its value.

neg_dict = dict(zip(words_neg, freq_neg))



print("pos_dict : ", pos_dict)
print(" ")
print("neg_dict :", neg_dict )

In [None]:
# Generateing a word cloud for positive words

pos_word_cloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(pos_dict)


# Generateing a word cloud for negative words

neg_word_cloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(neg_dict)

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(pos_word_cloud)
plt.axis('off')
plt.show()

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(neg_word_cloud)
plt.axis('off')
plt.show()

######

###### As we can clearly see in the above two word clouds, some words are present in both of them. This makes it very difficult to understand which words push the polarity of a review towards negative and which push it towards positive. To overcome this problem, we can identify the words that are present in only one of the two word lists (positive or negative) and plot those.

In [None]:
#### Creating dictionary of positive words which are present in "pos_dict" but not in "neg_dict"

only_pos_dict = {key: value for key, value in pos_dict.items() if key not in neg_dict}



#### Creating dictionary of negative words which are present in "neg_dict" but not in "pos_dict"

only_neg_dict = {key: value for key, value in neg_dict.items() if key not in pos_dict}



print("only_pos_dict : ", only_pos_dict)
print(" ")
print("only_neg_dict : ", only_neg_dict)

In [None]:
# Generating a word cloud from "only_pos_dict" (words in pos_dict that are absent from neg_dict)

wordcloud_only_pos_dict = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(only_pos_dict)




# Generating a word cloud from "only_neg_dict" (words in neg_dict that are absent from pos_dict)

wordcloud_only_neg_dict = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(only_neg_dict)


In [None]:
plt.figure(figsize=(15,6))
plt.imshow(wordcloud_only_pos_dict)
plt.axis('off')
plt.show()

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(wordcloud_only_neg_dict)
plt.axis('off')
plt.show()

# 

In [None]:
#### Relationship between coluumns 'Recommend_Flag' and 'sentiment'.

heatmap_data_2 = clothing[['Recommend_Flag', 
                         'sentiment']].pivot_table(index='Recommend_Flag', 
                                                columns='sentiment', aggfunc=len, fill_value=0)

In [None]:
# Visualization of Relationship between sentiment and Recommend_Flag 

plt.figure(figsize=(12, 5))

sns.heatmap(heatmap_data_2, annot=True, cmap='Pastel1', fmt='d', linewidths=.9)

plt.title('Heatmap: Relationship between sentiment and Recommend_Flag', pad=18)
plt.xlabel('sentiment')
plt.ylabel('Recommend_Flag')
plt.show()



#### With the help of this chart we can see that a review with positive sentiment is sometimes
#### still not recommended, and vice versa

In [None]:
#### Relationship between coluumns 'sentiment' and 'Rating'.

heatmap_data_3 = clothing[['Rating', 
                         'sentiment']].pivot_table(index='Rating', 
                                                columns='sentiment', aggfunc=len, fill_value=0)

In [None]:
# Visualization of Relationship between Rating and sentiment 

plt.figure(figsize=(12, 5))

sns.heatmap(heatmap_data_3, annot=True, cmap='Pastel1', fmt='d', linewidths=.9)

plt.title('Heatmap: Relationship between sentiment and Rating', pad=18)
plt.xlabel('sentiment')
plt.ylabel('Rating')
plt.show()



#### With the help of this chart we can check how well the review sentiment agrees with the rating,
#### e.g. whether reviews with positive sentiment sometimes still carry a low rating and vice versa

# 3. Understand sentiment among the customers on the different categories, sub categories, products by location and age group

#

In [None]:
#### Sentiment by Category


clothing.groupby(['Category', 'sentiment'])[['sentiment']].count()

In [None]:
#### Sentiment by Subcategory1


clothing.groupby(['Subcategory1', 'sentiment'])[['sentiment']].count()

In [None]:
#### Sentiment by SubCategory2


clothing.groupby(['SubCategory2', 'sentiment'])[['sentiment']].count()

In [None]:
#### Sentiment by location


clothing.groupby(['Location', 'sentiment'])[['sentiment']].count()

In [None]:
#### Sentiment by Age category


clothing.groupby(['cust_age_category', 'sentiment'])[['sentiment']].count()

#

# 4. Perform predictive analytics to understand the drivers of customers who are recommending the products.

# Binomial Classification of Recommend_Flag

#

In [None]:
#### To understand the customers behavior about recommending the products we need to perform classification modelling.

### In this case "Recommend_Flag" will be the y-variable. 
### As per the above analysis, column 'Customer_Age' will be a continuous variable because it has high cardinality.
### As per the above analysis, columns 'Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel' and 'Rating' will be
#   categorical x-variables as they have low cardinality; column 'Review' contains free text.
### We need to clean the data in the column "Review" and vectorize its keywords.


####** For this analysis i will not be using "derived columns".

In [None]:
#### Building the modelling DataFrame from only the columns we need

model_columns = ['Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel',
                 'Customer_Age', 'Rating', 'Recommend_Flag', 'Review']

# Select the columns and renumber the rows from 0, discarding the old index.
df = clothing[model_columns].reset_index(drop=True)

df.head()

In [None]:
#### Getting information about the new data

df.info()

In [None]:
#### As "Customer_Age" is continous variable we need to perform outlier treatment if required.
### Choosing IQR method for outlier treatment as range is not too big


q1 = df.Customer_Age.quantile(0.25)
q3 = df.Customer_Age.quantile(0.75)
iqr = q3-q1


upper_range = q3 + 1.5*iqr
lower_range = q1 - 1.5*iqr

print('upper_range : ', upper_range)
print('lower_range : ', lower_range)
print('iqr : ', iqr)
print('Customer max age : ', df.Customer_Age.max())
print('Customer min age : ', df.Customer_Age.min())
print("")
print('Number of records having customer age greater then upper_range : ', df.loc[df.Customer_Age >= upper_range].shape[0])

In [None]:
#### Treating outliers

# Cap ages at the IQR fences (lower_range / upper_range): values below the lower
# fence become lower_range, values above the upper fence become upper_range.
# Series.clip does this in one vectorised call instead of a per-row lambda.
df['Customer_Age'] = df['Customer_Age'].clip(lower=lower_range, upper=upper_range)

In [None]:
#### Creating dummies for all categorical x-variables except column "Review"

#### It is giving True and False so converting it into 0 and 1 using .astype(int) function.

dummies_1 = pd.get_dummies(data=df[['Category', 'Subcategory1', 'SubCategory2', 'Location', 
                                    'Channel', 'Rating']], drop_first=True).astype(int)

dummies_1

In [None]:
#### Merging column "Review", "Customer_Age" and Recommend_Flag

df = pd.concat(objs=[dummies_1, df[["Customer_Age", "Review", "Recommend_Flag"]]], axis=1)


df.head()

In [None]:
# Checking if data is imbalanced

df.Recommend_Flag.value_counts()/df.Recommend_Flag.shape[0]


#### Data is imbalanced, so we split it into train and test first and resample only the training part afterwards,
#### to avoid data leakage and duplicated records across train and test

In [None]:
### Train test split

train, test = train_test_split(df, train_size=0.7, random_state=1)

print(train.shape)
print(test.shape)

In [None]:
#### train_data EDA

train.Recommend_Flag.value_counts()/train.Recommend_Flag.shape[0]

# Data is still imbalanced

In [None]:
# Making data balanced

# Using random over-sampling for this (RandomOverSampler), applied to the training split only
### Features are all columns except the last one; the target is the last column, cast to int with astype(int)
### (presumably because the flag was cast to object earlier and the sampler needs a discrete label type - TODO confirm)

x_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(train.iloc[:, :-1], train.iloc[:, -1].astype(int))


print(x_resampled.shape, y_resampled.shape)
print(test.shape)

In [None]:
#### Checking the balance of the data again to confirm

y_resampled.value_counts()/y_resampled.shape[0]

# The classes should now be balanced (RandomOverSampler equalises the class counts by default)

In [None]:
#### Data cleaning of the column "Review" (applied to "x_resampled" and to "test"), followed by vectorization (TfidfVectorizer)

#### Steps performed by text_clean, in order:
#### 1. Lower-case the text
#### 2. Tokenization (word-wise): split on everything that is not a letter or an apostrophe, which removes numbers and
####    symbols, and drop single-character tokens -- this is needed for the next step
#### 3. Lemmatization to find the root words
#### 4. Join the words back into one string for the vectorization step
#### Spelling correction with TextBlob.correct() is planned but currently not applied

def text_clean(y):
    """Normalise a Series of review strings before vectorisation.

    Each review is lower-cased, split on every run of characters that are
    neither letters nor apostrophes, stripped of tokens shorter than two
    characters, lemmatised with WordNet and re-joined with single spaces.

    Parameters
    ----------
    y : pandas.Series
        Series whose values are strings.

    Returns
    -------
    pandas.Series
        The cleaned text, one string per input row.
    """
    # One lemmatizer and one compiled pattern are shared by all reviews;
    # previously a new WordNetLemmatizer was constructed for every token.
    lemmatize = WordNetLemmatizer().lemmatize
    splitter = re.compile(r"[^a-zA-Z']+")

    def clean_one(text):
        tokens = [w for w in splitter.split(text.lower()) if len(w) > 1]
        return ' '.join(lemmatize(w) for w in tokens)

    # NOTE: spelling correction via TextBlob(x).correct() is intentionally not
    # applied here (it was commented out in the original implementation).
    return y.apply(clean_one)

In [None]:
#---Text data cleaning for "x_resampled" data ("train data")

x_resampled["Review"] = text_clean(x_resampled["Review"])



#---Text data cleaning for "test" data 


test["Review"] = text_clean(test["Review"])

In [None]:
#### Vectorization
#### TF-IDF over 1- to 5-grams of the cleaned training reviews. A term must occur in at
#### least 150 documents and in at most 99% of them, and at most 300 features are kept.
#### The vectorizer is fitted on the resampled training reviews only.

vect = TfidfVectorizer(
    stop_words=stop_words+["ve", "ll", "isn", "don", "doesn", "didn", "tt", "ha", "wa"],
    min_df=150,
    max_df=0.99,
    max_features=300,
    ngram_range=(1,5),
)

vect = vect.fit(x_resampled['Review'])

vect_cols = vect.get_feature_names_out()

# .toarray() returns a plain ndarray; .todense() returned the legacy numpy.matrix type.
vect_matrix = vect.transform(x_resampled['Review']).toarray()


In [None]:
text_dummies_train = pd.DataFrame(vect_matrix, columns=vect_cols)

text_dummies_train

In [None]:
text_dummies_test = pd.DataFrame(vect.transform(test["Review"]).todense(), columns=vect_cols)

text_dummies_test

In [None]:
#### Concatenating the vectorised review features with the other training features.
#### x_resampled.iloc[:, :-1] leaves out its last column (the raw "Review" text).
#### reset_index(drop=True) discards the old index directly, so it also works when a
#### column happens to be called "index".

train_data = pd.concat([x_resampled.iloc[:, :-1].reset_index(drop=True), text_dummies_train.reset_index(drop=True)], axis=1)


#### Same for the test data; test.iloc[:, :-2] leaves out the last two columns
#### ("Review" and "Recommend_Flag").

test_data = pd.concat([test.iloc[:, :-2].reset_index(drop=True), text_dummies_test.reset_index(drop=True)], axis=1)

In [None]:
#### Correcting column names


train_data.columns = train_data.columns.str.replace("'", "_").str.replace(" ", "_")
train_data.columns = train_data.columns.str.replace("1", "_a_a").str.replace("2", "_b_b").str.replace("3", "_c_c").str.replace("4", "_d_d").str.replace("5", "_e_e")
print(list(train_data.columns))
print("")


test_data.columns = test_data.columns.str.replace("'", "_").str.replace(" ", "_")
test_data.columns = test_data.columns.str.replace("1", "_a_a").str.replace("2", "_b_b").str.replace("3", "_c_c").str.replace("4", "_d_d").str.replace("5", "_e_e")

print(list(test_data.columns))

#### Feature selection

In [None]:
# Using "SelectKBest"


skb_feat = SelectKBest(score_func=f_classif, k=20)
skb_feat = skb_feat.fit(train_data, y_resampled)


skb_list = list((train_data.columns)[skb_feat.get_support()])

print(skb_list)

In [None]:
# Using RFE


rfe_feat = RFE(estimator=LogisticRegression(), n_features_to_select=25)
rfe_feat = rfe_feat.fit(train_data, y_resampled)

rfe_list = list((train_data.columns)[rfe_feat.get_support()])

print(rfe_list)

In [None]:
# variable selected after combining above two

var_selected = list(set(rfe_list + skb_list))

print(var_selected)
print(len(var_selected))

###### Removing multi-colinerity

In [None]:
# Creating formulae to be used in dmatrices


f_like = 'Recommend_Flag ~ ' + ' + '.join(var_selected)

f_like


In [None]:
# To create a formula we need y and x variables in same dataframe
# creating a temprory dataframe for this by combining "y_resampled" and "train_data"


tem_df = pd.concat([y_resampled.reset_index().drop(columns='index'), train_data.loc[:, var_selected].reset_index().drop(columns = 'index')], axis = 1)

tem_df

In [None]:
#### Creating a blank dataframe named as vif

vif = pd.DataFrame()


#### Designing matrices from above created formulae (f_like)


y, x = dmatrices(formula_like=f_like, data=tem_df, return_type='dataframe')

In [None]:
#### Creating a new column with column names

vif['features'] = x.columns


#### Creating a new column having vales as variance_inflation_factor 

vif['vif_factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]

In [None]:
x.columns

In [None]:
vif.head()

In [None]:
# Keep only predictors whose VIF is at most 5. The design matrix produced by dmatrices
# also contains an "Intercept" column, which is not a column of train_data; it is
# excluded here so that the later train_data.loc[:, var_selected] lookups cannot fail.
var_selected = list(vif.loc[(vif.vif_factor <= 5) & (vif.features != 'Intercept'), 'features'])

print(var_selected)

In [None]:
#### Standardizing the data

std = StandardScaler()

In [None]:
#### Fitting and transforming the trained data

train_transformed = std.fit_transform(train_data.loc[:,var_selected])


#### Transforming the test data

test_transformed =  std.transform(test_data.loc[:, var_selected])

In [None]:
#### Creating the dataframe for both trained and test data with appropiate column names


x_train = pd.DataFrame(data=train_transformed, columns=train_data.loc[:,var_selected].columns)
x_test = pd.DataFrame(data=test_transformed, columns=test_data.loc[:,var_selected].columns)

# Model 1: Logistics Regression

In [None]:
#### Creating param grid for Logistics Regression

param_grid_lr = {'penalty' : ['l1', 'l2'], 'C': [0.1, 0.01, 0.5, 0.05, 0.25, 0.75]}

In [None]:
#### Putting parameters in GridSearchCV
#### The grid contains both 'l1' and 'l2' penalties. The default 'lbfgs' solver does not
#### support 'l1', so a solver that handles both ('liblinear') is set explicitly.

LR = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid_lr, verbose=True, cv=5)

In [None]:
#### Fitting the data in trained data for x-variables and trained data for y_variables


LR.fit(x_train, y_resampled)

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data


df_LR_train = pd.DataFrame({'Actual': y_resampled, 'Predicted': LR.predict(x_train)})


#### Creating dataframe having actual y-variable of test data and predicted y-variable of test data


df_LR_test = pd.DataFrame({'Actual': test.Recommend_Flag.astype(int), 'Predicted': LR.predict(x_test)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_LR_train.Actual, df_LR_train.Predicted))

In [None]:
# Classification report of Test data

print(classification_report(df_LR_test.Actual, df_LR_test.Predicted))

##### Comments




###### -- Accuracy is the same, and high, on both train and test data (94%).
###### -- Precision for class "0" is much higher on train data than on test data.
###### -- High precision for class "1" on both train and test data indicates that when the model predicts 1, the true label is likely to be 1.
###### -- Recall is high on both train and test data; recall is the proportion of actual positive cases that are correctly identified.

###### 

## Model 2: RandomForest

In [None]:
#### Creating param grid for RandomForest

param_grid_rf = {'n_estimators' : [100,200,300], 'max_leaf_nodes' : [4,7,10]}

In [None]:
#### Putting parameters in GridSearchCV
#### random_state is fixed (as for the train/test split and the over-sampler) so that the
#### forest, and the scores reported for it, can be reproduced.


rf = GridSearchCV(RandomForestClassifier(random_state=1), param_grid_rf, verbose=True, cv=4)

In [None]:
#### Fitting the data in trained data for x-variables and trained data for y_variables


rf.fit(x_train, y_resampled)

In [None]:
#### Finding best parameters of RandomForest

rf.best_params_

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data

df_rf_train = pd.DataFrame({'Actual': y_resampled, 'Predicted': rf.predict(x_train)})


#### Creating dataframe having actual y-variable of test data and predicted y-variable of test data

df_rf_test = pd.DataFrame({'Actual': test.Recommend_Flag.astype(int), 'Predicted': rf.predict(x_test)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_rf_train.Actual, df_rf_train.Predicted))

In [None]:
# Classification report of Test data

print(classification_report(df_rf_test.Actual, df_rf_test.Predicted))

##### Comments




###### -- Accuracy is the same, and high, on both train and test data (94%). It is the same as for Logistic Regression.
###### -- Precision for class "0" is much higher on train data than on test data.
###### -- High precision for class "1" on both train and test data indicates that when the model predicts 1, the true label is likely to be 1.
###### -- Recall is high on both train and test data; recall is the proportion of actual positive cases that are correctly identified.

## Model 3: XGBoost

In [None]:
#### Creating param grid for XGBoost

param_grid_xg = {'learning_rate' : [0.01, 0.05, 0.1], 'n_estimators' : [100,200,300]}

In [None]:
#### Putting parameters in GridSearchCV

xg = GridSearchCV(XGBClassifier(),param_grid_xg, verbose=True, cv=4)

In [None]:
#### Fitting the data in trained data for x-variables and trained data for y_variables

xg.fit(x_train, y_resampled)

In [None]:
#### Finding best parameters of XGBoost

xg.best_params_

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data

df_xg_train = pd.DataFrame({'Actual': y_resampled, 'Predicted': xg.predict(x_train)})


#### Creating dataframe having actual y-variable of test data and predicted y-variable of test data

df_xg_test = pd.DataFrame({'Actual': test.Recommend_Flag.astype(int), 'Predicted': xg.predict(x_test)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_xg_train.Actual, df_xg_train.Predicted))

In [None]:
# Classification report of Test data

print(classification_report(df_xg_test.Actual, df_xg_test.Predicted))

##### Comments




###### -- Accuracy on train data (0.96) is higher than on test data (0.93).
###### -- Precision for class "0" is much higher on train data than on test data.
###### -- High precision for class "1" on both train and test data indicates that when the model predicts 1, the true label is likely to be 1.
###### -- Recall for class "0" is higher on train data than on test data.
###### -- Recall for class "1" is almost the same on train and test data.

######

###### 

## All the above models have very similar results. I would choose Random Forest as the final model, as it has high accuracy as well as higher recall and precision values.

#

# Multinomial classification of rating based on Review

In [None]:
### We need to clean the data in the column "Review" and need to vectorize the keywords.

####** For this analysis i will not be using "derived columns".

In [None]:
#### To understand the customers behavior about Rating the products we need to perform classification modelling.

### In this case "Rating" will be the y-variable. 
### As per the above analysis, column 'Customer_Age' will be a continuous variable because this column has high cardinality.
### As per the above analysis, columns 'Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel', 'Review' will be
###   categorical x-variables as these have low cardinality, and column 'Review' contains strings.
### We need to clean the data in the column "Review" and need to vectorize the keywords.


####** For this analysis i will not be using "derived columns".

In [None]:
#### Creating a new DataFrame with only required columns


data = clothing[['Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel','Customer_Age', 'Rating', 'Review']]

data = data.reset_index().drop(columns='index')

data.head()

In [None]:
#### Getting information about the new data

data.info()

In [None]:
#### As "Customer_Age" is continous variable we need to perform outlier treatment if required.
### Choosing IQR method for outlier treatment as range is not too big


q1_m = data.Customer_Age.quantile(0.25)
q3_m = data.Customer_Age.quantile(0.75)
iqr_m = q3_m-q1_m


upper_range_m = q3_m + 1.5*iqr_m
lower_range_m = q1_m - 1.5*iqr_m

print('upper_range : ', upper_range_m)
print('lower_range : ', lower_range_m)
print('iqr : ', iqr_m)
print('Customer max age : ', data.Customer_Age.max())
print('Customer min age : ', data.Customer_Age.min())
print("")
print('Number of records having customer age greater then upper_range : ', data.loc[data.Customer_Age >= upper_range_m].shape[0])

In [None]:
#### Treating outliers
#### Ages below lower_range_m or above upper_range_m are capped at the nearest bound;
#### Series.clip does this in one vectorised call instead of a row-by-row lambda.

data['Customer_Age'] = data['Customer_Age'].clip(lower=lower_range_m, upper=upper_range_m)

In [None]:
#### Creating dummies for all categorical x-variables except column "Review"

#### It is giving True and False so converting it into 0 and 1 using .astype(int) function.

dummies_1_m = pd.get_dummies(data=data[['Category', 'Subcategory1', 'SubCategory2', 'Location', 
                                    'Channel']], drop_first=True).astype(int)

dummies_1_m

In [None]:
#### Merging column "Review", "Customer_Age" and Rating

data = pd.concat(objs=[dummies_1_m, data[["Customer_Age", "Review", "Rating"]]], axis=1)


data.head()

In [None]:
# Checking if data is imbalanced

data.Rating.value_counts()/data.Rating.shape[0]


#### Data is imbalanced so splitting the data into train test and then sampling the data 
#### to avoid data leakage and data duplicacy in train and test

In [None]:
### Train test split

train_m, test_m = train_test_split(data, train_size=0.7, random_state=1)

print(train_m.shape)
print(test_m.shape)

In [None]:
#### train_data EDA

train_m.Rating.value_counts()/train_m.Rating.shape[0]

# Data is still imbalanced

In [None]:
# Making the training data balanced

# Random over-sampling (RandomOverSampler) is used for this. Only the training split
# is resampled; the test split is left as it is.
### The target (last column of train_m) is cast with astype(int) so that the class
### labels passed to the sampler are discrete integers.

x_resampled_m, y_resampled_m = RandomOverSampler(random_state=1).fit_resample(train_m.iloc[:, :-1], train_m.iloc[:, -1].astype(int))


print(x_resampled_m.shape, y_resampled_m.shape)
print(test_m.shape)

In [None]:
#### Checking the balance of the data again to confirm

y_resampled_m.value_counts()/y_resampled_m.shape[0]

# After over-sampling, the classes in the training data should now be balanced

In [None]:
#### Data cleaning in dataset "x_resampled" in the column "Review"
#### We need to perform TextBlob.correct(), Lemmatization, and "vectorization (TfidfVectorizer)"

#### Removing alphanumeric, numbers and non-alphanumeric charaters and also removing all alphabets which are single
#### Tokenization (word-wise) -- this is important for next step "Lemmatization"
#### Lemmatization to find the root words
#### Now we need to join back all words for this next step
#### We need to correct the spelling mistakes


def text_clean(y):
    """Normalise a Series of review strings before vectorisation.

    This re-defines the helper of the same name used for the binary model,
    with the same cleaning steps: lower-case, split on every run of
    characters that are neither letters nor apostrophes, drop tokens shorter
    than two characters, lemmatise with WordNet, re-join with single spaces.

    Parameters
    ----------
    y : pandas.Series
        Series whose values are strings.

    Returns
    -------
    pandas.Series
        The cleaned text, one string per input row.
    """
    # One lemmatizer and one compiled pattern are shared by all reviews;
    # previously a new WordNetLemmatizer was constructed for every token.
    lemmatize = WordNetLemmatizer().lemmatize
    splitter = re.compile(r"[^a-zA-Z']+")

    def clean_one(text):
        tokens = [w for w in splitter.split(text.lower()) if len(w) > 1]
        return ' '.join(lemmatize(w) for w in tokens)

    # NOTE: spelling correction via TextBlob(x).correct() is intentionally not
    # applied here (it was commented out in the original implementation).
    return y.apply(clean_one)

In [None]:
#---Text data cleaning for "x_resampled" data ("train data")

x_resampled_m["Review"] = text_clean(x_resampled_m["Review"])



#---Text data cleaning for "test" data 


test_m["Review"] = text_clean(test_m["Review"])

In [None]:
#### Vectorization
#### TF-IDF over 1- to 5-grams of the cleaned training reviews. A term must occur in at
#### least 150 documents and in at most 99% of them, and at most 300 features are kept.
#### The vectorizer is fitted on the resampled training reviews only.

vect_m = TfidfVectorizer(
    stop_words=stop_words+["ve", "ll", "isn", "don", "doesn", "didn", "tt", "ha", "wa"],
    min_df=150,
    max_df=0.99,
    max_features=300,
    ngram_range=(1,5),
)

vect_m = vect_m.fit(x_resampled_m['Review'])

vect_cols_m = vect_m.get_feature_names_out()

# .toarray() returns a plain ndarray; .todense() returned the legacy numpy.matrix type.
vect_matrix_m = vect_m.transform(x_resampled_m['Review']).toarray()


In [None]:
text_dummies_train_m = pd.DataFrame(vect_matrix_m, columns=vect_cols_m)

text_dummies_train_m

In [None]:
text_dummies_test_m = pd.DataFrame(vect_m.transform(test_m["Review"]).todense(), columns=vect_cols_m)

text_dummies_test_m

In [None]:
#### Concatenating the vectorised review features with the other training features.
#### x_resampled_m.iloc[:, :-1] leaves out its last column (the raw "Review" text).
#### reset_index(drop=True) discards the old index directly, so it also works when a
#### column happens to be called "index".

train_data_m = pd.concat([x_resampled_m.iloc[:, :-1].reset_index(drop=True), text_dummies_train_m.reset_index(drop=True)], axis=1)


#### Same for the test data; test_m.iloc[:, :-2] leaves out the last two columns
#### ("Review" and "Rating").

test_data_m = pd.concat([test_m.iloc[:, :-2].reset_index(drop=True), text_dummies_test_m.reset_index(drop=True)], axis=1)

In [None]:
#### Correcting column names


train_data_m.columns = train_data_m.columns.str.replace("'", "_").str.replace(" ", "_")
train_data_m.columns = train_data_m.columns.str.replace("1", "_a_a").str.replace("2", "_b_b").str.replace("3", "_c_c").str.replace("4", "_d_d").str.replace("5", "_e_e")
print(list(train_data_m.columns))
print("")


test_data_m.columns = test_data_m.columns.str.replace("'", "_").str.replace(" ", "_")
test_data_m.columns = test_data_m.columns.str.replace("1", "_a_a").str.replace("2", "_b_b").str.replace("3", "_c_c").str.replace("4", "_d_d").str.replace("5", "_e_e")

print(list(test_data_m.columns))

#### Feature selection

In [None]:
# Using "SelectKBest"
# k is capped at the number of available columns. The hard-coded 331 is only valid while
# train_data_m has at least that many columns; with fewer, scikit-learn either rejects the
# value or (in newer releases) only warns -- and warnings are silenced in this notebook.

skb_feat_m = SelectKBest(score_func=f_classif, k=min(331, train_data_m.shape[1]))
skb_feat_m = skb_feat_m.fit(train_data_m, y_resampled_m)


var_selected_m = list((train_data_m.columns)[skb_feat_m.get_support()])

print(var_selected_m)

##### Removing multi-collinearity

In [None]:
#### Correalation between variables in absolute values.

multi_c = train_data_m.corr().abs()

In [None]:
#### Creating a dataframe indexed by variable name with a 0/1 column 't_f':
#### 1 means the variable has |correlation| > 0.5 with a variable that comes before it.
#### Only the upper triangle of the correlation matrix is inspected, so for every correlated
#### pair just the later variable is flagged and the earlier one stays in the model
#### (counting over full rows flagged, and therefore dropped, both members of each pair).

upper_tri_mask = np.triu(np.ones(multi_c.shape, dtype=bool), k=1)
multi_c = pd.DataFrame((multi_c.where(upper_tri_mask) > 0.5).any().astype(int), columns= ['t_f'])

In [None]:
#### Creating a list having only those variables which have higher multi-collinearity


multi_c_list = list(multi_c[multi_c.t_f == 1].index)

In [None]:
print("Variables to drop from var_selected_m to remove multicollinearity", multi_c_list)

In [None]:
#### Dropping variables having multi-collinearity


var_selected_m = list(set(var_selected_m) - set(multi_c_list))

print(var_selected_m)

In [None]:
#### Standardizing the data

std_m = StandardScaler()

In [None]:
#### Fitting and transforming the trained data

train_transformed_m = std_m.fit_transform(train_data_m.loc[:,var_selected_m])


#### Transforming the test data

test_transformed_m =  std_m.transform(test_data_m.loc[:, var_selected_m])

In [None]:
#### Creating the dataframe for both trained and test data with appropiate column names


x_train_m = pd.DataFrame(data=train_transformed_m, columns=train_data_m.loc[:,var_selected_m].columns)
x_test_m = pd.DataFrame(data=test_transformed_m, columns=test_data_m.loc[:,var_selected_m].columns)

# Model 1: Logistics Regression

In [None]:
#### Creating param grid for Logistics Regression

param_grid_lr_m = {'penalty' : ['l1', 'l2'], 'C': [0.1, 0.01, 0.5, 0.05, 0.25, 0.75]}

In [None]:
#### Putting parameters in GridSearchCV
#### The grid contains both 'l1' and 'l2' penalties. The default 'lbfgs' solver does not
#### support 'l1', so 'saga' (which handles both penalties and multiclass targets) is set
#### explicitly; max_iter is raised to give it room to converge.

LR_m = GridSearchCV(LogisticRegression(solver='saga', max_iter=1000), param_grid_lr_m, verbose=True, cv=5)

In [None]:
#### Fitting the data in trained data for x-variables and trained data for y_variables


LR_m.fit(x_train_m, y_resampled_m)

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data


df_LR_train_m = pd.DataFrame({'Actual': y_resampled_m, 'Predicted': LR_m.predict(x_train_m)})


#### Creating dataframe having actual y-variable of test data and predicted y-variable of test data


df_LR_test_m = pd.DataFrame({'Actual': test_m.Rating.astype(int), 'Predicted': LR_m.predict(x_test_m)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_LR_train_m.Actual, df_LR_train_m.Predicted))

In [None]:
# Classification report of Test data

print(classification_report(df_LR_test_m.Actual, df_LR_test_m.Predicted))

##### Comments




###### -- Accuracy is the same on train data (0.56) and test data (0.56).
###### -- There is a large gap in precision between classes on both train and test data.
###### -- Recall for class "5" is higher on both train (0.70) and test (0.68) data.
###### -- f1-score for class "5" is higher on both train (0.68) and test (0.75) data.

######

## Model 2: RandomForest

In [None]:
#### Creating param grid for RandomForest

param_grid_rf_m = {'n_estimators' : [100,200,300], 'max_leaf_nodes' : [4,7,10]}

In [None]:
#### Putting parameters in GridSearchCV
#### random_state is fixed (as for the train/test split and the over-sampler) so that the
#### forest, and the scores reported for it, can be reproduced.


rf_m = GridSearchCV(RandomForestClassifier(random_state=1), param_grid_rf_m, verbose=True, cv=4)

In [None]:
#### Fitting the data in trained data for x-variables and trained data for y_variables


rf_m.fit(x_train_m, y_resampled_m)

In [None]:
#### Finding best parameters of RandomForest

rf_m.best_params_

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data

df_rf_train_m = pd.DataFrame({'Actual': y_resampled_m, 'Predicted': rf_m.predict(x_train_m)})


#### Creating dataframe having actual y-variable of test data and predicted y-variable of test data

df_rf_test_m = pd.DataFrame({'Actual': test_m.Rating.astype(int), 'Predicted': rf_m.predict(x_test_m)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_rf_train_m.Actual, df_rf_train_m.Predicted))

In [None]:
# Classification report of Test data

print(classification_report(df_rf_test_m.Actual, df_rf_test_m.Predicted))

##### Comments




###### -- Model is under-fitted, as accuracy on train data (0.42) is lower than on test data (0.51).
###### -- Precision is low for all classes on both train and test data.
###### -- Recall for class "5" is higher on both train (0.70) and test (0.68) data.
###### -- f1-score for class "5" is higher on both train (0.68) and test (0.75) data.

######

## Model 3: XGBoost

In [None]:
#### Creating param grid for XGBoost

param_grid_xg_m = {'learning_rate' : [0.01, 0.05, 0.1], 'n_estimators' : [100,200,300]}

In [None]:
#### Putting parameters in GridSearchCV

xg_m = GridSearchCV(XGBClassifier(),param_grid_xg_m, verbose=True, cv=4)

###### In XGBoost model y-variable values starts with 0 so subtracting all with 1

In [None]:
#### Fitting XGBoost on the training features, with the labels shifted down by 1 so they start at 0
y_resampled_m_a = y_resampled_m - 1
xg_m.fit(x_train_m, y_resampled_m_a)

In [None]:
#### Finding best parameters of XGBoost

xg_m.best_params_

In [None]:
#### Creating dataframe having actual y-variable of trained data and predicted y-variable of trained data

df_xg_train_m = pd.DataFrame({'Actual': y_resampled_m_a, 'Predicted': xg_m.predict(x_train_m)})


#### Actual vs. predicted labels for the test data (actual ratings shifted down by 1, like the training labels)

test_m_Rating_m_a = test_m.Rating.astype(int) - 1
df_xg_test_m = pd.DataFrame({'Actual': test_m_Rating_m_a, 'Predicted': xg_m.predict(x_test_m)})

#### Errors

In [None]:
# Classification report of Train data

print(classification_report(df_xg_train_m.Actual+1, df_xg_train_m.Predicted+1))

In [None]:
# Classification report of Test data

print(classification_report(df_xg_test_m.Actual+1, df_xg_test_m.Predicted+1))

##### Comments




###### -- Model is overfitted, as accuracy on train data (0.91) is much higher than on test data (0.58).
###### -- Precision for class 5 is higher on both train and test data, but for all other classes precision is low.
###### -- Recall for all classes is lower on test data than on train data.
###### -- f1-score for all classes is lower on test data than on train data.

## Logistic Regression is better compared to the other models.

###### After creating multiple models we can say "Logistic Regression" is the best model among all 3 in terms of accuracy,
###### but for recall, f1-score and precision none of the models are good.

######

# 5. Create topics and understand themes behind the topics by performing topic mining.

#

In [None]:
Review = clothing.reset_index().drop(columns='index')
Review = Review.Review
Review

In [None]:
#### Cleaning the column "Review" in the dataframe "clothing" and storing as in "Reviews" as series.

Reviews = text_clean(Review)
Reviews

In [None]:
#### Stopwords to remove from this series "Reviews"

stop_words = stop_words+["ve", "ll", "isn", "don", "doesn", "didn", "tt", "ha", "wa"]

print(stop_words)

In [None]:
#### Removing the stop words from "Reviews" by using list comprehension
#### Membership is tested against a set built once, rather than scanning the stop_words
#### list again for every token of every review.

stop_word_set = set(stop_words)
Reviews = Reviews.apply( lambda x: [w for w in x.split() if w not in stop_word_set] )


print(Reviews)

In [None]:
#### Don't want single character words

Reviews = Reviews.apply(lambda x: [w for w in x if len(w) > 1])

In [None]:
Reviews

In [None]:
#### Creating "corpora.Dictionary" for "Reviews" (list of words)


Reviews_dict = corpora.Dictionary(Reviews)

print(Reviews_dict)

In [None]:
#### Converting Reviews_dict into document term matrix


doc_term_matrix = [Reviews_dict.doc2bow(w) for w in Reviews]

doc_term_matrix

In [None]:
#### Creating an object for LDA model using gensim library


LDA = gensim.models.ldamodel.LdaModel

In [None]:
#### Scoring candidate values of "number of topics" (2 to 14) with one training pass each.
#### The model class is bound once, outside the loop, and random_state is fixed (as for the
#### models trained afterwards) so that the printed scores can be reproduced.

LDA = gensim.models.ldamodel.LdaModel
for i in range(2,15):
    ldamodel = LDA(doc_term_matrix, num_topics=i, id2word=Reviews_dict, passes=1, minimum_probability=0, random_state=1)
    print(i," : ",ldamodel.log_perplexity(doc_term_matrix))



'''log_perplexity returns a per-word likelihood bound (perplexity = 2^(-bound)): a higher value, i.e. closer to zero, means lower perplexity and a better fit'''

In [None]:
#### Running the LDA model on doc_term_matrix with num_topics = 14, as log_perplexity for 14 topics is the most negative (-7.450).

%time ldamodel = LDA(doc_term_matrix, num_topics=14, id2word=Reviews_dict, passes=14, minimum_probability=0, random_state=1)

In [None]:
#Visualization

pyLDAvis.enable_notebook()

pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, Reviews_dict, sort_topics=False, mds='mmds')
#pyLDAvis.display(ldamodel)

In [None]:
## In the above model, clusters are overlapping with each other (in visualization)
## So, This model is not best fit for this analysis

In [None]:
# This time lets try with num_topics = 13
# log_perplexity = -7.402

%time ldamodel = LDA(doc_term_matrix, num_topics=13, id2word=Reviews_dict, passes=14, minimum_probability=0, random_state=1)

In [None]:
#Visualization

pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, Reviews_dict, sort_topics=False, mds='mmds')
#pyLDAvis.display(ldamodel)

In [None]:
## In the above model also, clusters are overlapping with each other (in visualization)
## So, This model is not best fit for this analysis

In [None]:
# This time lets try with num_topics = 12
# log_perplexity = -7.347

%time ldamodel = LDA(doc_term_matrix, num_topics=12, id2word=Reviews_dict, passes=14, minimum_probability=0, random_state=1)

In [None]:
#Visualization

pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, Reviews_dict, sort_topics=False, mds='mmds')
#pyLDAvis.display(ldamodel)

In [None]:
## In the above model also, clusters are overlapping with each other (in visualization)
## So, This model is not best fit for this analysis

In [None]:
# This time lets try with num_topics = 11
# log_perplexity = -7.347

%time ldamodel = LDA(doc_term_matrix, num_topics=11, id2word=Reviews_dict, passes=14, minimum_probability=0, random_state=1)

In [None]:
#Visualization

pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, Reviews_dict, sort_topics=False, mds='mmds')
#pyLDAvis.display(ldamodel)

###### This model is final as there is no overlapping among clusters.

In [None]:
print("log_perplexity : ", ldamodel.log_perplexity(doc_term_matrix))
print("--------------------------------------------------------------------")
# Loop over however many topics the fitted model has instead of a hard-coded 11,
# so this cell stays correct if the model is re-trained with another topic count.
for i in range(ldamodel.num_topics):
    print("Topic : ", i)
    print(ldamodel.print_topic(i, topn=10))
    print("__________")

In [None]:
#### Printing topics with weightage of words

ldamodel.print_topics(num_topics = 11, num_words=20)

In [None]:
#### Extracting topics from above excluding weightage


def extracting_topics(weighted_tuples_list):
    """Turn weighted topic descriptions into plain keyword strings.

    Parameters
    ----------
    weighted_tuples_list : iterable of (topic_id, topic_string)
        Pairs whose second element holds the topic's words in double quotes,
        e.g. '0.030*"dress" + 0.020*"fit"'.

    Returns
    -------
    list of str
        One string per pair: the double-quoted words in order of appearance,
        joined with single spaces (empty when no quoted word is found).
    """
    quoted_word = re.compile(r'"([^"]*)"')
    return [
        ' '.join(quoted_word.findall(topic_repr))
        for _topic_id, topic_repr in weighted_tuples_list
    ]

In [None]:
#### Extracting topics from above excluding weightage
#### The number of topics is read from the model itself; the call previously asked for 20
#### topics although the model above was trained with 11.

topics = pd.Series(extracting_topics(ldamodel.print_topics(num_topics = ldamodel.num_topics, num_words=20)))

topics

In [None]:
# Print the topic distribution of every document, numbered from 0.
# enumerate supplies the running document number in place of a hand-maintained counter.
for count, i in enumerate(ldamodel[doc_term_matrix]):
    print( "doc : ", count, i)

In [None]:
def format_topics_sentences(ldamodel=ldamodel, corpus=doc_term_matrix, texts=Reviews):
    """Tabulate the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Fitted topic model; indexed with ``corpus`` to obtain per-document
        (topic, probability) pairs and queried with ``show_topic``.
    corpus
        Documents in the representation the model was trained on.
    texts
        Indexable collection; ``texts[i]`` is stored next to document ``i``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals), ``Topic_Keywords`` (comma-separated) and ``Original_Text``.
    """
    records = []

    for doc_idx, topic_dist in enumerate(ldamodel[corpus]):
        if not topic_dist:
            # No (topic, probability) pair for this document: nothing is recorded.
            continue

        # max() yields the first pair with the largest probability, which is the pair
        # a stable descending sort would put in front.
        best_topic, best_prob = max(topic_dist, key=lambda pair: pair[1])

        keywords = ", ".join(term for term, _weight in ldamodel.show_topic(best_topic))
        records.append([int(best_topic), round(best_prob, 4), keywords, texts[doc_idx]])

    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Original_Text']
    return pd.DataFrame(records, columns=column_names)



In [None]:
Themes = format_topics_sentences(ldamodel=ldamodel, corpus=doc_term_matrix, texts=Reviews)
Themes

# End