# List of Input Parameters

In [1]:
# Forwarded to KeyBERT's `stop_words` argument when phrases are extracted.
stop_words_in_phrases = None #or 'english'
# Forwarded to KeyBERT's `top_n` argument: number of phrases requested per review.
no_of_phrases_per_review = 10
# Upper bound of `keyphrase_ngram_range`; the lower bound is fixed at 2 in the extraction cell.
max_no_of_words_per_phrase = 8
# Decimal places used when rounding Phrase_Avg_Rating and Review_Avg_Rating.
no_of_decimal_places_in_avg_rating = 2
# Passed as `by=` to sort_values on the output table; must name column(s) of that table.
column_to_sort_output = ['Phrase_Avg_Rating'] #or '#_PIDs' or 'Review_Avg_Rating'
# Passed as `ascending=` to sort_values on the output table.
sort_in_ascending_order = False #or True

# Importing Packages

In [2]:
import pandas as pd
from keybert import KeyBERT
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
import warnings
warnings.filterwarnings("ignore")

C:\Users\kshit\anaconda3\envs\tf\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\kshit\anaconda3\envs\tf\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [3]:
# Load the input workbook, then keep the review texts as their own Series.
input_file = 'Sample for Generating Annotated Data (1).xlsx'
df = pd.read_excel(input_file)
reviews = df['Review']

# Phrase Extraction

In [4]:
kw_model = KeyBERT()

#pnt => phrase and topic
# One record per extracted phrase is collected in a plain list and the dataframe is
# built once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# dataframe inside a loop copies it on every iteration.
pnt_records = []

# Iterating over the values (instead of reviews[j]) avoids a label lookup that only
# works while the Series has the default 0..n-1 index.
for review in reviews:
    #kp => key phrases, a list of (phrase, score) pairs for this review
    kp = kw_model.extract_keywords(review,
                                   keyphrase_ngram_range=(2, max_no_of_words_per_phrase),
                                   top_n=no_of_phrases_per_review,
                                   stop_words=stop_words_in_phrases,
                                   use_mmr=True
                                   )
    for phrase, _score in kp:
        # The score is not needed downstream; Topic is filled in after topic extraction.
        pnt_records.append({'Review': review, 'phrase': phrase, 'Topic': ''})

# Dataframe storing every phrase together with its source review and (later) its topic.
pnt_df = pd.DataFrame(pnt_records, columns=['Review', 'phrase', 'Topic'])

# Topic Extraction

In [5]:
# Commented-out alternative: use the raw phrases as `docs`, without any cleaning.
# docs = list(pnt_df.loc[:, "phrase"].values)   #making a list of phrases
# Cleaning of phrases before topic extraction.
# English stop-word list from NLTK (presumably needs the 'stopwords' corpus to be
# downloaded beforehand - confirm).
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords module; from here on `stopwords` is the word list.
stopwords = nltk.corpus.stopwords.words('english')

def tokenization(text):
    """Split `text` into tokens with NLTK's WhitespaceTokenizer and return them."""
    return WhitespaceTokenizer().tokenize(text)

def remove_stopwords(text):
    """Return the tokens of `text`, in order, that are not in the global `stopwords`."""
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Clean every phrase: split it into tokens, drop the stop words, and join the remaining
# tokens back into one space-separated string. The whole pipeline is expressed as
# Series operations, so it does not rely on the index being 0..n-1 and does not
# overwrite elements one at a time in a Python loop.
docs = (
    pnt_df['phrase']
    .apply(tokenization)
    .apply(remove_stopwords)
    .apply(" ".join)
)

In [6]:
model = BERTopic(language="english")   #defining the model
topics, probs = model.fit_transform(docs)   #fitting the model on the cleaned phrases

#predicting the topic id associated with each phrase
predicted_topics, predicted_probs = model.transform(docs)

# Each topic is labelled with the first word of its first (word, score) entry.
# The label is looked up once per distinct topic id instead of once per phrase.
topic_label = {topic_id: model.get_topic(topic_id)[0][0] for topic_id in set(predicted_topics)}

# Assign the whole column at once. The element-wise form pnt_df['Topic'][i] = ... is
# chained indexing: pandas warns that it may write to a temporary copy, and with
# Copy-on-Write enabled it leaves pnt_df unchanged.
pnt_df['Topic'] = [topic_label[topic_id] for topic_id in predicted_topics]

In [9]:
# Show the fitted model's topic overview (Topic, Count and Name columns).
# NOTE(review): the execution count shows this cell was run after the output cells;
# it only reads `model`, so it also works in top-to-bottom order.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,255,-1_absorbs_use_face_feet
1,0,128,0_moisturizer_moisturizers_moisturizing_favorite
2,1,69,1_dry_patches_skin_sensitive
3,2,61,2_hydrating_hydration_hydrated_texture
4,3,47,3_use_used_ever_daily
5,4,46,4_product_reduce_worked_found
6,5,45,5_skin_bother_bc_saved
7,6,43,6_price_sale_recommend_black
8,7,42,7_told_said_could_say
9,8,39,8_acne_prone_oily_signs


# Output File Production

In [7]:
#df1: number of extracted phrases assigned to each topic
df1 = pnt_df.groupby('Topic')['phrase'].count().reset_index()
df1.columns = ['Topic','#_PIDs']   #naming of columns

#attaching the rating of the source review to every phrase (join of input dataframe and pnt_df)
# NOTE(review): the join key is the review text itself, so identical review texts in
# the input would multiply rows here - confirm that reviews are unique.
rated_phrases = pd.merge(df,pnt_df,how='inner',on='Review')
# Rating is stored as text; the part before the first space is the numeric value.
rated_phrases['Rating'] = rated_phrases['Rating'].str.split(' ',expand=True)[0].astype(int)

#df3: average rating per topic where every review counts once
# Rating is selected BEFORE aggregating: calling .mean() on the whole merged frame
# would also try to average the text column 'phrase', which raises TypeError on
# pandas >= 2.0 (older versions silently dropped such columns).
# First average within each (Topic, Review) pair, then across the reviews of a topic.
df3 = (
    rated_phrases.groupby(['Topic','Review'])['Rating'].mean()
    .groupby(level='Topic').mean()
    .reset_index()
)
df3.columns = ['Topic','Review_Avg_Rating']
df3['Review_Avg_Rating'] = df3['Review_Avg_Rating'].round(no_of_decimal_places_in_avg_rating)

#df2: average rating per topic where every phrase counts once
df2 = rated_phrases.groupby('Topic')['Rating'].mean().reset_index()
df2.columns = ['Topic','Phrase_Avg_Rating']   #naming of columns
df2['Phrase_Avg_Rating'] = df2['Phrase_Avg_Rating'].round(no_of_decimal_places_in_avg_rating)   #round decimal places in average rating

#Joining df1, df2 and df3 into the output dataframe df4
df4 = pd.merge(df1,df2,how='inner',on='Topic')
df4 = pd.merge(df4,df3,how='inner',on='Topic')
df4 = (
    df4.sort_values(by=column_to_sort_output,ascending=sort_in_ascending_order)
    .reset_index(drop=True)   #renumber rows 0..n-1 in sorted order
)
#Providing topic ids t1, t2, ... in sorted order
df4.insert(loc=0,column='Tid',value=['t'+str(rank) for rank in range(1,len(df4)+1)])

#df4.to_csv('output_file.csv',index=False)   #uncomment to export output file

In [8]:
# Display the final per-topic table (Tid, Topic, #_PIDs and both average ratings).
df4

Unnamed: 0,Tid,Topic,#_PIDs,Phrase_Avg_Rating,Review_Avg_Rating
0,t1,redness,23,4.87,4.75
1,t2,grail,11,4.73,4.57
2,t3,bottle,13,4.69,4.6
3,t4,love,23,4.65,4.63
4,t5,hydrating,74,4.64,4.51
5,t6,barrier,25,4.6,4.43
6,t7,winter,32,4.59,4.62
7,t8,staple,12,4.58,4.0
8,t9,eczema,23,4.57,4.0
9,t10,greasy,17,4.47,4.27
