Load data from the LDA model 

In [3]:
import pandas as pd
import os
from gensim.summarization.summarizer import summarize

In [5]:
workspace = r"D:\User\Documents\SMU CONTENT\Year 3 Sem 2\IS450\Project\Main\Exploration"
os.chdir(workspace)
path = r'TM_LDA_coherence200+.csv'
topicData = pd.read_csv(path)

df1 = pd.DataFrame(topicData)

Remove first column as duplicate

In [6]:
df2 = df1[df1.columns[1:]].copy()

In [7]:
columns_interest = ['ProductId', 'Original', 'Text', 'product_category', 'Automated_topic_id']

In [8]:
df3 = df2[columns_interest].copy()

In [9]:
df3.head()

Unnamed: 0,ProductId,Original,Text,product_category,Automated_topic_id
0,B000LKU03G,This is my family's favorite brand of wheat fr...,This is my family's favorite brand of wheat fr...,'Cakes',quality
1,B000LKU03G,This is my family's favorite brand of wheat fr...,"This brand is moist, tasty and closer to the t...",'Cakes',taste
2,B000LKU03G,This is my family's favorite brand of wheat fr...,Namaste products are the best in my opinion of...,'Cakes',quality
3,B000LKU03G,This is my family's favorite brand of wheat fr...,Try it,'Cakes',quality
4,B000LKU03G,This is my family's favorite brand of wheat fr...,You won't be disappointed,'Cakes',taste


Clean the product category column values, additional '' marks are include, we want to remove them

In [10]:
def clean_product_cat(category):
    """
    clean product category columns
    """    
    return category.rstrip("/'").lstrip(" '")

In [12]:
df3['product_category'] = df3['product_category'].apply(clean_product_cat)

Create a list of categories 

In [13]:
categoryLs = df3.groupby('product_category').count().sort_values(by = 'Text').index.tolist()

In [15]:
categoryLs[1]

'False Eyelashes & Adhesives'

Two helper functions to help us to first create individual dataframes for each product category, and then to create individual topic dataframes from each of these dataframes to generate aspect-based summaries 

In [16]:
def create_df(data,product_cat = 'False Eyelashes & Adhesives'):
    """
    returns a new df from product_cat
    """
    df1 = data[data['product_category'] == product_cat].copy()
    return df1


def summarizerMod(data, wordCount=100):
    
    """
    data : input dataframe from create_df function
    input dataframe by category, returns a list of summaries for each 
    topic. outputs all the summary for each given topic
    """
    summary_results = []
    topic_ls = list(data['Automated_topic_id'].unique())
    combined_text = ''
    for i in range(len(topic_ls)):
            temp_df = data[data['Automated_topic_id'] == topic_ls[i]].copy()
            temp_df = temp_df['Text'].apply(str).apply(lambda x: x.capitalize()).apply(lambda x: x.replace('<br />','')).copy().unique() # add in lambda to replace the br statements
            temp_ls = temp_df.tolist()
            combined_text = '. '.join(temp_ls) # must have additional space if not summarizer cannot tokenize
            try:
                summary_out = summarize(combined_text+'.', word_count = wordCount, split = True)
                summary_out = ''.join(summary_out)
                summary_results.append((topic_ls[i],summary_out,combined_text, True)) #take only the first 4
            except ValueError:
                summary_results.append((topic_ls[i],combined_text, combined_text, False))
    return summary_results

Create empty df to store our summaries. We will output a category id, a topic, a summary as well as the original text.

In [17]:
finalDf = pd.DataFrame(columns = ['categoryID', 'topic', 'summary', 'originalText','summary_generated'])

### Run Main

In [19]:
pathOut = "gensim_out200+.csv"
for i in range(len(categoryLs)):
    tempdf = create_df(df3, categoryLs[i])
    print(f"\r Currently summarising {categoryLs[i]} !", end = "")
    summaries = summarizerMod(tempdf,wordCount=100)
    for j in range(len(summaries)):
        try:
            finalDf = finalDf.append({'categoryID' : categoryLs[i],'topic' : summaries[j][0], 'summary' : summaries[j][1],'originalText':summaries[j][2],'summary_generated':summaries[j][3]}, ignore_index=True)
            finalDf.to_csv(pathOut)
        except:
            finalDf.to_csv(pathOut)
            print("caught exception at ", "j = ", j, "i = ", i)
            continue 
print('')
print('completed!')

 Currently summarising Cocoa !Butter !se !Bars !!es !ories !
completed!


In [20]:
finalDf

Unnamed: 0,categoryID,topic,summary,originalText,summary_generated
0,Sunflowers,Others,"When the flowers showed up, they were in prett...","When the flowers showed up, they were in prett...",True
1,False Eyelashes & Adhesives,taste,These truffles melt slowly in your mouth with ...,These truffles melt slowly in your mouth with ...,True
2,Pastry Shells & Crusts,Others,Was quite pleased with the product.Would buy a...,Was quite pleased with the product. Would buy ...,True
3,Pastry Shells & Crusts,quality,,Arrived on time and wrapped well. The service ...,True
4,Basic Collars,taste,I have a pit bull who is a little over a year ...,I have a pit bull who is a little over a year ...,True
...,...,...,...,...,...
443,Snacks,taste,"One of my six cats is super finicky, he likes ...","In the future, i'd be more apt to open up a 69...",True
444,Fruit & Nut,taste,I love these bars as they are really filling a...,I would have liked to give these 2 and a half ...,True
445,Peanut Butter,taste,I use pb2 for my protein shakes and oh man doe...,"It's got the typical jif taste, which to me is...",True
446,Peanut Butter,quality,Things i love about this product:it's organici...,Good for dipping pretzels or celery/other vegg...,True
