Pre-processing and read in files


In [2]:
import pandas as pd
import os

In [3]:
workspace = r"D:\User\Documents\SMU CONTENT\Year 3 Sem 2\IS450\Project\Main\Exploration"
os.chdir(workspace)
path = r'TM_LDA_coherence200+.csv'
topicData = pd.read_csv(path)
df1 = pd.DataFrame(topicData)

In [4]:
columns_interest = ['ProductId', 'Original', 'Text', 'product_category', 'Automated_topic_id']

In [5]:
df2 = df1[columns_interest].copy()

In [6]:
df2.head()

Unnamed: 0,ProductId,Original,Text,product_category,Automated_topic_id
0,B000LKU03G,This is my family's favorite brand of wheat fr...,This is my family's favorite brand of wheat fr...,'Cakes',quality
1,B000LKU03G,This is my family's favorite brand of wheat fr...,"This brand is moist, tasty and closer to the t...",'Cakes',taste
2,B000LKU03G,This is my family's favorite brand of wheat fr...,Namaste products are the best in my opinion of...,'Cakes',quality
3,B000LKU03G,This is my family's favorite brand of wheat fr...,Try it,'Cakes',quality
4,B000LKU03G,This is my family's favorite brand of wheat fr...,You won't be disappointed,'Cakes',taste


Clean the product category column values, additional '' marks are include, we want to remove them

In [7]:
def clean_product_cat(category):
    """
    clean product category columns
    """    
    return category.rstrip("/'").lstrip(" '")

In [8]:
df2['product_category'] = df2['product_category'].apply(clean_product_cat)

In [25]:
def clean_text(text):
    return text.replace('<br />', '')

In [26]:
df2['Text'] = df2['Text'].apply(clean_text)

In [27]:
df2.head()

Unnamed: 0,ProductId,Original,Text,product_category,Automated_topic_id
0,B000LKU03G,This is my family's favorite brand of wheat fr...,This is my family's favorite brand of wheat fr...,Cakes,quality
1,B000LKU03G,This is my family's favorite brand of wheat fr...,"This brand is moist, tasty and closer to the t...",Cakes,taste
2,B000LKU03G,This is my family's favorite brand of wheat fr...,Namaste products are the best in my opinion of...,Cakes,quality
3,B000LKU03G,This is my family's favorite brand of wheat fr...,Try it,Cakes,quality
4,B000LKU03G,This is my family's favorite brand of wheat fr...,You won't be disappointed,Cakes,taste


Create a list of categories 

In [28]:
categoryLs = df2.groupby('product_category').count().sort_values(by = 'Text').index.tolist()

In [29]:
categoryLs[1]

'False Eyelashes & Adhesives'

Define our helper functions

In [30]:


import nltk
from nltk.tokenize import word_tokenize
import time

def create_df(data,product_cat = 'Canola'):
    """
    returns a new df from product_cat
    """
    df1 = data[data['product_category'] == product_cat].copy()
    return df1


def pre_process(topicDf):
    
    """
    given a topic df, will return a main set of hot terms, using pos-tagging
    """
    counter = 0 # count number of passed rows
    main_set = set()
    text_ls = list(topicDf['Text'].values)
    for text in text_ls: # for each comment
        try:
            tagsText = nltk.pos_tag(word_tokenize(text)) # tokenize first then postag
            main_set.update([x[0].lower() for x in tagsText if (x[1] == 'NN' and len(x[0])>1)]) # only keep track of the nouns and words with length 2   
        except:
            continue
    return main_set    


from collections import Counter


def applyValue(text, hotTopics):
    
    """
    submethod apply for main method text val
    """
    
    VALUE = 0
    
    text_token = word_tokenize(text)
    text_token_lower = [text.lower() for text in text_token]
    text_counter = Counter(text_token_lower)
    for k,v in text_counter.items():
        if k in hotTopics:
            VALUE += v
    return VALUE




def textVal(topicDf, hotTopics):
    
    """
    create 2 new df columns:
    1. value of text
    2. value/length ratio    
    """
    topicDf['Text'] = topicDf['Text'].apply(str)
    topicDf['textValue'] = topicDf['Text'].apply(applyValue, hotTopics = hotTopics)
    topicDf['textValue/length'] = topicDf['textValue']/topicDf['Text'].str.len() # vectorize ratio
    
    return topicDf


def summarizerOpt(topicDf, word_limit = 100):
    """
    using a greedy approach, returns the generated summary
    """
    
    # sort df by col textValue/len
    if len(topicDf) < 2:
        text = topicDf['Text'].values[0]   # if only one comment then return the original comment
    else:
        sortedTopicDf = topicDf.sort_values(by = ['textValue/length'], ascending = False).copy()
        resultLs = []
        for i in range(len(sortedTopicDf)):
            if len(sortedTopicDf.iloc[i]['Text'].split()) > word_limit or 'nan' in sortedTopicDf.iloc[i]['Text']:
                continue
            else:
                resultLs += [i]
                word_limit -= len(sortedTopicDf.iloc[i]['Text'].split())
        text = ''
        for i in sorted(resultLs):   # rank the sentence in the original rank and then form the summary
            text += sortedTopicDf['Text'].values[i] + '. '   # join the sentences
        if len(text) == 0:
            text = sortedTopicDf.iloc[0]['Text'] # if greedy finds none return the first text
    return text

Def Main

In [31]:
def main_summarizer_opt(data, categoryLs):
    """
    using submethods, will generate summaries for each topic and will
    return a dataframe with column {categoryID, topic, summary, originalText}
    """
    
    summary_df = pd.DataFrame(columns = ['categoryID', 'topic', 'summary', 'originalText'])
    for i in range(len(categoryLs)): 
        print("Currently summarising ", categoryLs[i])
        try:
            if i%20 == 0: #sleep after every 20 topics
                time.sleep(1) #pause for 10 seconds
            if i > 180:
                time.sleep(1)
            tempDf = create_df(data, categoryLs[i])
            tempTopicLs = list(tempDf['Automated_topic_id'].unique())
            for j in range(len(tempTopicLs)):
                tempTopicDf = tempDf[tempDf['Automated_topic_id'] == tempTopicLs[j]].copy() # create df with just a single topic
                tempTextLs = tempTopicDf['Text'].apply(str).tolist()
                combined_text = '. '.join(tempTextLs)
                tempHotTopics = pre_process(tempTopicDf)
                summary_out = summarizerOpt(textVal(tempTopicDf, tempHotTopics))
                summary_df = summary_df.append({'categoryID':categoryLs[i],'topic':tempTopicLs[j], 'summary':summary_out, 'originalText':combined_text}, ignore_index=True)
        except:
            print("Exception occured at category", categoryLs[i])
            continue
    return summary_df

In [33]:
summary_out = main_summarizer_opt(df2, categoryLs)

Currently summarising  Sunflowers
Currently summarising  False Eyelashes & Adhesives
Currently summarising  Pastry Shells & Crusts
Currently summarising  Basic Collars
Currently summarising  Hot Dogs & Franks
Currently summarising  Indoor Gardening & Hydroponics
Currently summarising  Cranberry
Currently summarising  Pasta & Sauces
Currently summarising  Shiitake
Currently summarising  Shortening
Currently summarising  Cayenne
Currently summarising  Vines
Currently summarising  Breakfast Foods
Currently summarising  Wrapping & Packaging
Currently summarising  Cloves
Currently summarising  Coffee Cups & Mugs
Currently summarising  Apple Cider
Currently summarising  Deli-Sliced Meats
Currently summarising  Processed
Currently summarising  Lilies
Currently summarising  Stuffing Side Dishes
Currently summarising  Apricots
Currently summarising  Pappardelle
Currently summarising  Allspice
Currently summarising  Sirloin
Currently summarising  Eggs & Egg Substitutes
Currently summarising  Ice

Looking at an example summary

In [45]:
print('Summary: ',summary_out['summary'].iloc[34], end='\n\n')
print("Original Text: ",summary_out['originalText'].iloc[34])


Summary:  :(From the time I was a small child, my mother was an advocate of apple cider vinegar and honey for health. This apple cider vinegar is great for incorporation into one's beauty regimen or household use. There is a 5-digit code on top of the cap. Works wonders for dry itchy scalp. I don't believe I've ever used apple cider vinegar in food before, but it's great for hair. Sometimes I like it in water, a tsp/16oz. The mixture was done perfectly, No stinging taste, it's sort of mellow, yet has that flavor of vinegar you are looking for. 

Original Text:  So much better for the hot summer months than sugary lemonade. :(From the time I was a small child, my mother was an advocate of apple cider vinegar and honey for health. I would best describe the effects as very close to the good feeling you experience when you drink coffee. Let me preface this by saying I purchased it at a local health food store for about a dollar less. The mixture was done perfectly, No stinging taste, it's 

Write to file

In [47]:
summary_out.to_csv('POSopt_summary.csv')