In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import spacy

nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load("en_core_web_sm")

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.chunk import ne_chunk
from nltk.tokenize import RegexpTokenizer

stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from sklearn.decomposition import LatentDirichletAllocation as LDA
from spacy.matcher import Matcher

[nltk_data] Downloading package punkt to /Users/leon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/leon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/leon/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/leon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloadin

In [2]:
files = ['Data/Customer_review_data/Apex AD2600 Progressive-scan DVD player.txt',
         'Data/Customer_review_data/Canon G3.txt',
         'Data/Customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt',
         'Data/Customer_review_data/Nikon coolpix 4300.txt',
         'Data/Customer_review_data/Nokia 6610.txt',
         'Data/CustomerReviews-3_domains/Computer.txt',
         'Data/CustomerReviews-3_domains/Router.txt',
         'Data/CustomerReviews-3_domains/Speaker.txt',
         'Data/Reviews-9-products/Canon PowerShot SD500.txt',
         'Data/Reviews-9-products/Canon S100.txt',
         'Data/Reviews-9-products/Diaper Champ.txt',
         'Data/Reviews-9-products/Hitachi router.txt',
         'Data/Reviews-9-products/ipod.txt',
         'Data/Reviews-9-products/Linksys Router.txt',
         'Data/Reviews-9-products/MicroMP3.txt',
         'Data/Reviews-9-products/Nokia 6600.txt',
         'Data/Reviews-9-products/norton.txt']

In [49]:
def pre_process_review(reviews):

    processed_reviews = []
    title_switch = False
    title = ''

    for review in reviews:
        if review.startswith('[t]'):
            title = review[3:]
            title_switch = True
        elif title_switch:
            appended_review = review + title 
            processed_reviews.append(appended_review)
            title_switch = False
            title = ''
        else:
            processed_reviews.append(review)

    return processed_reviews
    


def read_file(file_path):
    tagged_reviews = []
    
    with open(file_path, 'r') as file:
        text = file.read()
        reviews = text.strip().split('\n')

        if reviews[0] == '*' * 77:
            reviews = reviews[11:]

        reviews = pre_process_review(reviews)
        
        for review in reviews:
            parts = review.split('##')
            
            if len(parts) > 1:
                tags = parts[0].strip().split(',')
                content = parts[1].strip() 
            else:
                tags = []
                content = parts
                
            tagged_reviews.append({'Tags': tags, 'Review': content})

        df = pd.DataFrame(tagged_reviews)
        df.attrs['title'] = file_path.split('/')[-1]

        return df
            

df = read_file(files[1])

In [50]:
df['Tokenised_Review'] = df['Review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x) 
df['Tokenised_Review'] = df['Tokenised_Review'].apply(lambda review: word_tokenize(review))
df['Filtered_Review'] = df['Tokenised_Review'].apply(lambda tokens: [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words])
df['Filtered_Review_String'] = df['Filtered_Review'].apply(lambda tokens: ' '.join(tokens))
df['Lemmatised_Review_String'] = df['Filtered_Review_String'].apply(lambda review_string: " ".join([token.lemma_ for token in nlp(review_string)]))
df['Lemmatised_Tokenised_Filtered_Review'] = df['Filtered_Review_String'].apply(lambda review: word_tokenize(review))
df

Unnamed: 0,Tags,Review,Tokenised_Review,Filtered_Review,Filtered_Review_String,Lemmatised_Review_String,Lemmatised_Tokenised_Filtered_Review
0,[canon powershot g3[+3]],i recently purchased the canon powershot g3 an...,"[i, recently, purchased, the, canon, powershot...","[recently, purchased, canon, powershot, extrem...",recently purchased canon powershot extremely s...,recently purchase canon powershot extremely sa...,"[recently, purchased, canon, powershot, extrem..."
1,[use[+2]],"the camera is very easy to use , in fact on a ...","[the, camera, is, very, easy, to, use, ,, in, ...","[camera, easy, use, fact, recent, trip, past, ...",camera easy use fact recent trip past week ask...,camera easy use fact recent trip past week ask...,"[camera, easy, use, fact, recent, trip, past, ..."
2,[],"after i took their picture with their camera ,...","[after, i, took, their, picture, with, their, ...","[took, picture, camera, offered, take, picture...",took picture camera offered take picture us,take picture camera offer take picture we,"[took, picture, camera, offered, take, picture..."
3,[],"i just told them , press halfway , wait for th...","[i, just, told, them, ,, press, halfway, ,, wa...","[told, press, halfway, wait, box, turn, green,...",told press halfway wait box turn green press r...,tell press halfway wait box turn green press r...,"[told, press, halfway, wait, box, turn, green,..."
4,[picture[+2]],they fired away and the picture turned out qui...,"[they, fired, away, and, the, picture, turned,...","[fired, away, picture, turned, quite, nicely, ...",fired away picture turned quite nicely picture...,fire away picture turn quite nicely picture th...,"[fired, away, picture, turned, quite, nicely, ..."
...,...,...,...,...,...,...,...
592,[camera[+3]],"even with these shortcomings , i still think i...","[even, with, these, shortcomings, ,, i, still,...","[even, shortcomings, still, think, best, digit...",even shortcomings still think best digital cam...,even shortcoming still think good digital came...,"[even, shortcomings, still, think, best, digit..."
593,[camera[+3]],definetely a great camera . great camera but g...,"[definetely, a, great, camera, ., great, camer...","[definetely, great, camera, great, camera, less]",definetely great camera great camera less,definetely great camera great camera less,"[definetely, great, camera, great, camera, less]"
594,"[quality[+2], lens[+2]]",proven canon built quality and lens .,"[proven, canon, built, quality, and, lens, .]","[proven, canon, built, quality, lens]",proven canon built quality lens,prove canon build quality len,"[proven, canon, built, quality, lens]"
595,[feel[+2]],feels solid in hand .,"[feels, solid, in, hand, .]","[feels, solid, hand]",feels solid hand,feel solid hand,"[feels, solid, hand]"


In [41]:
all_words = [word for review in df['Filtered_Review'] for word in review]
freq_dist = FreqDist(all_words)

def display_freq_dist(freq_dist):
    top_items = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)[:20]
    words, frequencies = zip(*top_items)
    plt.figure(figsize=(6, 3))  
    plt.bar(words, frequencies, color='skyblue')  
    plt.xlabel('Words') 
    plt.ylabel('Frequency') 
    plt.title('Top Words Frequency Distribution')  
    plt.xticks(rotation=45) 
    plt.show()

# display_freq_dist(freq_dist)

In [99]:
def k_means(string_list):

    tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.85, min_df=2)
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_list)
    
    km = KMeans(n_clusters=5, n_init=10)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf_vectorizer.get_feature_names_out()
    
    for i in range(num_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :10]]  # Get top 10 terms for each cluster
        print(f"Cluster {i}: {top_terms}")
    
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    # Get the cluster labels for each data point
    cluster_labels = km.labels_
    
    plt.figure(figsize=(8, 4))  # Set figure size
    
    # Scatter plot of the reduced data, colored by cluster labels
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.6)
    
    # Adding labels for axes
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    
    # Title of the plot
    plt.title('2D Visualization of K-Means Clusters')
    
    # Display the plot
    print('\n')
    plt.show()


# k_means(df['Lemmatised_Review_String'])

In [100]:
def POS_Noun_Tagging(string_list):
    
    reviews = string_list.tolist()
    features = []
    
    for review in reviews:
        tokens = word_tokenize(review)
        tagged = pos_tag(tokens)
        # Extracts nouns from POS tagged text as nouns likely features names
        features.extend([word.lower() for word, tag in tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']])
    
    feature_counts = Counter(features)
    common_features = feature_counts.most_common(15)
    print(common_features)

# POS_Noun_Tagging(df['Lemmatised_Review_String'])

In [101]:
def Noun_Phrase_Chuncking(string_list):

    all_noun_phrases = []
    
    for review in string_list:
        doc = nlp(review)
        noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
        all_noun_phrases.extend(noun_phrases)
    
    # Count the occurrences of each noun phrase
    from collections import Counter
    phrase_counts = Counter(all_noun_phrases)
    
    # Display most common noun phrases
    common_phrases = phrase_counts.most_common(15)
    print(common_phrases)
    
# Noun_Phrase_Chuncking(df['Lemmatised_Review_String'])

In [102]:
def LDA_Model(tokenised_reviews):
    
    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(tokenised_reviews)
    
    # Convert dictionary to a bag of words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    # Number of topics
    num_topics = 5
    
    # Generate LDA model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100, update_every=1, passes=10, alpha='auto')
    
    # Print the topics
    topics = lda_model.print_topics(num_words=5)
    for topic in topics:
        print(topic)
        

# LDA_Model(df['Lemmatised_Tokenised_Filtered_Review'])

In [103]:
def LDA_Model_2(string_reviews):
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_reviews)
    
    num_topics = 5
    lda = LDA(n_components=num_topics)
    lda.fit_transform(tfidf_matrix)
    
    # Explore the topics
    terms = tfidf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        print(f"Topic #{topic_idx+1}:")
        print(" ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
        print('\n')

# LDA_Model_2(df['Lemmatised_Review_String'])

In [111]:
def POS_Chuck_Parser(review_data):

    review = review_data['Review']
    tags = review_data['Tags']
    doc = nlp(review)
    
    # Initialize a list to hold our extracted phrases
    extracted_phrases = []
    
    # Iterate over tokens in the doc
    for token in doc:
        # Look for an adverb modifying an adjective and check the adjective doesn't have a noun child
        if token.pos_ == "ADV" and token.head.pos_ == "ADJ":
            is_adj_modified = False
            for child in token.head.children:
                if child.dep_ in ["attr", "dobj", "pobj"]:  # The adjective is modifying a noun
                    is_adj_modified = True
                    break
            if not is_adj_modified:
                # Capture the adverb-adjective pair "rather heavy"
                extracted_phrases.append(token.text + " " + token.head.text)
    
        # Look for an adjective modifying a noun and check if it's in a prepositional phrase
        if token.pos_ == "ADJ" and token.head.pos_ in ["NOUN", "PROPN"]:
            is_in_prep_phrase = False
            for ancestor in token.head.ancestors:
                if ancestor.dep_ == "prep":
                    is_in_prep_phrase = True
                    break
            if not is_in_prep_phrase:
                # Capture the adjective-noun pair "great camera"
                extracted_phrases.append(token.text + " " + token.head.text)

    print(tags)
    print(review)
    print(extracted_phrases)

    return review, extracted_phrases


# 641/118/117/115/110/107
# review, extracted_phrases = POS_Chuck_Parser(df.iloc[129])

In [112]:
def get_sentiment(word):
    synsets = list(swn.senti_synsets(word))
    if synsets:
        return synsets[0].pos_score(), synsets[0].neg_score()
    else:
        return 0, 0

# get_sentiment('easy')

In [113]:
def disambiguate_word_sense(sentence, word):
    # Use Lesk algorithm for WSD
    sense = lesk(nltk.word_tokenize(sentence), word)
    if not sense:
        return None
    
    # Get sentiment scores
    senti_synset = swn.senti_synset(sense.name())
    return {
        'word': word,
        'synset_name': sense.name(),
        'definition': sense.definition(),
        'examples': sense.examples(),
        'positivity_score': senti_synset.pos_score(),
        'negativity_score': senti_synset.neg_score(),
        'objectivity_score': senti_synset.obj_score()
    }


review = df.iloc[641]

for word in review['Filtered_Review']:
    disambiguated_sense = disambiguate_word_sense(review['Review'], word)
    # print(disambiguated_sense)
    # print('\n')

In [114]:
def preprocess_and_ner(tokens):
    
    tagged = pos_tag(tokens)
    named_entities = ne_chunk(tagged)
    return named_entities


tokenised_text = df['Filtered_Review'].iloc[106]
named_entities = preprocess_and_ner(tokenised_text)
# print(named_entities)

In [115]:
def tf_idf(reviews):
    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Transform the reviews into a TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)
    
    # Extract the feature names/terms from the TF-IDF Vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Calculate the average TF-IDF score for each term across all documents
    scores = tfidf_matrix.mean(axis=0)
    term_scores = {feature_names[col]: scores[0, col] for col in range(scores.shape[1])}
    
    # Sort the terms by their average TF-IDF score in descending order
    sorted_term_scores = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Optionally: Display the top 10 terms with the highest average TF-IDF scores
    print("Top 15 terms by average TF-IDF score:")
    for term, score in sorted_term_scores[:15]:
        print(f"Term: {term}, Score: {round(score, 4)}")
    
    # Calculate cosine similarity among the documents using the TF-IDF matrix
    cos_sim_matrix = cosine_similarity(tfidf_matrix)
    
    # return tfidf_matrix, feature_names, cos_sim_matrix

# tf_idf(df['Filtered_Review_String'])

In [136]:

matcher = Matcher(nlp.vocab)

# Define custom patterns
patterns = [
    [{"POS": "ADJ"}, {"POS": "NOUN", "OP": "+"}],  # Adjective followed by one or more nouns
    [{"POS": "NOUN", "OP": "+"}, {"LOWER": "mode"}]  # One or more nouns followed by "mode"
]
matcher.add("CUSTOM_PATTERNS", patterns)


def POS_Chuck_Parser_Matcher(review):
    doc = nlp(review)
    matches = matcher(doc)

    extracted_phrases = []
    for match_id, start, end in matches:
        span = doc[start:end]
        extracted_phrases.append(span.text)

    print(extracted_phrases)


index = 109
print(df['Tags'].iloc[index])
print(df['Review'].iloc[index])
print('\n')
POS_Chuck_Parser_Matcher(df['Review'].iloc[index])
POS_Chuck_Parser_Matcher(df['Filtered_Review_String'].iloc[index])
POS_Chuck_Parser_Matcher(df['Lemmatised_Review_String'].iloc[index])

['camera[+2]']
this is a great camera for you !


['great camera']
['great camera']
['great camera']
