In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import spacy
import re
import warnings

warnings.filterwarnings('ignore', message='Discarded redundant search for Synset')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load("en_core_web_sm")

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
Word2Vec_corpus = api.load('text8') 
Word2Vec_model = Word2Vec(Word2Vec_corpus) 
glove_model = api.load("glove-twitter-25") 

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.chunk import ne_chunk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.utils import deaccent
from sklearn.decomposition import LatentDirichletAllocation as LDA
from spacy.matcher import Matcher

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/leon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/leon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/leon/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/leon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloadin



In [None]:
files = ['Data/Customer_review_data/Apex AD2600 Progressive-scan DVD player.txt',
         'Data/Customer_review_data/Canon G3.txt',
         'Data/Customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt',
         'Data/Customer_review_data/Nikon coolpix 4300.txt',
         'Data/Customer_review_data/Nokia 6610.txt',
         'Data/CustomerReviews-3_domains/Computer.txt',
         'Data/CustomerReviews-3_domains/Router.txt',
         'Data/CustomerReviews-3_domains/Speaker.txt',
         'Data/Reviews-9-products/Canon PowerShot SD500.txt',
         'Data/Reviews-9-products/Canon S100.txt',
         'Data/Reviews-9-products/Diaper Champ.txt',
         'Data/Reviews-9-products/Hitachi router.txt',
         'Data/Reviews-9-products/ipod.txt',
         'Data/Reviews-9-products/Linksys Router.txt',
         'Data/Reviews-9-products/MicroMP3.txt',
         'Data/Reviews-9-products/Nokia 6600.txt',
         'Data/Reviews-9-products/norton.txt']

In [None]:
def pre_process_review(reviews):

    processed_reviews = []
    title_switch = False
    title = ''

    for review in reviews:
        if review.startswith('[t]'):
            title = review[3:]
            title_switch = True
        elif title_switch:
            appended_review = review + title 
            processed_reviews.append(appended_review)
            title_switch = False
            title = ''
        else:
            processed_reviews.append(review)

    return processed_reviews



def preserve_compound_phrases(text):
    doc = nlp(text)
    processed_tokens = []
    
    for token in doc:
        if token.dep_ in ('compound', 'amod') and token.head.pos_ == 'NOUN':
            compound_phrase = token.text + "_" + token.head.text
            if compound_phrase not in processed_tokens:
                processed_tokens.append(compound_phrase)
        elif token.pos_ == 'NOUN' and any(child.dep_ == 'compound' for child in token.children):
            continue
        else:
            processed_tokens.append(token.text)
    
    return processed_tokens
    


def read_file(file_path):
    tagged_reviews = []
    
    with open(file_path, 'r') as file:
        text = file.read()
        reviews = text.strip().split('\n')

        if reviews[0] == '*' * 77:
            reviews = reviews[11:]

        reviews = pre_process_review(reviews)
        
        for review in reviews:
            parts = review.split('##')
            
            if len(parts) > 1:
                tags = parts[0].strip().split(',')
                content = parts[1].strip() 
            else:
                tags = []
                content = parts
                
            tagged_reviews.append({'Tags': tags, 'Review': content})

        df = pd.DataFrame(tagged_reviews)
        df.attrs['title'] = file_path.split('/')[-1]

        return df
        

df = read_file(files[0])

In [None]:
df['Tokenised_Review'] = df['Review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x) 
df['Tokenised_Review'] = df['Tokenised_Review'].apply(lambda review: preserve_compound_phrases(review))
df['Filtered_Review'] = df['Tokenised_Review'].apply(lambda tokens: [token.lower() for token in tokens if ("_" in token) or (token.isalpha() and token.lower() not in stop_words)])
df['Filtered_Review_String'] = df['Filtered_Review'].apply(lambda tokens: ' '.join(tokens))
df['Lemmatised_Review_String'] = df['Filtered_Review_String'].apply(lambda review_string: " ".join([token.lemma_ for token in nlp(review_string)]))
df['Lemmatised_Tokenised_Filtered_Review'] = df['Filtered_Review_String'].apply(lambda review: word_tokenize(review))
df['Stemmed_Review'] = df['Lemmatised_Tokenised_Filtered_Review'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
df['Stemmed_Review_String'] = df['Stemmed_Review'].apply(lambda tokens: ' '.join(tokens))
# df.head(10)

In [None]:
all_words = [word for review in df['Filtered_Review'] for word in review]
freq_dist = FreqDist(all_words)

def display_freq_dist(freq_dist):
    top_items = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)[:20]
    words, frequencies = zip(*top_items)
    plt.figure(figsize=(6, 3))  
    plt.bar(words, frequencies, color='skyblue')  
    plt.xlabel('Words') 
    plt.ylabel('Frequency') 
    plt.title('Top Words Frequency Distribution')  
    plt.xticks(rotation=45) 
    plt.show()

# display_freq_dist(freq_dist)

In [None]:
def k_means(string_list):

    num_clusters = 3

    tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.85, min_df=2)
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_list)
    
    km = KMeans(n_clusters=num_clusters, n_init=10)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf_vectorizer.get_feature_names_out()
    
    for i in range(num_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :10]]  # Get top 10 terms for each cluster
        print(f"Cluster {i}: {top_terms}")
    
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())
    
    # Get the cluster labels for each data point
    cluster_labels = km.labels_
    
    plt.figure(figsize=(8, 4))  # Set figure size
    
    # Scatter plot of the reduced data, colored by cluster labels
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.6)
    
    # Adding labels for axes
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    
    # Title of the plot
    plt.title('2D Visualization of K-Means Clusters')
    
    # Display the plot
    print('\n')
    plt.show()


# k_means(df['Filtered_Review_String'])
k_means(df['Lemmatised_Review_String'])
# k_means(df['Stemmed_Review_String'])

In [None]:
def POS_Noun_Tagging(string_list):
    
    reviews = string_list.tolist()
    features = []
    
    for review in reviews:
        tokens = word_tokenize(review)
        tagged = pos_tag(tokens)
        # Extracts nouns from POS tagged text as nouns likely features names
        features.extend([word.lower() for word, tag in tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']])
    
    feature_counts = Counter(features)
    common_features = feature_counts.most_common(15)
    print(common_features)



# POS_Noun_Tagging(df['Filtered_Review_String'])
# print('\n')
POS_Noun_Tagging(df['Lemmatised_Review_String'])
# print('\n')
# POS_Noun_Tagging(df['Stemmed_Review_String'])

In [None]:
def Noun_Phrase_Chuncking(string_list):

    all_noun_phrases = []
    
    for review in string_list:
        doc = nlp(review)
        noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
        all_noun_phrases.extend(noun_phrases)
    
    # Count the occurrences of each noun phrase
    from collections import Counter
    phrase_counts = Counter(all_noun_phrases).most_common(50)
    
    # Display most common noun phrases
    common_phrases = phrase_counts
    print(common_phrases)
    
    return common_phrases
    



# Noun_Phrase_Chuncking(df['Filtered_Review_String'])
# print('\n')
noun_chucked_phrases = Noun_Phrase_Chuncking(df['Lemmatised_Review_String'])
# print('\n')
# Noun_Phrase_Chuncking(df['Stemmed_Review_String'])

In [None]:
def is_concrete_noun(word):
    """Enhanced check if a noun is concrete based on its hypernyms and usage context."""
    concrete_clues = {'object', 'artifact', 'instrumentality', 'container', 'device'}
    synsets = wn.synsets(word, pos=wn.NOUN)
    for synset in synsets:
        for hyper in synset.closure(lambda s: s.hypernyms()):
            if concrete_clues.intersection(set(hyper.lemma_names())):
                return True
    return False


def context_based_filter(noun_tuples):
    """Filter nouns based on enhanced concrete checks and contextual usage."""
    filtered_features = []
    for noun, count in noun_tuples:
        if is_concrete_noun(noun):
            filtered_features.append((noun, count))
    return filtered_features



potential_nouns = context_based_filter(noun_chucked_phrases)
potential_nouns

In [None]:
Word2Vec_words = Word2Vec_model.wv.most_similar('dvds')
glove_words = glove_model.wv.most_similar('dvds')

print(Word2Vec_words)
print('\n')
print(glove_words)

In [None]:

def semantic_similarity_check(noun, model, seed_features):
    noun_vector = average_feature_vector([noun], model, model.vector_size)
    
    # Check if noun_vector is all zeros which indicates noun was not in the model's vocabulary
    if np.all(noun_vector == 0):
        return False
    
    max_similarity = 0
    for feature in seed_features:
        feature_vector = average_feature_vector([feature], model, model.vector_size)
        
        # Check if feature_vector is all zeros
        if np.all(feature_vector == 0):
            continue
        
        # Compute cosine similarity, safely handling division by zero
        denominator = (np.linalg.norm(noun_vector) * np.linalg.norm(feature_vector))
        if denominator == 0:
            continue  # Avoid division by zero
        
        similarity = np.dot(noun_vector, feature_vector) / denominator
        max_similarity = max(max_similarity, similarity)
    
    return max_similarity > 0.1  # Adjust threshold as necessary


# Example: dynamically selecting seed features that are present in the model
model_vocab = set(model.wv.index_to_key)
seed_features = [feature for feature in ['player', 'dvd', 'button'] if feature in model_vocab]

# Now proceed with your filtered nouns calculation
filtered_nouns = [noun for noun in potential_nouns if semantic_similarity_check(noun, Word2Vec_model, seed_features)]

filtered_nouns

In [None]:
def LDA_Model(tokenised_reviews):
    
    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(tokenised_reviews)
    
    # Convert dictionary to a bag of words corpus
    corpus = [dictionary.doc2bow(text) for text in tokenised_reviews]
    
    # Number of topics
    num_topics = 5
    
    # Generate LDA model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100, update_every=1, passes=10, alpha='auto')
    
    # Print the topics
    topics = lda_model.print_topics(num_words=5)
    for topic in topics:
        print(topic)
        

# LDA_Model(df['Stemmed_Review'])
# print('\n')
# LDA_Model(df['Filtered_Review'])
# print('\n')
# LDA_Model(df['Lemmatised_Tokenised_Filtered_Review'])

In [175]:
def LDA_Model_2(string_reviews):
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_reviews)
    
    num_topics = 5
    lda = LDA(n_components=num_topics)
    lda.fit_transform(tfidf_matrix)
    
    # Explore the topics
    terms = tfidf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        print(f"Topic #{topic_idx+1}:")
        print(" ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
        print('\n')

# LDA_Model_2(df['Lemmatised_Review_String'])

In [125]:
"""
Getting the sentiment of multiword phrases using SentiWordNet (SWN) through the NLTK library requires a more nuanced approach compared to single words because SentiWordNet does not directly provide sentiment scores for phrases. Instead, you can average the sentiment scores of individual words in the phrase, adjust the methodology to consider the phrase's context, or use compound term lookup techniques where applicable. Here’s an approach to approximate sentiment for multiword phrases:

### Step 1: Handle Compound Terms

Some compound terms might be recognized by SentiWordNet if connected by underscores (e.g., "well_known"). Before splitting the phrase into individual words, check if the compound term has an entry in SentiWordNet.

### Step 2: Average Sentiment Scores of Individual Words

For phrases not recognized as compound terms, calculate the average sentiment scores of the individual words. Consideration of part-of-speech tags can enhance accuracy, but for simplicity, this example ignores POS tags.

### Note:

- This method assumes equal weighting of words in calculating the average sentiment, which might not reflect the actual sentiment conveyed by the phrase.
- Handling negations and intensifiers (e.g., "not" in "not good", "very" in "very good") requires more sophisticated logic, as they can significantly alter the sentiment.
- Advanced models designed for sentiment analysis at the sentence or document level (like BERT-based models) may provide more accurate sentiment assessments for phrases and sentences by considering the broader context.

This approach gives a basic approximation but is limited by the nuances of natural language. For more accurate sentiment analysis on phrases or sentences, consider using pre-trained sentiment analysis models or services.
"""


def get_word_sentiment(word):
    synsets = list(swn.senti_synsets(word))
    if synsets:
        return synsets[0].pos_score(), synsets[0].neg_score()
    else:
        return 0, 0


def get_phrase_sentiment(phrase):
    # Try the phrase directly (useful for compound terms recognized by SWN)
    pos_score, neg_score = get_word_sentiment(phrase.replace(" ", "_"))
    if pos_score or neg_score:
        return pos_score, neg_score
    
    # Split the phrase into individual words and average their sentiment scores
    words = phrase.split()
    total_pos, total_neg = 0, 0
    for word in words:
        pos, neg = get_word_sentiment(word)
        total_pos += pos
        total_neg += neg
    avg_pos = total_pos / len(words) if words else 0
    avg_neg = total_neg / len(words) if words else 0
    
    return avg_pos, avg_neg

# Example usage
phrase = 'not very well known'
pos_score, neg_score = get_phrase_sentiment(phrase)
print(f"Phrase: '{phrase}', Pos score: {pos_score}, Neg score: {neg_score}")



"""
This version attempts to address negations directly before a word and could be extended to consider intensifiers (like "very") by further
modifying the sentiment scores. For even more nuanced sentiment analysis, exploring deep learning models trained specifically for sentiment 
analysis is recommended.
"""


def penn_to_wn(tag):
    """Converts Penn Treebank tags to WordNet tags."""
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

def get_word_sentiment(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag:
        synsets = list(swn.senti_synsets(word, wn_tag))
        if synsets:
            return synsets[0].pos_score(), synsets[0].neg_score()
    return 0, 0

def adjust_scores_for_negation_and_intensifiers(scores, words, i):
    """Adjusts sentiment scores based on negations and intensifiers around the i-th word."""
    if i > 0 and words[i-1].lower() in ["not", "no"]:
        return -scores[0], -scores[1]  # Inverting the sentiment
    # Further adjustments for intensifiers (like "very") can be added here
    return scores

def get_phrase_sentiment(phrase):
    words = word_tokenize(phrase)
    tagged = pos_tag(words)
    total_pos, total_neg = 0, 0
    
    for i, (word, tag) in enumerate(tagged):
        scores = get_word_sentiment(word, tag)
        scores = adjust_scores_for_negation_and_intensifiers(scores, words, i)
        total_pos += scores[0]
        total_neg += scores[1]
    
    avg_pos = total_pos / len(words) if words else 0
    avg_neg = total_neg / len(words) if words else 0
    
    return avg_pos, avg_neg

# Example usage
phrase = 'not very well known'
pos_score, neg_score = get_phrase_sentiment(phrase)
print(f"Phrase: '{phrase}', Pos score: {pos_score}, Neg score: {neg_score}")

Phrase: 'not very well known', Pos score: 0.125, Neg score: 0.15625
Phrase: 'not very well known', Pos score: 0.03125, Neg score: 0.09375


In [96]:
def disambiguate_word_sense(sentence, word):
    # Use Lesk algorithm for WSD
    sense = lesk(nltk.word_tokenize(sentence), word)
    if not sense:
        return None
    
    # Get sentiment scores
    senti_synset = swn.senti_synset(sense.name())
    return {
        'word': word,
        'synset_name': sense.name(),
        'definition': sense.definition(),
        'examples': sense.examples(),
        'positivity_score': senti_synset.pos_score(),
        'negativity_score': senti_synset.neg_score(),
        'objectivity_score': senti_synset.obj_score()
    }


review = df.iloc[200]

for word in review['Filtered_Review']:
    disambiguated_sense = disambiguate_word_sense(review['Review'], word)
    # print(disambiguated_sense)
    # print('\n')

In [97]:
def preprocess_and_ner(tokens):
    
    tagged = pos_tag(tokens)
    named_entities = ne_chunk(tagged)
    return named_entities


tokenised_text = df['Filtered_Review'].iloc[106]
named_entities = preprocess_and_ner(tokenised_text)
# print(named_entities)

In [98]:
def tf_idf(reviews):
    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Transform the reviews into a TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)
    
    # Extract the feature names/terms from the TF-IDF Vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Calculate the average TF-IDF score for each term across all documents
    scores = tfidf_matrix.mean(axis=0)
    term_scores = {feature_names[col]: scores[0, col] for col in range(scores.shape[1])}
    
    # Sort the terms by their average TF-IDF score in descending order
    sorted_term_scores = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Optionally: Display the top 10 terms with the highest average TF-IDF scores
    print("Top 15 terms by average TF-IDF score:")
    for term, score in sorted_term_scores[:15]:
        print(f"Term: {term}, Score: {round(score, 4)}")
    
    # Calculate cosine similarity among the documents using the TF-IDF matrix
    cos_sim_matrix = cosine_similarity(tfidf_matrix)
    
    # return tfidf_matrix, feature_names, cos_sim_matrix

# tf_idf(df['Filtered_Review_String'])

In [99]:
matcher = Matcher(nlp.vocab)

# Define custom patterns
patterns = [
    [{"POS": "ADJ"}, {"POS": "NOUN", "OP": "+"}],  # Adjective followed by one or more nouns
    [{"POS": "NOUN", "OP": "+"}, {"LOWER": "mode"}]  # One or more nouns followed by "mode"
]
matcher.add("CUSTOM_PATTERNS", patterns)


def POS_Chuck_Parser_Matcher(review):
    doc = nlp(review)
    matches = matcher(doc)

    extracted_phrases = []
    for match_id, start, end in matches:
        span = doc[start:end]
        extracted_phrases.append(span.text)

    print(extracted_phrases)

In [113]:
def POS_Chuck_Parser(review):

    doc = nlp(review)
    
    # Initialize a list to hold our extracted phrases
    extracted_phrases = []
    
    # Iterate over tokens in the doc
    for token in doc:
        # Look for an adverb modifying an adjective and check the adjective doesn't have a noun child
        if token.pos_ == "ADV" and token.head.pos_ == "ADJ":
            is_adj_modified = False
            for child in token.head.children:
                if child.dep_ in ["attr", "dobj", "pobj"]:  # The adjective is modifying a noun
                    is_adj_modified = True
                    break
            if not is_adj_modified:
                # Capture the adverb-adjective pair "rather heavy"
                extracted_phrases.append(token.text + " " + token.head.text)
    
        # Look for an adjective modifying a noun and check if it's in a prepositional phrase
        if token.pos_ == "ADJ" and token.head.pos_ in ["NOUN", "PROPN"]:
            is_in_prep_phrase = False
            for ancestor in token.head.ancestors:
                if ancestor.dep_ == "prep":
                    is_in_prep_phrase = True
                    break
            if not is_in_prep_phrase:
                # Capture the adjective-noun pair "great camera"
                extracted_phrases.append(token.text + " " + token.head.text)

    print(extracted_phrases)

In [135]:
index = 24

print(df['Tags'].iloc[index])
print(df['Review'].iloc[index])
print('\n')

POS_Chuck_Parser(df['Review'].iloc[index])
POS_Chuck_Parser(df['Filtered_Review_String'].iloc[index])
POS_Chuck_Parser(df['Lemmatised_Review_String'].iloc[index])

print('\n')

POS_Chuck_Parser_Matcher(df['Review'].iloc[index])
POS_Chuck_Parser_Matcher(df['Filtered_Review_String'].iloc[index])
POS_Chuck_Parser_Matcher(df['Lemmatised_Review_String'].iloc[index])

['price[+2]']
for the price it is a well spent investment !


[]
['spent_investment investment']
['spent_investment investment']


[]
['spent_investment investment']
['spent_investment investment']
