In [200]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

nltk.download('punkt') 
nltk.download('stopwords')
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

[nltk_data] Downloading package punkt to /Users/leon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/leon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [201]:
# Relative paths to the review text files, grouped under three dataset
# directories. Only files[0] is loaded by the read_file call further down.
files = ['Data/Customer_review_data/Apex AD2600 Progressive-scan DVD player.txt',
         'Data/Customer_review_data/Canon G3.txt',
         'Data/Customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt',
         'Data/Customer_review_data/Nikon coolpix 4300.txt',
         'Data/Customer_review_data/Nokia 6610.txt',
         'Data/CustomerReviews-3_domains/Computer.txt',
         'Data/CustomerReviews-3_domains/Router.txt',
         'Data/CustomerReviews-3_domains/Speaker.txt',
         'Data/Reviews-9-products/Canon PowerShot SD500.txt',
         'Data/Reviews-9-products/Canon S100.txt',
         'Data/Reviews-9-products/Diaper Champ.txt',
         'Data/Reviews-9-products/Hitachi router.txt',
         'Data/Reviews-9-products/ipod.txt',
         'Data/Reviews-9-products/Linksys Router.txt',
         'Data/Reviews-9-products/MicroMP3.txt',
         'Data/Reviews-9-products/Nokia 6600.txt',
         'Data/Reviews-9-products/norton.txt']

In [202]:
def read_file(file_path, encoding=None):
    """Parse a tagged review file into a DataFrame with one row per line.

    Each line is split on the first ``##``. The text before it is read as a
    comma-separated list of tags and the text after it as the review
    sentence. Lines without ``##`` are kept as untagged rows.

    If the first line of the file is a row of asterisks, the first 11 lines
    are skipped (presumably a dataset README header - confirm against the
    data files).

    Parameters
    ----------
    file_path : str
        Path to the review file; the part after the last ``/`` is stored in
        ``df.attrs['title']``.
    encoding : str, optional
        Passed to ``open``. The default ``None`` keeps the platform default.

    Returns
    -------
    pandas.DataFrame
        Columns ``Tags`` (list of stripped, non-empty tag strings) and
        ``Review`` (always a ``str``).
    """
    header_marker = '*****************************************************************************'
    header_line_count = 11

    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.read().strip().split('\n')

    if lines[0] == header_marker:
        lines = lines[header_line_count:]

    tagged_reviews = []
    for line in lines:
        # Split only on the first '##' so a review that itself contains
        # '##' is not truncated.
        parts = line.split('##', 1)

        if len(parts) > 1:
            # Strip each tag and drop empties: ''.split(',') would otherwise
            # produce [''] for lines that start with '##'.
            tags = [tag.strip() for tag in parts[0].split(',') if tag.strip()]
            content = parts[1].strip()
        else:
            tags = []
            # Keep the review as a string (not the one-element list returned
            # by split) so the 'Review' column has a single type.
            content = parts[0].strip()

        tagged_reviews.append({'Tags': tags, 'Review': content})

    # Explicit columns keep the schema stable even when no rows were parsed.
    df = pd.DataFrame(tagged_reviews, columns=['Tags', 'Review'])
    df.attrs['title'] = file_path.split('/')[-1]

    return df
            

# Load the first file in `files` into the working DataFrame.
df = read_file(files[0])

In [203]:
def _as_text(review):
    """Return the review as one string, space-joining it when it is a list."""
    if isinstance(review, list):
        return ' '.join(review)
    return review


def _content_words(tokens):
    """Lower-case the purely alphabetic tokens and drop English stop words."""
    kept = []
    for token in tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept


# Normalise every review to a string, then tokenise it.
review_text = df['Review'].apply(_as_text)
df['Tokenised_Review'] = review_text.apply(word_tokenize)

# Keep only lower-cased content words, both as a list and as a joined string.
df['Filtered_Review'] = df['Tokenised_Review'].apply(_content_words)
df['Filtered_Review_String'] = df['Filtered_Review'].apply(' '.join)


In [204]:
# Flatten the per-review token lists into one list of words.
all_words = []
for review_tokens in df['Filtered_Review']:
    all_words.extend(review_tokens)

# Count how often each word occurs across all reviews.
freq_dist = FreqDist(all_words)

def display_freq_dist(freq_dist, top_n=20):
    """Draw a bar chart of the most frequent words.

    Parameters
    ----------
    freq_dist : mapping
        Word -> count mapping exposing ``.items()`` (for example an nltk
        ``FreqDist``).
    top_n : int, default 20
        Maximum number of words to plot; values below 1 plot nothing.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``. When there is nothing to
        plot, a short message is printed instead.
    """
    # Clamp so a negative top_n cannot turn into a "drop the last k" slice.
    limit = max(int(top_n), 0)
    top_items = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)[:limit]

    if not top_items:
        # zip(*[]) yields nothing, so unpacking into two names would raise.
        print('No words to display.')
        return

    words, frequencies = zip(*top_items)
    plt.figure(figsize=(6, 3))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Top Words Frequency Distribution')
    plt.xticks(rotation=45)
    plt.show()

# display_freq_dist(freq_dist)

In [205]:
# Display the DataFrame, including the tokenised and filtered columns added above.
df

Unnamed: 0,Tags,Review,Tokenised_Review,Filtered_Review,Filtered_Review_String
0,[],[[t] troubleshooting ad-2500 and ad-2600 no pi...,"[[, t, ], troubleshooting, ad-2500, and, ad-26...","[troubleshooting, picture, scrolling]",troubleshooting picture scrolling
1,[],"repost from january 13 , 2004 with a better fi...","[repost, from, january, 13, ,, 2004, with, a, ...","[repost, january, better, fit, title]",repost january better fit title
2,[],does your apex dvd player only play dvd audio ...,"[does, your, apex, dvd, player, only, play, dv...","[apex, dvd, player, play, dvd, audio, without,...",apex dvd player play dvd audio without video
3,[],or does it play audio and video but scrolling ...,"[or, does, it, play, audio, and, video, but, s...","[play, audio, video, scrolling, black, white]",play audio video scrolling black white
4,[],before you try to return the player or waste h...,"[before, you, try, to, return, the, player, or...","[try, return, player, waste, hours, calling, a...",try return player waste hours calling apex tec...
...,...,...,...,...,...
834,[dvd player[+3]],i am really impressed by this dvd player .,"[i, am, really, impressed, by, this, dvd, play...","[really, impressed, dvd, player]",really impressed dvd player
835,[],"if it can fit in the drive bay , this dvd play...","[if, it, can, fit, in, the, drive, bay, ,, thi...","[fit, drive, bay, dvd, player, play]",fit drive bay dvd player play
836,"[play[+2], dvd[+2]]","for instance , i made several back-ups of my d...","[for, instance, ,, i, made, several, back-ups,...","[instance, made, several, dvd, movies, using, ...",instance made several dvd movies using w r w p...
837,[format[+3]],no matter the format .,"[no, matter, the, format, .]","[matter, format]",matter format


In [206]:
# Number of K-Means clusters. Defined here so the model and the reporting
# loop below agree, and so the cell runs on a fresh kernel.
num_clusters = 5

# TF-IDF features from the filtered review strings. max_df=0.85 ignores terms
# found in more than 85% of the reviews; min_df=2 ignores terms found in
# fewer than two reviews.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.85, min_df=2)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Filtered_Review_String'])

# A fixed random_state makes the cluster assignment reproducible between runs.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# For every centroid, feature indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()

for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]  # Top 10 terms for each cluster
    print(f"Cluster {i}: {top_terms}")

# Project the TF-IDF vectors onto two principal components for plotting.
# random_state is fixed because PCA's automatic solver choice may use a
# randomized SVD, which would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# Cluster label of each review, used to colour the points.
cluster_labels = km.labels_

fig, ax = plt.subplots(figsize=(8, 4))

# One point per review in the reduced space, coloured by its cluster.
ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.6)

ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_title('2D Visualization of K-Means Clusters')

plt.show()


Cluster 0: ['apex', 'bought', 'product', 'dvd', 'players', 'service', 'trying', 'amazon', 'two', 'years']
Cluster 1: ['one', 'get', 'great', 'buy', 'problems', 'good', 'remote', 'unit', 'price', 'amazon']
Cluster 2: ['picture', 'plays', 'sound', 'everything', 'thing', 'scrolling', 'hear', 'use', 'still', 'dvd']
Cluster 3: ['play', 'dvds', 'read', 'would', 'disc', 'dvd', 'movies', 'discs', 'player', 'wo']
Cluster 4: ['player', 'dvd', 'price', 'great', 'bought', 'first', 'best', 'apex', 'one', 'christmas']
