# Sentiment Analysis using Clustering

## Preprocessing:

Load and Preprocess Data

In [23]:
import pandas as pd

# Load preprocessed reviews
df_processed = pd.read_csv('Data/reviews_preprocessed.csv')

print(df_processed.head())


                                              Tokens     Label
0  great music service audio high quality app eas...  positive
1  ignore previous negative rating app super grea...  positive
2  pop good spotify experience android 12 annoyin...  positive
3                        buggy terrible use recently  negative
4            dear spotify song playlist shuffle play  negative


Separate Reviews by Sentiment

In [34]:
# Separate positive and negative reviews based on vader label. This only includes the preprocessed version.
positive_reviews = df_processed[df_processed["Label"] == 'positive']
neutral_reviews = df_processed[df_processed["Label"] == 'neutral']
negative_reviews = df_processed[df_processed["Label"] == 'negative']

print(neutral_reviews.head(10))

                                               Tokens    Label
5   player control disappear reason app restart fo...  neutral
7   extremely slow change storage external sd card...  neutral
17  listen download playlist offline point feature...  neutral
22                log acc try open log will open logo  neutral
30  option delete song album option download track...  neutral
61             dose play like song song look get star  neutral
70                                            long ad  neutral
73  great song selection amazing audio quality pro...  neutral
82  like spotify proplem ad shuffle 6 skip hour so...  neutral
85  good u want play song song yo listen instaed s...  neutral


## Feature Extraction

TF-IDF Vectorization: Convert the text data into numerical form for clustering

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')

# Positive reviews
X_positive = vectorizer.fit_transform(positive_reviews)

# Negative reviews
X_negative = vectorizer.fit_transform(negative_reviews)


ValueError: max_df corresponds to < documents than min_df

Parameters:

**max_df=0.9:** Ignores terms that appear in more than 90% of the documents.
**min_df=2:** Ignores terms that appear in less than 2 documents.
**stop_words='english':** Removes common English stop words.

## Dimensionality Reduction

We'll reduce dimensions using Truncated Singular Value Decomposition (SVD).

In [38]:
from sklearn.decomposition import TruncatedSVD

n_components = 16 # must be <= n_features TODO: Make this dynamic, maybe PCA

# Positive reviews
svd_positive = TruncatedSVD(n_components=n_components, random_state=42)
X_positive_reduced = svd_positive.fit_transform(X_positive)

# Negative reviews
svd_negative = TruncatedSVD(n_components=n_components, random_state=42)
X_negative_reduced = svd_negative.fit_transform(X_negative)


## The Clustering

We'll use K-Means clustering to group the reviews.

In [39]:
from sklearn.cluster import KMeans

# Number of clusters TODO: Check k
k = 5

# Positive reviews
kmeans_positive = KMeans(n_clusters=k, random_state=42)
labels_positive = kmeans_positive.fit_predict(X_positive_reduced)

# Negative reviews
kmeans_negative = KMeans(n_clusters=k, random_state=42)
labels_negative = kmeans_negative.fit_predict(X_negative_reduced)


## Cluster Interpretation

Assign Cluster Labels

In [40]:
# Positive reviews DataFrame
df_positive = df_combined[df_combined['vader_label'] == 'positive'].copy()
df_positive['cluster'] = labels_positive

# Negative reviews DataFrame
df_negative = df_combined[df_combined['vader_label'] == 'negative'].copy()
df_negative['cluster'] = labels_negative


Extract Top Terms per Cluster. Compute TF-IDF score for each term in a cluster and extract its top terms.

In [41]:
import numpy as np


# Function to get top terms per cluster
def get_top_terms_per_cluster(tfidf_matrix, labels, vectorizer, n_terms=10):
    terms = vectorizer.get_feature_names_out()
    cluster_terms = {}

    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)
        mean_tfidf = tfidf_matrix[cluster_indices].mean(axis=0)
        top_indices = np.argsort(mean_tfidf.A1)[::-1][:n_terms]
        top_terms = [terms[i] for i in top_indices]
        cluster_terms[cluster_num] = top_terms

    return cluster_terms


Display top terms

In [42]:
# Top terms in positive clusters
top_terms_positive = get_top_terms_per_cluster(X_positive, labels_positive, vectorizer)

print("Top Terms in Positive Review Clusters:")
for cluster_num, terms in top_terms_positive.items():
    print(f"\nCluster {cluster_num}: {', '.join(terms)}")


IndexError: index 17 is out of bounds for axis 0 with size 16