# Sentiment Analysis using Clustering

## Preprocessing:

Load and Preprocess Data

In [20]:
import pandas as pd

# Load preprocessed reviews
df_processed = pd.read_csv('C:/Users/phili/PycharmProjects/BSUD_Task_2/Data/reviews_preprocessed.csv')

# Load VADER sentiment results
df_vader = pd.read_csv('C:/Users/phili/PycharmProjects/BSUD_Task_2/Data/reviews_VADER.csv')

# Combine DataFrames
df_combined = pd.concat([df_processed, df_vader[['vader_score', 'vader_label']]], axis=1)

# Verify
print(df_combined.columns)


Index(['processed_reviews', 'vader_score', 'vader_label'], dtype='object')


Separate Reviews by Sentiment

In [21]:
# Separate positive and negative reviews based on vader label. This only includes the preprocessed version.
positive_reviews = df_combined[df_combined['vader_label'] == 'positive']['processed_reviews']
negative_reviews = df_combined[df_combined['vader_label'] == 'negative']['processed_reviews']

print(positive_reviews)

0     great music service audio high quality app eas...
1     ignore previous negative rating app super grea...
2     pop good spotify experience android 12 annoyin...
4               dear spotify song playlist shuffle play
6              love selection lyric provide song listen
7     extremely slow change storage external sd card...
8     great app good mp3 music app problem play song...
9     delete app following reason app fail business ...
12    amazon premium music family package good liste...
14                      hav music like super raise hand
15    improve ia recommend song find similar song go...
18            crash unable play stop work everytime use
19    know ad cost free use yes s right 3 ad straigh...
20    favorite platform listen music like subscribe ...
21                                 voice sweet hearable
Name: processed_reviews, dtype: object


## Feature Extraction

TF-IDF Vectorization: Convert the text data into numerical form for clustering

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')

# Positive reviews
X_positive = vectorizer.fit_transform(positive_reviews)

# Negative reviews
X_negative = vectorizer.fit_transform(negative_reviews)


Parameters:

**max_df=0.9:** Ignores terms that appear in more than 90% of the documents.
**min_df=2:** Ignores terms that appear in less than 2 documents.
**stop_words='english':** Removes common English stop words.

## Dimensionality Reduction

We'll reduce dimensions using Truncated Singular Value Decomposition (SVD).

In [38]:
from sklearn.decomposition import TruncatedSVD

n_components = 16 # must be <= n_features TODO: Make this dynamic

# Positive reviews
svd_positive = TruncatedSVD(n_components=n_components, random_state=42)
X_positive_reduced = svd_positive.fit_transform(X_positive)

# Negative reviews
svd_negative = TruncatedSVD(n_components=n_components, random_state=42)
X_negative_reduced = svd_negative.fit_transform(X_negative)


## The Clustering

We'll use K-Means clustering to group the reviews.

In [39]:
from sklearn.cluster import KMeans

# Number of clusters TODO: Check k
k = 5

# Positive reviews
kmeans_positive = KMeans(n_clusters=k, random_state=42)
labels_positive = kmeans_positive.fit_predict(X_positive_reduced)

# Negative reviews
kmeans_negative = KMeans(n_clusters=k, random_state=42)
labels_negative = kmeans_negative.fit_predict(X_negative_reduced)


## Cluster Interpretation

Assign Cluster Labels

In [40]:
# Positive reviews DataFrame
df_positive = df_combined[df_combined['vader_label'] == 'positive'].copy()
df_positive['cluster'] = labels_positive

# Negative reviews DataFrame
df_negative = df_combined[df_combined['vader_label'] == 'negative'].copy()
df_negative['cluster'] = labels_negative


Extract Top Terms per Cluster. Compute TF-IDF score for each term in a cluster and extract its top terms.

In [41]:
import numpy as np


# Function to get top terms per cluster
def get_top_terms_per_cluster(tfidf_matrix, labels, vectorizer, n_terms=10):
    terms = vectorizer.get_feature_names_out()
    cluster_terms = {}

    for cluster_num in np.unique(labels):
        cluster_indices = np.where(labels == cluster_num)
        mean_tfidf = tfidf_matrix[cluster_indices].mean(axis=0)
        top_indices = np.argsort(mean_tfidf.A1)[::-1][:n_terms]
        top_terms = [terms[i] for i in top_indices]
        cluster_terms[cluster_num] = top_terms

    return cluster_terms


Display top terms

In [42]:
# Top terms in positive clusters
top_terms_positive = get_top_terms_per_cluster(X_positive, labels_positive, vectorizer)

print("Top Terms in Positive Review Clusters:")
for cluster_num, terms in top_terms_positive.items():
    print(f"\nCluster {cluster_num}: {', '.join(terms)}")




IndexError: index 17 is out of bounds for axis 0 with size 16