In [6]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [3]:
# Read the stress-urinary-incontinence CSV from the local data directory into a DataFrame.
merged_files_filtered = pd.read_csv(
    filepath_or_buffer='./data/stress_urinary_incontinence.csv',
)

In [5]:
# Parse the received date so the .dt accessor can be used to derive the year below.
merged_files_filtered['DATE_RECEIVED'] = pd.to_datetime(merged_files_filtered['DATE_RECEIVED'])

# Build the VADER analyzer. On an environment where the 'vader_lexicon' resource
# has never been downloaded the constructor raises LookupError, so fetch it once
# on demand and retry instead of failing the whole run.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

# Score every report with VADER's compound score.
# Missing narratives are read by pandas as NaN (a float), which polarity_scores
# cannot process; score them as empty text instead, which VADER rates 0.0.
# FOI_TEXT itself is left untouched so the counts below still skip missing text.
merged_files_filtered['sentiment'] = (
    merged_files_filtered['FOI_TEXT']
    .fillna('')
    .astype(str)
    .map(lambda text: sia.polarity_scores(text)['compound'])
)

# Create a year column based on the DATE_RECEIVED column
merged_files_filtered['year'] = merged_files_filtered['DATE_RECEIVED'].dt.year

# Count the non-missing reports for each (year, sentiment score) pair.
# NOTE(review): 'sentiment' is the raw compound score, not a category, so there is
# one group per distinct score value -- confirm this granularity is intended.
grouped_data = merged_files_filtered.groupby(['year', 'sentiment'])['FOI_TEXT'].count().reset_index()

# Reshape to years as rows and sentiment scores as columns; combinations that
# never occur are filled with 0.
pivoted_data = grouped_data.pivot(index='year', columns='sentiment', values='FOI_TEXT').fillna(0)

In [None]:
# Cluster the one-dimensional sentiment scores into 3 groups with K-Means.
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise change the result (and emit a FutureWarning)
# depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(merged_files_filtered[['sentiment']])

# K-Means assigns label integers arbitrarily, so label 0 is not necessarily the
# most negative cluster. Rank the cluster centres from lowest to highest sentiment
# and relabel so that 0 = lowest-sentiment cluster, 1 = middle, 2 = highest.
# The clusters are found by proximity, not by fixed negative/neutral/positive
# thresholds, so inspect kmeans.cluster_centers_ before naming them.
sentiment_rank = kmeans.cluster_centers_.ravel().argsort().argsort()

# Add a cluster column to the data (ordered labels, see above)
merged_files_filtered['cluster'] = sentiment_rank[kmeans.labels_]

# Count the non-missing reports for each (year, cluster) pair.
# This replaces the per-score grouped_data built in the earlier cell.
grouped_data = merged_files_filtered.groupby(['year', 'cluster'])['FOI_TEXT'].count().reset_index()

# Reshape to years as rows and clusters as columns; combinations that never
# occur are filled with 0.
pivoted_data = grouped_data.pivot(index='year', columns='cluster', values='FOI_TEXT').fillna(0)
