In [120]:
# Core Libraries
import pandas as pd
import numpy as np
import gensim
import sklearn
import plotly
import pickle

# Natural Language Processing
import nltk
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer

# Machine Learning and Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score, silhouette_samples

# Visualization
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

# Miscellaneous
from ast import literal_eval
from scipy.spatial import distance

# Ensure gensim version is 4.3.2 and install openpyxl
!pip install gensim==4.3.2
!pip install openpyxl




In [116]:
vad = pd.read_excel('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/vad.xlsx', index_col=0, engine='openpyxl')
word_level_df = pd.read_csv('/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/word_level_df.csv', encoding='utf-8', index_col=0)
sentence_level_df = pd.read_csv('/work/GitHub_ML_Deepnote/Machine Learning/2. Cleaning & Pre-processing/sentence_level_df.csv',  encoding='utf-8', index_col = 0)

In [121]:
# Extract lemmas from the 'lemmas' column in word_level_df
corpus_words = [literal_eval(lemmas) for lemmas in word_level_df['lemmas'].values]

# Save corpus_words as a pickle file
with open('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/corpus_words.pkl', 'wb') as file:
    pickle.dump(corpus_words, file)

# Flatten the list of lemmas
corpus_flat_lemmas = [lemma for sublist in corpus_words for lemma in sublist]

In [62]:
corpus_words = []
corpus_emo = []
corpus_noVAD = [] 
#another way we can overview what words haven't been matched with a VAD score

for i in corpus_flat_lemmas:
    if i in vad.index:
        corpus_emo.append(vad.loc[i])
        corpus_words.append(i)
    else:
        corpus_noVAD.append(i)

In [63]:
corpus_vad = pd.DataFrame(corpus_emo, index = corpus_words)

### Utalising word2vec - ignore this bit for now and scroll to visualisations

In [None]:
Trump_model = gensim.models.Word2Vec(trump_lemmas, min_count= 20, vector_size = 300)
Trump_model.wv['american']
Trump_vocab = Trump_model.wv.index_to_key
Trump_vectors = [Trump_model.wv[i] for i in Trump_vocab]

Trump_df = pd.DataFrame(Trump_vectors)
Trump_df.columns = [str(i) for i in Trump_df.columns]

pca = PCA(n_components = 3)
comps_1 = pca.fit_transform(Trump_vectors)
pc_df_1 = pd.DataFrame(data = comps_1, columns = ['Principal Component '+str(i) for i in range(1, comps_1.shape[1]+1)])
Trump_df = pd.concat([Trump_df, pc_df_1], axis = 1)
Trump_df.index = Trump_vocab

In [88]:
#Silhoutte method to identify optimal amount of clusters. This method is also indicating 4.

#range of clusters to test
range_n_clusters = range(2, 10)

# Converting dataframe to numpy array for clustering
X = np.array(Trump_df)

# Initialising list to store silhouette scores for each number of clusters
silhouette_scores = []

# Iterating over set range of clusters and calculating silhouette score
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init= 10)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

# Clustering our data to see what groups together

kmeans = KMeans(n_clusters=5, random_state=0, n_init= 10 ).fit(cluster_df)

#Adding relevant values to our dataframe
Trump_df['clusters_knn'] = [str(i) for i in kmeans.labels_]

#Viewing a single cluster to observe in greater detail
TCluster_0= Trump_df.loc[Trump_df['clusters_knn'] == '0']
TCluster_1= Trump_df.loc[Trump_df['clusters_knn'] == '1']
TCluster_2= Trump_df.loc[Trump_df['clusters_knn'] == '2']
TCluster_3= Trump_df.loc[Trump_df['clusters_knn'] == '3']
TCluster_4= Trump_df.loc[Trump_df['clusters_knn'] == '4']

NameError: name 'cluster_vectors' is not defined

### Visualisations currently..

In [117]:
X = corpus_vad[['valence', 'arousal', 'dominance']].values

# Specify the number of clusters
n_clusters = 6  # Adjust this number based on your preference

# Perform K-means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
corpus_vad['Cluster'] = kmeans.fit_predict(X)

In [118]:
import plotly.express as px

# Sample data
corpus_vad_s = corpus_vad.sample(1000)

fig = px.scatter_3d(corpus_vad_s, x='valence', y='arousal', z='dominance', color='Cluster', hover_data=[corpus_vad_s.index],
                    color_discrete_sequence=['yellow', 'green', 'blue', 'red'])

fig.update_traces(marker=dict(size=5, line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'))

# Hide the color bar
fig.update_layout(coloraxis_showscale=True)

fig.show()

In [112]:
# Saving the 3D scatter plot as an HTML file
fig.write_html('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/vad_3d_plot.html')

In [119]:
# Sample data
corpus_vad_s = corpus_vad.sample(1000)

# Plot 1: Valence vs Dominance
fig1 = px.scatter(corpus_vad_s, x='valence', y='dominance', color='Cluster', hover_data=[corpus_vad_s.index],
                  color_discrete_sequence=['yellow', 'green', 'blue', 'red'])

fig1.update_traces(marker=dict(size=5, line=dict(width=0)), selector=dict(mode='markers'))

# Plot 2: Arousal vs Valence
fig2 = px.scatter(corpus_vad_s, x='arousal', y='valence', color='Cluster', hover_data=[corpus_vad_s.index],
                  color_discrete_sequence=['yellow', 'green', 'blue', 'red'])

fig2.update_traces(marker=dict(size=5, line=dict(width=0)), selector=dict(mode='markers'))

# Plot 3: Arousal vs Dominance
fig3 = px.scatter(corpus_vad_s, x='arousal', y='dominance', color='Cluster', hover_data=[corpus_vad_s.index],
                  color_discrete_sequence=['yellow', 'green', 'blue', 'red'])

fig3.update_traces(marker=dict(size=5, line=dict(width=0)), selector=dict(mode='markers'))

# Show the plots
fig1.show()
fig2.show()
fig3.show()

In [111]:
# Save the plots as HTML files
fig1.write_html('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/vad_plot_valence_dominance.html')
fig2.write_html('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/vad_plot_arousal_valence.html')
fig3.write_html('/work/GitHub_ML_Deepnote/Machine Learning/3. VAD/vad_plot_arousal_dominance.html')

### feel like clusters like below will be possible if i apply word2vec model as basis again

In [None]:
T_words_cluster_0 = list(TCluster_0.index)

# Filter Trump_vad to only include words in TCluster_0
Trump_vad_cluster_0 = Trump_vad[Trump_vad.index.isin(T_words_cluster_0)]

# Create scatter plot of VAD visualizations for words in TCluster_0
fig = px.scatter_3d(Trump_vad_cluster_0, x='valence', y='arousal', z='dominance', 
                    color=Trump_vad_cluster_0.index, hover_data=[Trump_vad_cluster_0.index],
                    color_discrete_sequence=['red'])
fig.update_traces(marker=dict(size=5, line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout(title='Trump VAD Values for Cluster 0')
fig.show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f64215d6-debc-46bd-b273-63565459a66d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>