# PCA Creation with Clustering

Create PCA plots (2D and 3D) for visual analysis of the POLAR embeddings, with the embeddings grouped into clusters by DBSCAN.

## 1 Import Data

### 1.1 Import Packages

In [None]:
import gensim
from numpy import linalg
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
import time
from random import shuffle
import sys
import nltk 
from nltk.corpus import wordnet 
import gc
from collections import defaultdict
import random
import json
import os
import pandas as pd

import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

from sklearn.cluster import DBSCAN

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from gensim.test.utils import datapath

### 1.2 Select POLAR Embeddings

In [None]:
# Load the business-entity and common-word POLAR embeddings to analyze via PCA.
common_csv_path = '/Users/stjepankusenic/POLAR_WEBE/data/processed/POLAR-Reddit-org-antonyms-common2.csv'
company_csv_path = '/Users/stjepankusenic/POLAR_WEBE/data/processed/POLAR-Reddit-org-antonyms-inter.csv'

common_df = pd.read_csv(common_csv_path)
company_df = pd.read_csv(company_csv_path)

# Stack both tables (common words first) and renumber the rows 0..n-1.
new_df = pd.concat(objs=[common_df, company_df], ignore_index=True)

## 2 PCA for 2D

### 2.1 Create Clusters

In [None]:
# Feature matrix for clustering: every column of new_df except 'Unnamed: 0'
# (that column is used as the point text in the plots, not as a feature).
# An explicit copy is taken because later cells add columns to df_cluster;
# without it pandas treats df_cluster as a selection of new_df and may emit
# a SettingWithCopyWarning on those assignments.
df_cluster = new_df.loc[:, new_df.columns != 'Unnamed: 0'].copy()

# Change the parameters according to preference.
# Author's note: high eps with low min_samples only clusters the common words.
dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=2)
cluster_labels = dbscan.fit_predict(df_cluster)

# Look at the cluster labels that were assigned.
display(cluster_labels)

### 2.2 Apply PCA 

In [None]:
# Fit PCA on the feature matrix and keep the first two components.
pca_2d = PCA(random_state=0)
projection_2d = pca_2d.fit_transform(df_cluster)
two_dim = projection_2d[:, :2]

# Attach the 2-D coordinates, the cluster labels and the 'Unnamed: 0' column
# to the frame that the plotting function reads.
df_cluster[['two_dim1', 'two_dim2']] = two_dim.tolist()
df_cluster['cluster'] = cluster_labels
df_cluster['Unnamed: 0'] = new_df['Unnamed: 0']

### 2.3 Create PCA Graph

In [None]:
# Function that draws the 2D PCA scatter plot. Adapted from a Towards Data Science article: https://towardsdatascience.com/visualizing-word-embedding-with-pca-and-t-sne-961a692509f5 (30.05.2022)
def display_pca_scatterplot_2D(model):
    """Show a 2-D scatter plot of the PCA coordinates, one trace per cluster.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame providing the columns 'cluster', 'Unnamed: 0' (point text),
        'two_dim1' and 'two_dim2' (x / y coordinates).

    Notes
    -----
    A trace is built for every integer label from -1 up to the largest value
    in 'cluster', even when no row carries that label. The trace for label
    ``k`` is named ``'Cluster' + str(k + 2)``, so label -1 shows up as
    'Cluster1' (NOTE(review): with DBSCAN labels, -1 should be the noise
    group - confirm the legend naming is intended).

    The figure is displayed via ``Figure.show()``; nothing is returned.
    """
    highest_label = model['cluster'].max()

    traces = []
    for label in range(-1, highest_label + 1):
        members = model.loc[model['cluster'] == label]
        # Transposing the (n, 2) coordinate array yields the x and y columns.
        xs, ys = members[['two_dim1', 'two_dim2']].to_numpy().T
        traces.append(
            go.Scatter(
                x=xs,
                y=ys,
                text=members['Unnamed: 0'][:],
                name=f'Cluster{label + 2}',
                mode='markers+text',
                textposition="top center",
                textfont_size=10,
                marker=dict(size=10, opacity=0.8, color=label),
            )
        )

    # Fixed-size canvas without margins; legend placed at the right edge.
    legend_font = dict(family="Courier New", size=25, color="black")
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),
        showlegend=True,
        legend={'x': 1, 'y': 0.5, 'font': legend_font},
        font={'family': " Courier New ", 'size': 15},
        autosize=False,
        width=1000,
        height=500,
    )

    go.Figure(data=traces, layout=layout).show()

In [None]:
# Render the 2-D PCA plot for the embeddings selected at the top of the notebook.
display_pca_scatterplot_2D(model=df_cluster[:])

## 3 PCA for 3D

### 3.1 Create Clusters

In [None]:
# Feature matrix for clustering: every column of new_df except 'Unnamed: 0'
# (that column is used as the point text in the plots, not as a feature).
# An explicit copy is taken because later cells add columns to df_cluster;
# without it pandas treats df_cluster as a selection of new_df and may emit
# a SettingWithCopyWarning on those assignments.
df_cluster = new_df.loc[:, new_df.columns != 'Unnamed: 0'].copy()

# Change the parameters according to preference.
# Author's note: high eps with low min_samples only clusters the common words.
dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=2)
cluster_labels = dbscan.fit_predict(df_cluster)

# Look at the cluster labels that were assigned.
display(cluster_labels)

### 3.2 Apply PCA 

In [None]:
# Fit PCA on the feature matrix and keep the first three components.
pca_3d = PCA(random_state=0)
projection_3d = pca_3d.fit_transform(df_cluster)
three_dim = projection_3d[:, :3]

# Attach the 3-D coordinates, the cluster labels and the 'Unnamed: 0' column
# to the frame that the plotting function reads.
df_cluster[['three_dim1', 'three_dim2', 'three_dim3']] = three_dim.tolist()
df_cluster['cluster'] = cluster_labels
df_cluster['Unnamed: 0'] = new_df['Unnamed: 0']

### 3.3 Create PCA Graph

In [None]:
# Function that draws the 3D PCA scatter plot. Adapted from a Towards Data Science article: https://towardsdatascience.com/visualizing-word-embedding-with-pca-and-t-sne-961a692509f5 (30.05.2022)
def display_pca_scatterplot_3D(model):
    """Show a 3-D scatter plot of the PCA coordinates, one trace per cluster.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame providing the columns 'cluster', 'Unnamed: 0' (point text),
        'three_dim1', 'three_dim2' and 'three_dim3' (x / y / z coordinates).

    Notes
    -----
    A trace is built for every integer label from -1 up to the largest value
    in 'cluster', even when no row carries that label. The trace for label
    ``k`` is named ``'Cluster' + str(k + 2)``, so label -1 shows up as
    'Cluster1' (NOTE(review): with DBSCAN labels, -1 should be the noise
    group - confirm the legend naming is intended).

    The figure is displayed via ``Figure.show()``; nothing is returned.
    """
    highest_label = model['cluster'].max()

    traces = []
    for label in range(-1, highest_label + 1):
        members = model.loc[model['cluster'] == label]
        # Transposing the (n, 3) coordinate array yields the x, y and z columns.
        xs, ys, zs = members[['three_dim1', 'three_dim2', 'three_dim3']].to_numpy().T
        traces.append(
            go.Scatter3d(
                x=xs,
                y=ys,
                z=zs,
                text=members['Unnamed: 0'][:],
                name=f'Cluster{label + 2}',
                mode='markers+text',
                textposition="top center",
                textfont_size=10,
                marker=dict(size=10, opacity=0.8, color=label),
            )
        )

    # Fixed-size canvas without margins; legend placed at the right edge.
    legend_font = dict(family="Courier New", size=25, color="black")
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),
        showlegend=True,
        legend={'x': 1, 'y': 0.5, 'font': legend_font},
        font={'family': " Courier New ", 'size': 15},
        autosize=False,
        width=1000,
        height=700,
    )

    go.Figure(data=traces, layout=layout).show()

In [None]:
# Render the 3-D PCA plot for the embeddings selected at the top of the notebook.
display_pca_scatterplot_3D(model=df_cluster[:])