# PCA with Clustering

Create 2D and 3D PCA scatter plots of POLAR embeddings, with the points grouped by DBSCAN cluster.

In [1]:
import gensim
from numpy import linalg
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
import time
from random import shuffle
import sys
import nltk 
from nltk.corpus import wordnet 
import gc
from collections import defaultdict
import random
import json
import os
import pandas as pd

import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

from sklearn.cluster import DBSCAN

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from gensim.test.utils import datapath

### Select Companies and POLAR Embeddings

In [2]:
# Input tables: the "common" set first, then the company ("inter") set.
_polar_csv_files = (
    '/Users/stjepankusenic/POLAR_WEBE/data/processed/POLAR-GloVeWiki-bus-antonyms-common2.csv',
    '/Users/stjepankusenic/POLAR_WEBE/data/processed/POLAR-GloVeWiki-bus-antonyms-inter.csv',
)
common_df, company_df = (pd.read_csv(csv_file) for csv_file in _polar_csv_files)

# Stack both tables row-wise and renumber the rows from 0.
new_df = pd.concat([common_df, company_df], ignore_index=True)

#display(new_df)
#display(company_df)

### Create Clusters

In [None]:
# Feature matrix: every column except 'Unnamed: 0' (the column later used as
# the text label of each point).
# .copy() makes df_cluster an independent frame, so the column assignments in
# the following cells do not trigger pandas' SettingWithCopyWarning.
df_cluster = new_df.loc[:, new_df.columns != 'Unnamed: 0'].copy()

# Notes from earlier parameter trials (eps, min_samples):
#   high eps, low min_samples: only clusters common
#   0.4 and 5: family, bank, photo, food and air cluster
#   0.3 and 3: family, bank, food, air, car, electric, gas/oil and pharma cluster
#   0.4 and 3: 3 china, air, electro, water, food, animal and family cluster
#   0.3 and 2: the setting used below
dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=2)
cluster_labels = dbscan.fit_predict(df_cluster)

# Inspect the label assigned to each row.
display(cluster_labels)

### Apply PCA to the Data

In [None]:
# Columns this notebook adds to df_cluster. They are removed before fitting so
# that re-running the cell does not feed the label column and previously
# derived columns back into PCA.
_derived_cols = ['two_dim1', 'two_dim2',
                 'three_dim1', 'three_dim2', 'three_dim3',
                 'cluster', 'Unnamed: 0']
# drop() returns a new frame, so the assignments below never write into a
# slice of new_df (avoids SettingWithCopyWarning).
df_cluster = df_cluster.drop(columns=_derived_cols, errors='ignore')

# Fit a full PCA and keep the first two components as plot coordinates.
two_dim = PCA(random_state=0).fit_transform(df_cluster)[:, :2]
df_cluster[['two_dim1', 'two_dim2']] = two_dim.tolist()
df_cluster['cluster'] = cluster_labels
# Re-attach the label column (aligned on the row index) for the plot text.
df_cluster['Unnamed: 0'] = new_df['Unnamed: 0']

### Create the PCA Graph

In [None]:
def display_pca_scatterplot_2D(model, *, width=1000, height=500, text_size=10):
    """Show a 2D scatter plot of PCA coordinates with one trace per cluster label.

    A trace is created for every integer from -1 up to the largest value in
    ``model['cluster']``, so a label without rows still gets an (empty) legend
    entry. Legend names are offset by two: label -1 (which scikit-learn's
    DBSCAN uses for noise points) is shown as "Cluster1", label 0 as
    "Cluster2", and so on.

    Args:
        model: DataFrame with the columns 'two_dim1' and 'two_dim2' (plot
            coordinates), 'cluster' (integer label per row) and 'Unnamed: 0'
            (text drawn next to each point).
        width: Figure width passed to the plotly layout (the original inline
            comment notes 1200 as an alternative).
        height: Figure height passed to the plotly layout (the original
            inline comment notes 700 as an alternative).
        text_size: Font size of the point labels (the original inline comment
            notes 20 as an alternative).

    Returns:
        None. The figure is rendered with ``Figure.show()``.
    """
    # An empty frame has no maximum label (max() is NaN), which previously made
    # range() raise; render an empty figure instead.
    if model.empty:
        labels = range(0)
    else:
        labels = range(-1, model['cluster'].max() + 1)

    traces = []
    for label in labels:
        members = model.loc[model['cluster'] == label]
        coords = members[['two_dim1', 'two_dim2']].to_numpy()
        traces.append(
            go.Scatter(
                x=coords[:, 0],
                y=coords[:, 1],
                text=members['Unnamed: 0'],
                name='Cluster' + str(label + 2),
                textposition="top center",
                textfont_size=text_size,
                mode='markers+text',
                marker={
                    'size': 10,
                    'opacity': 0.8,
                    # NOTE(review): a bare integer is passed as the marker
                    # colour; confirm it renders as intended.
                    'color': label,
                },
            )
        )

    # Configure the layout: no margins, legend placed at the right edge.
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black",
            ),
        ),
        font=dict(
            family=" Courier New ",
            size=15,
        ),
        autosize=False,
        width=width,
        height=height,
    )

    plot_figure = go.Figure(data=traces, layout=layout)
    plot_figure.show()

# Render the 2D plot; df_cluster[:] passes a full-range slice of the frame.
display_pca_scatterplot_2D(df_cluster[:])

### 3D PCA

In [None]:
# Rebuild the feature matrix for the 3D section: every column except
# 'Unnamed: 0' (the column later used as the text label of each point).
# .copy() makes df_cluster an independent frame, so the column assignments in
# the following cell do not trigger pandas' SettingWithCopyWarning.
df_cluster = new_df.loc[:, new_df.columns != 'Unnamed: 0'].copy()

dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=2)
cluster_labels = dbscan.fit_predict(df_cluster)

# Inspect the label assigned to each row.
display(cluster_labels)

In [None]:
# Columns this notebook adds to df_cluster. They are removed before fitting so
# that re-running the cell does not feed the label column and previously
# derived columns back into PCA.
_derived_cols = ['two_dim1', 'two_dim2',
                 'three_dim1', 'three_dim2', 'three_dim3',
                 'cluster', 'Unnamed: 0']
# drop() returns a new frame, so the assignments below never write into a
# slice of new_df (avoids SettingWithCopyWarning).
df_cluster = df_cluster.drop(columns=_derived_cols, errors='ignore')

# Fit a full PCA and keep the first three components as plot coordinates.
three_dim = PCA(random_state=0).fit_transform(df_cluster)[:, :3]
df_cluster[['three_dim1', 'three_dim2', 'three_dim3']] = three_dim.tolist()
df_cluster['cluster'] = cluster_labels
# Re-attach the label column (aligned on the row index) for the plot text.
df_cluster['Unnamed: 0'] = new_df['Unnamed: 0']

In [None]:
def display_pca_scatterplot_3D(model, *, width=1000, height=700, text_size=10):
    """Show a 3D scatter plot of PCA coordinates with one trace per cluster label.

    A trace is created for every integer from -1 up to the largest value in
    ``model['cluster']``, so a label without rows still gets an (empty) legend
    entry. Legend names are offset by two: label -1 (which scikit-learn's
    DBSCAN uses for noise points) is shown as "Cluster1", label 0 as
    "Cluster2", and so on.

    Args:
        model: DataFrame with the columns 'three_dim1', 'three_dim2' and
            'three_dim3' (plot coordinates), 'cluster' (integer label per row)
            and 'Unnamed: 0' (text drawn next to each point).
        width: Figure width passed to the plotly layout.
        height: Figure height passed to the plotly layout.
        text_size: Font size of the point labels (the original inline comment
            notes 20 as an alternative).

    Returns:
        None. The figure is rendered with ``Figure.show()``.
    """
    # An empty frame has no maximum label (max() is NaN), which previously made
    # range() raise; render an empty figure instead.
    if model.empty:
        labels = range(0)
    else:
        labels = range(-1, model['cluster'].max() + 1)

    traces = []
    for label in labels:
        members = model.loc[model['cluster'] == label]
        coords = members[['three_dim1', 'three_dim2', 'three_dim3']].to_numpy()
        traces.append(
            go.Scatter3d(
                x=coords[:, 0],
                y=coords[:, 1],
                z=coords[:, 2],
                text=members['Unnamed: 0'],
                name='Cluster' + str(label + 2),
                textposition="top center",
                textfont_size=text_size,
                mode='markers+text',
                marker={
                    'size': 10,
                    'opacity': 0.8,
                    # NOTE(review): a bare integer is passed as the marker
                    # colour; confirm it renders as intended.
                    'color': label,
                },
            )
        )

    # Configure the layout: no margins, legend placed at the right edge.
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black",
            ),
        ),
        font=dict(
            family=" Courier New ",
            size=15,
        ),
        autosize=False,
        width=width,
        height=height,
    )

    plot_figure = go.Figure(data=traces, layout=layout)
    plot_figure.show()

# Render the 3D plot; df_cluster[:] passes a full-range slice of the frame.
display_pca_scatterplot_3D(df_cluster[:])