In [1090]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [1091]:
# Load the saved Word2Vec model from "svmtest.model" (a path relative to the
# current working directory) and keep only its `.wv` attribute, i.e. the
# word -> embedding lookup used by every later cell. The full model object is
# not retained.
word_vectors = Word2Vec.load("svmtest.model").wv

In [1092]:
# Partition the embedding space into two clusters (later labelled positive /
# negative). The embeddings are cast to float64 before fitting.
#
# `random_state` is given as an explicit integer seed. The previous value
# `True` is a bool, which is an int subclass equal to 1, so it acted as seed 1
# anyway; spelling it out keeps the fit reproducible without looking like an
# on/off flag.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [1093]:
# Show the 30 vocabulary entries closest to the first centroid so the cluster
# can be inspected by eye; the whole vocabulary is searched
# (restrict_vocab=None).
word_vectors.similar_by_vector(
    model.cluster_centers_[0],
    topn=30,
    restrict_vocab=None,
)

[('kppn', 0.8069782853126526),
 ('helloo', 0.8024885654449463),
 ('megang', 0.7913312911987305),
 ('translate', 0.7840039730072021),
 ('mayan', 0.7806798219680786),
 ('menara', 0.7697272300720215),
 ('random', 0.7677547931671143),
 ('hastag', 0.7675060033798218),
 ('kemaren2', 0.7674771547317505),
 ('fd', 0.7654569149017334),
 ('yutub', 0.7648842930793762),
 ('less', 0.7632828950881958),
 ('stock', 0.7630276083946228),
 ('halus', 0.762319028377533),
 ('j', 0.7611578702926636),
 ('rin', 0.7592474222183228),
 ('gih', 0.7497272491455078),
 ('wth', 0.7490692138671875),
 ('kontrol', 0.7458175420761108),
 ('mainin', 0.744938850402832),
 ('kenapa2', 0.743679940700531),
 ('balek', 0.7426884174346924),
 ('mini', 0.7393925189971924),
 ('kain', 0.7358505725860596),
 ('ngomong2', 0.7354962825775146),
 ('signal', 0.7323988676071167),
 ('spam', 0.7313896417617798),
 ('gitar', 0.730621874332428),
 ('edukasi', 0.7287554740905762),
 ('termos', 0.7261835336685181)]

In [1094]:
# Cluster 0 is treated as the "positive" cluster and the other one as
# "negative". The index is hard-coded, so it has to be re-checked whenever the
# embeddings or the KMeans fit change.
# NOTE(review): presumably chosen after inspecting the nearest words of each
# centroid - confirm.
positive_cluster_index = 0
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[cluster_id]
    for cluster_id in (positive_cluster_index, 1 - positive_cluster_index)
)

In [1095]:
# One row per vocabulary entry: the word, its embedding vector and the index
# of the KMeans cluster it falls into.
words = pd.DataFrame({'words': word_vectors.index_to_key})
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])

# Assign clusters with a single batched predict() call instead of one call per
# word. The embeddings are stacked into an (n_words, dim) matrix and cast to
# float64, the dtype the model was fitted on.
embedding_matrix = np.vstack(words['vectors'].tolist()).astype('double')
words['cluster'] = model.predict(embedding_matrix)

In [1096]:
# Sign of the sentiment: +1 for words in the positive cluster, -1 otherwise.
words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)

# Magnitude of the sentiment: inverse of the distance to the nearest centroid.
# transform() returns the distance from every word to every centroid, so the
# row-wise minimum is the distance to the word's own cluster centre. This is
# done in one batched call rather than one transform() per row.
# The score is not bounded to [0, 1]: distances below 1 give scores above 1.
embedding_matrix = np.vstack(words['vectors'].tolist()).astype('double')
nearest_centroid_distance = model.transform(embedding_matrix).min(axis=1)
words['closeness_score'] = 1 / nearest_centroid_distance

# Signed score written to the sentiment dictionary.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [1097]:
# Display the rows assigned to cluster 0.
words[words['cluster'].eq(0)]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
1,ya,"[-0.0085440595, 0.021296458, -0.07435387, 0.00...",0,1,0.980026,0.980026
3,sayang,"[0.014767483, -0.014824989, -0.13363433, 0.074...",0,1,0.952433,0.952433
4,takut,"[0.023550805, 0.00020077561, 0.039549936, -0.0...",0,1,0.990735,0.990735
5,sih,"[-0.124964915, 0.05483642, 0.050418645, 0.0321...",0,1,1.008028,1.008028
6,nya,"[0.09494722, -0.051401827, 0.0049243225, 0.068...",0,1,0.978478,0.978478
...,...,...,...,...,...,...
3578,lacur,"[-0.057975177, -0.006227292, -0.03073205, -0.0...",0,1,1.130849,1.130849
3579,wait,"[0.024453515, 0.12028697, -0.0031088328, 0.021...",0,1,1.295165,1.295165
3582,gituu,"[0.0032095015, 0.09930181, -0.019663915, -0.01...",0,1,1.248390,1.248390
3583,05,"[-0.030377014, -0.0039329976, 0.020592565, 0.0...",0,1,1.230043,1.230043


In [1098]:
# Export the word -> signed sentiment score mapping as a two-column CSV,
# without the DataFrame index.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)