# Unsupervised Sentiment Analysis by word2vec + K-means clustering (1)

#### By Joyce Jiang | Code by Joyce

### This unsupervised NLP analysis has three steps:
(1) word2vec model training

(2) K-means clustering to group words into positive and negative clusters

(3) Perform unsupervised NLP and predict sentiments of data sample

### Citation & Source of my code
Declaration: Though I have prior knowledge of conducting text categorization with word2vec and K-means, my script (steps 1 to 3) is almost fully adapted from rafaljanwojcik's tutorial on GitHub, in his repo Unsupervised-Sentiment-Analysis. You can check it out at https://github.com/rafaljanwojcik/Unsupervised-Sentiment-Analysis.

Thanks to rafaljanwojcik for explaining, in a digestible way, how to use word2vec and k-means for unsupervised NLP, so that I could understand it fully. This script is published for study and research exploration purposes only, and it will not be used for any commercial purpose.

In [1]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Configure logging so gensim's INFO progress messages are shown, stamped with level and HH:MM:SS time
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

### Import training dataset (Uganda COVID19 related content on Twitter)

In [2]:
# Read every Twitter-search export in listing order; the results are unpacked
# into file1 ... file13 so that later cells can keep referring to those names.
(file1, file2, file3, file4, file5, file6, file7,
 file8, file9, file10, file11, file12, file13) = map(pd.read_excel, [
    "Labeled/#covid19ug - Twitter Search ~ Twitter.xlsx",
    "Labeled/#keepingmy10k - Twitter Search ~ Twitter.xlsx",
    "Labeled/#MujoogabasajjaMwe until~2020-05-19 since~2020-05-01 - Twitter Search ~ Twitter.xlsx",
    "Labeled/(1) #COVID19UGRecovery - Twitter 搜尋 ~ Twitter.xlsx",
    "Labeled/(1) #PSFUEcon2020 - Twitter Search ~ Twitter.xlsx",
    "Labeled/(3) LDUs - Twitter Search ~ Twitter.xlsx",
    "Labeled/~Ministry Health Uganda~ OR ~Min Health Uganda~ OR ~MoH Uganda~ OR ~MoH UG~ OR ~Min Health UG~ OR ~Ministry Health UG~) - Twitter Search ~ Twitter.xlsx",
    "Labeled/covid task force uganda until~2020-05-19 since~2020-03-01 - Twitter Search ~ Twitter.xlsx",
    "Labeled/M7 - Twitter Search ~ Twitter.xlsx",
    "Labeled/Museveni - Twitter Search ~ Twitter.xlsx",
    "Labeled/Mzee - Twitter Search ~ Twitter.xlsx",
    "Labeled/Tweet_Museveni_Sentiment_Labeled_D1.xlsx",
    "Labeled/(1) #M7Address - Twitter ~ Twitter.xlsx",
])


In [3]:
# Stack the 'Tweet' column of every loaded export (in file1..file13 order)
# and turn the resulting Series into a single-column DataFrame.
file = pd.concat([
    loaded['Tweet']
    for loaded in (file1, file2, file3, file4, file5, file6, file7,
                   file8, file9, file10, file11, file12, file13)
]).to_frame()

In [4]:
# Keep one row per distinct tweet text, then renumber the rows from 0.

file_cleaned = (
    file
    .drop_duplicates(subset=["Tweet"])
    .reset_index(drop=True)
)
file_cleaned

Unnamed: 0,Tweet
0,His excellency president #Uganda @KagutaMuseve...
1,@Katabasasa @nbstv #NBSFrontline #NBSUpdates\n...
2,Mr. ERIAS Lukwago should know that SOPs are in...
3,Energy Minister @DrKitutu hands over the Ugx.2...
4,15 new COVID-19 cases have been confirmed by t...
...,...
45807,The people of Najera shouldn't be given bail t...
45808,Is the mask thing an example of what we gon s...
45809,M7 should do release us banange. Naye Tuffa #M...
45810,"If you have a weak heart,Please stay away from..."


#### Text cleaning 

In [12]:
def text_to_word_list(text,remove_foreign_letters):
    '''Normalise one tweet and split it into a list of lower-case tokens.

    Method inspired by the eliorc GitHub repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : object
        The raw tweet. Values that are not strings are converted with
        ``str``; ``None`` and float NaN (e.g. an empty spreadsheet cell read
        by pandas) are treated as an empty tweet.
    remove_foreign_letters : callable
        Function taking a string and returning a string; it is applied before
        any other cleaning step (the notebook passes ``unidecode``).

    Returns
    -------
    list of str
        The tokens of the cleaned tweet. URLs are removed, ``+`` becomes the
        word ``plus``, ``!`` and ``?`` become tokens of their own, and
        ``,`` ``.`` ``'`` as well as every character outside
        ``A-Za-z0-9^,!?./'+`` act as separators. Empty list for missing input.
    '''
    # Missing value: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Coerce to str *before* transliterating so the callable always receives
    # a string, whatever type the caller handed in.
    text = remove_foreign_letters(str(text))
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"http\S+", "", text)                  # drop URLs
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)   # whitelist; everything else -> space
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    # ':' cannot survive the whitelist above, and split() already collapses
    # runs of spaces, so neither needs a dedicated substitution.

    return text.split()

In [None]:
# Sanity check: display the tokens produced for the first de-duplicated tweet.

text_to_word_list(file_cleaned.loc[0, 'Tweet'], unidecode)

In [14]:
# Replace each raw tweet with its cleaned token list. The column is overwritten
# in place, so re-run the de-duplication cell before running this one again.
file_cleaned['Tweet'] = file_cleaned['Tweet'].apply(text_to_word_list, args=(unidecode,))

In [15]:
# Keep only tweets that still contain more than one token after cleaning
# (empty and single-token tweets are dropped).

file_model = file_cleaned[file_cleaned.Tweet.str.len() > 1].copy()

In [17]:
# Collect the token lists into a plain Python list, one entry per tweet.
# The tweets are already tokenised, so word_tokenize is not applied here.

from nltk.tokenize import word_tokenize
sentences = list(file_model.Tweet)

In [18]:
#bigram step to combine frequently co-occurring words into phrases; commented out for this test

#sent = [row for row in file_model.Tweet]
#phrases = Phrases(sent, min_count=1, progress_per=50000)
#bigram = Phraser(phrases)
#sentences = bigram[sent]
#phrases[0]

#### Build a word2vec model

In [19]:
# Configure the word2vec model (no training happens yet).
# One CPU core is left free, but the worker count is clamped to at least 1:
# on a single-core machine cpu_count()-1 would be 0 and leave gensim with no
# training threads.
w2v_model = Word2Vec(min_count=3,        # ignore words seen fewer than 3 times
                     window=4,           # context window size
                     size=300,           # dimensionality of the word vectors
                     sample=1e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate decays towards this value
                     negative=20,        # number of negative samples per positive example
                     workers=max(1, multiprocessing.cpu_count()-1))

start = time()

# Scan the corpus once to build the vocabulary; log progress every 50000 sentences.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 17:00:45: collecting all words and their counts
INFO - 17:00:45: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:00:45: collected 81374 word types from a corpus of 868346 raw words and 45043 sentences
INFO - 17:00:45: Loading a fresh vocabulary
INFO - 17:00:45: effective_min_count=3 retains 20847 unique words (25% of original 81374, drops 60527)
INFO - 17:00:45: effective_min_count=3 leaves 797256 word corpus (91% of original 868346, drops 71090)
INFO - 17:00:45: deleting the raw counts dictionary of 81374 items
INFO - 17:00:45: sample=1e-05 downsamples 4194 most-common words
INFO - 17:00:45: downsampling leaves estimated 252725 word corpus (31.7% of prior 797256)
INFO - 17:00:45: estimated required memory for 20847 words and 300 dimensions: 60456300 bytes
INFO - 17:00:45: resetting layer weights


Time to build vocab: 0.07 mins


#### Train a word2vec model (takes around 40 seconds)

In [20]:
# Train for 30 epochs over the tokenised tweets, logging progress every second.
start = time()

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_minutes = round((time() - start) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

# Precompute normalised vectors, replacing the raw ones (replace=True).
# NOTE(review): per the gensim docs the model cannot be trained further after this - confirm that is intended.
w2v_model.init_sims(replace=True)

INFO - 17:01:44: training model with 3 workers on 20847 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 17:01:45: EPOCH 1 - PROGRESS: at 82.56% examples, 217238 words/s, in_qsize 5, out_qsize 0
INFO - 17:01:45: worker thread finished; awaiting finish of 2 more threads
INFO - 17:01:45: worker thread finished; awaiting finish of 1 more threads
INFO - 17:01:45: worker thread finished; awaiting finish of 0 more threads
INFO - 17:01:45: EPOCH - 1 : training on 868346 raw words (252330 effective words) took 1.1s, 227313 effective words/s
INFO - 17:01:45: worker thread finished; awaiting finish of 2 more threads
INFO - 17:01:45: worker thread finished; awaiting finish of 1 more threads
INFO - 17:01:45: worker thread finished; awaiting finish of 0 more threads
INFO - 17:01:45: EPOCH - 2 : training on 868346 raw words (252943 effective words) took 0.7s, 346627 effective words/s
INFO - 17:01:46: worker thread finished; awaiting finish of 2 more threads
INFO 

INFO - 17:02:02: worker thread finished; awaiting finish of 1 more threads
INFO - 17:02:02: worker thread finished; awaiting finish of 0 more threads
INFO - 17:02:02: EPOCH - 24 : training on 868346 raw words (252596 effective words) took 0.7s, 357529 effective words/s
INFO - 17:02:03: worker thread finished; awaiting finish of 2 more threads
INFO - 17:02:03: worker thread finished; awaiting finish of 1 more threads
INFO - 17:02:03: worker thread finished; awaiting finish of 0 more threads
INFO - 17:02:03: EPOCH - 25 : training on 868346 raw words (252304 effective words) took 0.7s, 350833 effective words/s
INFO - 17:02:04: worker thread finished; awaiting finish of 2 more threads
INFO - 17:02:04: worker thread finished; awaiting finish of 1 more threads
INFO - 17:02:04: worker thread finished; awaiting finish of 0 more threads
INFO - 17:02:04: EPOCH - 26 : training on 868346 raw words (252573 effective words) took 0.8s, 333165 effective words/s
INFO - 17:02:05: worker thread finished;

Time to train the model: 0.39 mins


In [21]:
# Persist the trained model as "word2vec.model" in the current working
# directory so it can be reloaded later without retraining.

w2v_model.save("word2vec.model")

INFO - 17:02:25: saving Word2Vec object under word2vec.model, separately None
INFO - 17:02:25: not storing attribute vectors_norm
INFO - 17:02:25: not storing attribute cum_table
INFO - 17:02:27: saved word2vec.model


## Preprocess data sample for word2vec to perform vectorization

In [24]:
# Load the labelled data sample (it carries a SENTIMENT column next to the tweet metadata).
# Note: this rebinds `file`, which previously held the concatenated training tweets.
file = pd.read_excel("Labeled/Covid19UG_NoDuplicates23_05_20_labeled.xlsx")
file

Unnamed: 0.1,Unnamed: 0,Row ID,Tweet ID,Username,Tweet,Time,Tweet Type,Retweeted By,Number of Retweets,Hashtags,Mentions,Name,Location,Web,Bio,Number of Tweets,Number of Followers,Number Following,Location Coordinates,SENTIMENT
0,0,1,1263678902041526016,AhmadiyyaUg,His excellency president #Uganda @KagutaMuseve...,5/21/2020 8:51:42 PM,Retweet,AhmadiyyaKings1,4,Uganda Ahmadiyya COVID19UG,KagutaMuseveni HumanityFirstUK UgHumanityfirst...,AhmadiyyaUganda,,,Official Twitter Handle of Uganda Ahmadiyya Mu...,110,155,438,,1.0
1,1,4,1263678714124153088,NansinguzaJ,@Katabasasa @nbstv #NBSFrontline #NBSUpdates\n...,5/21/2020 8:50:58 PM,Tweet,,0,NBSFrontline NBSUpdates COVID19UG Covid_19,Katabasasa nbstv,Nansinguza Jacob,Uganda,https://t.co/8zcFVolmZH,#VisualCommunication #MedicaIillustration & #E...,38917,3300,4103,+01.25+032.5/,
2,2,7,1263677791222074880,SolomonMGrace2,Mr. ERIAS Lukwago should know that SOPs are in...,5/21/2020 8:47:18 PM,Retweet,kugonza6,1,,JudiciaryUG peter_katwesige OfwonoOpondo,Male Solomon Grace,"Kampala, Uganda",https://t.co/CUY7tvkw87,"Public Administrator, Policy Researcher and An...",8851,1246,5009,+00.31628+032.58219/,
3,3,8,1263676429994254080,Cotildakhainza,Energy Minister @DrKitutu hands over the Ugx.2...,5/21/2020 8:41:53 PM,Retweet,h_ssali,18,COVID19UG,DrKitutu Parliament_Ug,Psycho,"Kampala, Uganda",,Student of MAKERERE University Kampala. PURSUI...,1636,1341,1206,+00.31628+032.58219/,
4,4,9,1263674084405252096,matookerepublic,15 new COVID-19 cases have been confirmed by t...,5/21/2020 8:32:34 PM,Retweet,KKabajwisa,1,,,Matooke Republic,"Kampala, Uganda",https://t.co/4au2z8o6u8,Freshly Peeled Info,7186,2516,342,+00.31628+032.58219/,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5056,5056,16824,1260636518072421888,OWC_ug,@OworiSylvia : We have been engaging directly ...,5/13/2020 11:22:22 AM,Retweet,jbogube,2,,OworiSylvia OPMUganda,Wealth Creation Ug,Uganda,https://t.co/QacjAzznSK,Our mission is ''To improve household income t...,2328,3353,715,+01.25+032.5/,
5057,5057,16826,1260636408328356096,OworiSylvia,Major Rubalamira: @OWC_ug has been supplied by...,5/13/2020 11:21:55 AM,Retweet,jbogube,14,,OWC_ug GovUganda OPMUganda MAAIF_Uganda Parlia...,Sylvia Damalie Owori,,,Exec Assistant to the CC of Operation Wealth C...,48,83,107,,
5058,5058,16838,1260630805971374080,i_kagoya,It is going to be very difficult for governmen...,5/13/2020 10:59:40 AM,Retweet,ElliotOrizaarwa,1,,KagutaMuseveni RuhakanaR,Irene Kagoya,Kampala Uganda,,"A lawyer, & Women's Rights Activist, workin...",3826,792,851,+00.31628+032.58219/,
5059,5059,16839,1260630418816225024,JenifaOchwo,For making the #COVID19UG lockdown bearable......,5/13/2020 10:58:07 AM,Retweet,TheLoveDre,1,COVID19UG ThankYouUgandans,,Jenifa Ochwo,Uganda,,Thinker | Faith | Entertainment | Justice | Go...,30887,3221,831,+01.25+032.5/,


In [41]:
# Same preprocessing as for the training corpus, applied to the labelled sample:
# de-duplicate, tokenise, drop tweets with fewer than two tokens.
file_cleaned = (
    file
    .drop_duplicates(subset=["Tweet"])
    .reset_index(drop=True)
)
file_cleaned['Tweet'] = file_cleaned['Tweet'].apply(text_to_word_list, args=(unidecode,))
file_model = file_cleaned[file_cleaned.Tweet.str.len() > 1].copy()

# One token list per remaining tweet, in row order.
sentences = list(file_model.Tweet)

In [42]:
# Inspect the token list of the first tweet as a quick check of the cleaning.
sentences[0]

['his',
 'excellency',
 'president',
 'uganda',
 'kagutamuseveni',
 'recognising',
 'efforts',
 'of',
 'ahmadiyya',
 'muslim',
 'community',
 'towards',
 'covid19ug',
 'so',
 'grateful',
 'to',
 'our',
 'partners',
 'humanityfirstuk',
 'ughumanityfirst',
 'hfi1995',
 '4',
 'support',
 'accorded',
 'to',
 'the',
 'ppl',
 'of',
 'uganda',
 'ntvuganda',
 'eidmubarak',
 'eidulfitr',
 'nbsupdates']

In [43]:
# Export frame: 'old_Tweet' is the cleaned tweet as one space-joined string,
# while 'Tweet' holds the token list itself.
file_export = file_model.assign(old_Tweet=file_model.Tweet.str.join(' '))
file_export['Tweet'] = sentences

In [47]:
# Preview the three columns that are written to the test dataset.
file_export[['Tweet','old_Tweet','SENTIMENT']]

Unnamed: 0,Tweet,old_Tweet,SENTIMENT
0,"[his, excellency, president, uganda, kagutamus...",his excellency president uganda kagutamuseveni...,1.0
1,"[katabasasa, nbstv, nbsfrontline, nbsupdates, ...",katabasasa nbstv nbsfrontline nbsupdates alrig...,
2,"[mr, erias, lukwago, should, know, that, sops,...",mr erias lukwago should know that sops are ins...,
3,"[energy, minister, drkitutu, hands, over, the,...",energy minister drkitutu hands over the ugx 20...,
4,"[15, new, covid, 19, cases, have, been, confir...",15 new covid 19 cases have been confirmed by t...,
...,...,...,...
5056,"[oworisylvia, we, have, been, engaging, direct...",oworisylvia we have been engaging directly the...,
5057,"[major, rubalamira, owc, ug, has, been, suppli...",major rubalamira owc ug has been supplied by t...,
5058,"[it, is, going, to, be, very, difficult, for, ...",it is going to be very difficult for governmen...,
5059,"[for, making, the, covid19ug, lockdown, bearab...",for making the covid19ug lockdown bearable tha...,


In [48]:
# Write the token lists, the joined text and the sentiment label to CSV, without the index.
file_export.to_csv('test_dataset.csv', columns=['Tweet', 'old_Tweet', 'SENTIMENT'], index=False)