<a href="https://colab.research.google.com/github/harrymkwn/hackinutu/blob/main/Unsupervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tqdm import tqdm
from collections import defaultdict
import itertools
import emoji
import re
import tensorflow as tf
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import multiprocessing
from google.colab import  drive
drive.mount('/content/drive')

In [None]:
def parse_lines(lines):
    u = [] # uids
    t = [] # tokens
    l = [] # token labels
    s = [] # sentiment labels
    max_length = 0

    print("Parsing lines from file...")
    for i, line in tqdm(enumerate(lines), total=len(lines)):
        line = line.strip().split('\t')
        if line[0]=='meta':
            if i!=0:
                u.append(buffer_id)
                t.append(buffer_tokens)
                l.append(buffer_labels)
                s.append(buffer_sentiment)
                if len(buffer_tokens) > max_length:
                    max_length = len(buffer_tokens)
            buffer_id = line[1]
            try:
                buffer_sentiment = line[2]
            except:
                buffer_sentiment = ''
            buffer_tokens = []
            buffer_labels = []
        else:
            buffer_tokens.append(line[0])
            try:
                buffer_labels.append(line[1])
            except:
                buffer_labels.append('')

    u.append(buffer_id)
    t.append(buffer_tokens)
    l.append(buffer_labels)
    s.append(buffer_sentiment)
    if len(buffer_tokens) > max_length:
        max_length = len(buffer_tokens)

    num_samples = len(u)
    
    return u, t, l, s, max_length


In [None]:
train = open('/content/drive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/Hinglish_train_14k_split_conll.txt', encoding='utf8').readlines()
valid = open('/content/drive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/Hinglish_dev_3k_split_conll.txt', encoding='utf8').readlines()
test = open('/content/drive/My Drive/InfluenceAnalysis/CodeMix/Hinglish/Hinglish_test_unalbelled_conll_updated.txt', encoding='utf8').readlines()

u_train, t_train, l_train, s_train, max_length = parse_lines(train)
u_dev, t_dev, l_dev, s_dev, max_length_dev = parse_lines(valid)
u_test, t_test, l_test, s_test, max_length_test = parse_lines(test)


 20%|█▉        | 78330/393560 [00:00<00:00, 783298.88it/s]

Parsing lines from file...


100%|██████████| 393560/393560 [00:00<00:00, 887882.49it/s]
100%|██████████| 84678/84678 [00:00<00:00, 830168.72it/s]
  0%|          | 0/84362 [00:00<?, ?it/s]

Parsing lines from file...
Parsing lines from file...


100%|██████████| 84362/84362 [00:00<00:00, 852266.81it/s]


In [None]:
positive = pd.read_csv('/content/drive/My Drive/InfluenceAnalysis/CodeMix/positive-words.csv')
positive = list(positive['0'])
negative = pd.read_csv('/content/drive/My Drive/InfluenceAnalysis/CodeMix/negative-words.csv')
negative = list(negative['0'])

print(positive)
print(negative)

['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade', 'accolades', 'accommodative', 'accomodative', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accurate', 'accurately', 'achievable', 'achievement', 'achievements', 'achievible', 'acumen', 'adaptable', 'adaptive', 'adequate', 'adjustable', 'admirable', 'admirably', 'admiration', 'admire', 'admirer', 'admiring', 'admiringly', 'adorable', 'adore', 'adored', 'adorer', 'adoring', 'adoringly', 'adroit', 'adroitly', 'adulate', 'adulation', 'adulatory', 'advanced', 'advantage', 'advantageous', 'advantageously', 'advantages', 'adventuresome', 'adventurous', 'advocate', 'advocated', 'advocates', 'affability', 'affable', 'affably', 'affectation', 'affection', 'affectionate', 'affinity', 'affirm', 'affirmation', 'affirmative', 'affluence', 'affluent', 'afford', 'affordable', 'affordably', 'afordable', 'agile', 'agilely', 'agility', 'agreeable', 'ag

In [None]:
def load_dict_smileys():
    return {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }

# source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
def load_dict_contractions():
    return {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }


def tweet_cleaning_for_sentiment_analysis(tweet):
    # lower case
    tweet = tweet.lower()
        
    # replace contractions
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’","'")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    tweet = " ".join(reformed)
    
    # standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    
    # replace emoticons
    SMILEY = load_dict_smileys()  
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    
    # demojize emojis
    tweet = emoji.demojize(tweet)
    
    # other cleaning
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split())
    # replace duplicate characters
    tweet = re.sub(r"(.)\1{2,}", r'\1\1', tweet)

    return tweet

def clean(t,l):
    for i in range(len(t)):
        temp = tweet_cleaning_for_sentiment_analysis(' '.join(t[i])).split(' ')
        t[i] = []
        j=0
        while j<len(temp):
            t[i].append(temp[j])
            j+=1
        
    return t, l

In [None]:
t_train, l_train = clean(t_train, l_train)
t_dev, l_dev = clean(t_dev, l_dev)
t_test, l_test = clean(t_test, l_test)
print(t_train[:10])

[['nen', 'á', 'vist', 'bolest', 'vztek', 'smutek', 'zmatek', 'osam', 'ě', 'lost', 'beznad', 'ě', 'j', 'a', 'nakonec', 'jen', 'klid', 'asi', 'takhle', 'vypad', 'á', 'm', 'ů', 'j', 'life', '..'], ['@', 'nehantics', 'haan', 'yaar', 'neha', 'pensive_face', 'pensive_face', 'kab', 'karega', 'woh', 'post', 'loudly_crying_face', 'usne', 'na', 'sach', 'mein', 'photoshoot', 'karna', 'chahiye', 'phir', 'woh', 'post', 'karega', '…', 'https', '//', 'tco', '/', '5rslsbzntt'], ['@', 'rahulgandhi', 'television', 'media', 'congress', 'ke', 'liye', 'nhi', 'h', '.', 'ye', 'toh', 'aapko', 'pata', 'chal', 'hi', 'gya', 'hoga', '.', 'achha', 'hoga', 'ki', 'congress', 'ke', '…', 'https', '//', 't', '.', 'co', '/', 'hmh8m7ptak'], ['@', 'amitshah', '@', 'narendramodi', 'all', 'india', 'me', 'nrc', 'lagu', 'kare', 'w', 'kashmir', 'se', 'dhara', '370ko', 'khatam', 'kare', 'ham', 'indian', 'ko', 'apse', 'yahi', 'umid', 'hai'], ['@', 'nehr', '_', 'who', '@', 'typomantri', '@', 'anjanaomkashyap', 'pagal', 'hai', 'ky

In [None]:
type(t_train)
len(t_train)
w2v_model = Word2Vec(min_count=10,
                     window=7,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=4)
w2v_model.build_vocab(t_train, progress_per=1000)
w2v_model.train(t_train, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print(w2v_model)

Word2Vec(vocab=3048, size=300, alpha=0.03)


In [None]:
dir(w2v_model)

In [None]:
w2v_model.wv.most_similar(positive=["nhi"])

  if np.issubdtype(vec.dtype, np.int):


[('h', 0.9997829794883728),
 ('tum', 0.9994261264801025),
 ('ab', 0.999393105506897),
 ('koi', 0.9993726015090942),
 ('log', 0.9991849064826965),
 ('ne', 0.9991600513458252),
 ('desh', 0.9990662932395935),
 ('hain', 0.9988721013069153),
 ('bjp', 0.9987276792526245),
 ('https', 0.9986196160316467)]

In [None]:
model = KMeans(n_clusters=3, max_iter=1000, random_state=True, n_init=100).fit(X=w2v_model.wv.vectors)
negative_cluster_center = model.cluster_centers_[0]
neutral_cluster_center = model.cluster_centers_[1]
positive_cluster_center = model.cluster_centers_[2]

In [None]:
words = pd.DataFrame(w2v_model.wv.vocab.keys())
all_words = list(words[0])
print(all_words)
print(words.head())
words.columns = ['words']
words['vectors'] = words['words'].apply(lambda x: w2v_model.wv[f'{x}'])

words['cluster'] = words['vectors'].apply(lambda x: model.predict([np.array(x)]))
words.cluster = words['cluster'].apply(lambda x: x[0])
print(words.head())

words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
print(words.head())

words.set_index('words',inplace=True)
print(words.head())
print(words.loc['lost']['cluster'])

['á', 'lost', 'j', 'a', 'm', 'life', '..', '@', 'haan', 'yaar', 'pensive_face', 'kab', 'karega', 'woh', 'post', 'loudly_crying_face', 'usne', 'na', 'sach', 'mein', 'karna', 'chahiye', 'phir', '…', 'https', '//', 'tco', '/', 'rahulgandhi', 'media', 'congress', 'ke', 'liye', 'nhi', 'h', '.', 'ye', 'toh', 'aapko', 'pata', 'chal', 'hi', 'gya', 'hoga', 'achha', 'ki', 't', 'co', 'amitshah', 'narendramodi', 'all', 'india', 'me', 'kare', 'w', 'kashmir', 'se', 'dhara', 'khatam', 'ham', 'indian', 'ko', 'apse', 'yahi', 'umid', 'hai', 'nehr', '_', 'who', 'anjanaomkashyap', 'pagal', 'kya', '?', 'they', "'", 'real', 'mandir', 'is', 'important', 'hindu', 'mei', 'jo', '!', 'jeet', 'dher', 'sari', 'subh', 'modi', 'ji', 'asha', 'karta', 'hu', 'desh', 'janta', 'ne', 'kar', 'bihar', 'but', 'walay', 'babu', 'tau', 'new', 'job', 'gi', 'aajtak', 'chitraaum', 'walon', 'bhi', 'maza', 'ata', 'muslim', 'debate', 'karne', 'tarekfatah', 'tere', 'pakistan', 'therealpcb', 'chutiye', 'khelne', 'de', 'unhe', 'bc', 'tu

In [None]:
print(type(t_train))
print(type(t_train[0]))
print(w for w in t_train[0])

a = ["a","a","a"]
print([x for x in a])

a = [1,2,4,5]
print(sum(a))

<class 'list'>
<class 'list'>
<generator object <genexpr> at 0x7f3dfb2d4f10>
['a', 'a', 'a']
12


In [None]:
cluster_score = [sum([ words.loc[w]['closeness_score'] for w in x if (w in all_words and words.loc[w]['cluster']==0)]) for x in t_train]

In [None]:
cluster_score1 = [sum([ words.loc[w]['closeness_score'] for w in x if (w in all_words and words.loc[w]['cluster']==1)]) for x in t_train]

In [None]:
cluster_score2 = [sum([ words.loc[w]['closeness_score'] for w in x if (w in all_words and words.loc[w]['cluster']==2)]) for x in t_train]

In [None]:
tweets = [' '.join(x) for x in t_train]
print(tweets[:10])

['nen á vist bolest vztek smutek zmatek osam ě lost beznad ě j a nakonec jen klid asi takhle vypad á m ů j life ..', '@ nehantics haan yaar neha pensive_face pensive_face kab karega woh post loudly_crying_face usne na sach mein photoshoot karna chahiye phir woh post karega … https // tco / 5rslsbzntt', '@ rahulgandhi television media congress ke liye nhi h . ye toh aapko pata chal hi gya hoga . achha hoga ki congress ke … https // t . co / hmh8m7ptak', '@ amitshah @ narendramodi all india me nrc lagu kare w kashmir se dhara 370ko khatam kare ham indian ko apse yahi umid hai', "@ nehr _ who @ typomantri @ anjanaomkashyap pagal hai kya ? they aren ' t real issues mandir is important hindu khatre mei jo hai !", '@ narendramodi jeet ki dher sari subh kamnaye modi ji asha karta hu jistarah desh ki janta ne khas kar bihar ki jant … https // t co / xpczj7gfqc', '@ fakeionist @ samjhotaxpress @ plichapel but topi walay babu ki tau new job hougi humey chutti kesay milay gi ? loudly_crying_face 

In [None]:
def max_find(row):
  return max(row[0],max(row[1],row[2]))

In [None]:
def max_cluster(row):
  
  return 0 if row[0]>row[1] and row[0]>row[2] else 1 if row[1]>row[2] else 2

In [None]:
dic = {'0': cluster_score,'1':cluster_score1,'2':cluster_score2}
clusters_score = pd.DataFrame(dic)
# print(clusters_score)
clusters_score['sentiment'] = clusters_score.apply(lambda row : max_find(row),axis=1)
clusters_score['cluster'] = clusters_score.apply(lambda row : max_cluster(row),axis=1)

clusters_score.head(100)

Unnamed: 0,0,1,2,sentiment,cluster
0,8.898035,13.023934,2.869511,13.023934,1
1,67.506583,2.138689,0.000000,67.506583,0
2,60.865258,0.000000,0.000000,60.865258,0
3,52.891367,3.469800,1.408389,52.891367,0
4,34.699346,16.397625,3.728629,34.699346,0
...,...,...,...,...,...
95,13.653378,11.283460,10.093418,13.653378,0
96,55.034941,3.409673,1.893247,55.034941,0
97,15.270427,18.325283,6.945798,18.325283,1
98,39.013388,0.000000,2.634449,39.013388,0


In [None]:
clusters_score['cluster'].value_counts()

0    11757
1     1836
2      407
Name: cluster, dtype: int64

In [159]:
cluster0 = words[words['cluster']==0]
cluster1 = words[words['cluster']==1]
cluster2 = words[words['cluster']==2]

cluster0.reset_index(inplace=True)
cluster0 = list(cluster0['words'])
cluster1.reset_index(inplace=True)
cluster1 = list(cluster1['words'])
cluster2.reset_index(inplace=True)
cluster2 = list(cluster2['words'])

print(cluster0)
print(cluster1)
print(cluster2)

#for positive




['j', '..', '@', 'haan', 'yaar', 'pensive_face', 'kab', 'karega', 'woh', 'post', 'usne', 'na', 'sach', 'mein', 'karna', 'chahiye', 'phir', '…', 'https', '//', 'tco', '/', 'rahulgandhi', 'media', 'congress', 'ke', 'liye', 'nhi', 'h', '.', 'ye', 'toh', 'aapko', 'pata', 'chal', 'hi', 'gya', 'hoga', 'achha', 'ki', 't', 'co', 'amitshah', 'narendramodi', 'india', 'me', 'kare', 'kashmir', 'se', 'dhara', 'khatam', 'ham', 'indian', 'ko', 'apse', 'yahi', 'umid', 'hai', 'nehr', 'anjanaomkashyap', 'pagal', 'kya', 'mandir', 'hindu', 'mei', 'jo', 'jeet', 'dher', 'sari', 'modi', 'asha', 'karta', 'hu', 'desh', 'janta', 'ne', 'kar', 'bihar', 'walay', 'babu', 'tau', 'gi', 'aajtak', 'chitraaum', 'walon', 'bhi', 'maza', 'ata', 'muslim', 'debate', 'karne', 'tarekfatah', 'tere', 'pakistan', 'therealpcb', 'chutiye', 'khelne', 'de', 'unhe', 'bc', 'tu', 'nahi', 'shaadi', 'seat', 'vote', 'aur', 'apne', 'pak', 'os', 'k', 'dil', 'ka', 'nai', 'krta', 'sb', 'ache', 'aor', 'dua', 'ma', 'ho', 'bahut', 'faisla', 'liya

TypeError: ignored

In [None]:
w2v_model.wv.similar_by_vector(model.cluster_centers_[2], topn=10, restrict_vocab=None)


  if np.issubdtype(vec.dtype, np.int):


[('&', 0.9997625350952148),
 ('god', 0.9997386932373047),
 ('day', 0.9995259046554565),
 ('very', 0.9992626905441284),
 ('much', 0.9991232752799988),
 ('!!', 0.9990979433059692),
 ('thank', 0.999026894569397),
 ('best', 0.9989185333251953),
 ('hope', 0.998799741268158),
 ('birthday', 0.9987848997116089)]

In [None]:
w2v_model.wv.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)


  if np.issubdtype(vec.dtype, np.int):


[('lady', 0.9999334812164307),
 ('energy', 0.9999284744262695),
 ('voice', 0.9999275803565979),
 ('cat', 0.9999254941940308),
 ('winning', 0.9999242424964905),
 ('end', 0.9999192357063293),
 ('missing', 0.9999159574508667),
 ('mind', 0.9999157190322876),
 ('bro', 0.9999149441719055),
 ('af', 0.9999148845672607)]

In [None]:
w2v_model.wv.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)


  if np.issubdtype(vec.dtype, np.int):


[('jayega', 0.9999430179595947),
 ('pankhuripathak', 0.9999408721923828),
 ('chup', 0.9999403357505798),
 ('kai', 0.9999394416809082),
 ('bik', 0.9999385476112366),
 ('ashraffem', 0.9999385476112366),
 ('thesamirabbas', 0.999938428401947),
 ('jitni', 0.9999381303787231),
 ('bo', 0.999937891960144),
 ('dalali', 0.9999377131462097)]