# Imports

In [1]:
#all imports
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from IPython.display import display
from nltk import TweetTokenizer
import pickle
import os
from lda import LDA
from scipy.sparse import hstack
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# label conversion dictionaries: text to num, num to text

In [2]:
dic_aggression_level = {
    'NAG' : 1,
    'CAG' : 2,
    'OAG' : 3
}

dic_reverse_aggression_level = {}
for i in dic_aggression_level:
    dic_reverse_aggression_level[dic_aggression_level[i]] = i
    
print(dic_aggression_level, '\n', dic_reverse_aggression_level)

{'OAG': 3, 'NAG': 1, 'CAG': 2} 
 {1: 'NAG', 2: 'CAG', 3: 'OAG'}


PREPARING DATA WITH PANDAS
----

In [3]:
#train data
train_pd = shuffle(pd.concat((pd.read_csv("train.csv")[['Data', 'Label']], pd.read_csv("valid.csv")[['Data', 'Label']])), random_state=20)
# train_pd['Label'].replace('CAG', 'OAG', inplace=True)
train_pd['Label_num'] = train_pd.Label.map(dic_aggression_level)

#test data
# test_fb_pd = shuffle(pd.read_csv("test_fb.csv")[['Data', 'Label']], random_state=20)
# test_fb_pd['Label_num'] = test_fb_pd.Label.map(dic_aggression_level)
# test_tw_pd = shuffle(pd.read_csv("test_tw.csv")[['Data', 'Label']], random_state=20)
# test_tw_pd['Label_num'] = test_tw_pd.Label.map(dic_aggression_level)

#test data
test_pd = pd.read_csv("test_fb.csv")
test_pd.drop('ID',1,inplace=True)
test_pd = shuffle(test_pd, random_state = 20)

# merge binary classification (CAG -> OAG)
# test_pd['Label'].replace('CAG', 'OAG', inplace=True)

test_pd['Label_num'] = test_pd.Label.map(dic_aggression_level)



print("\n\n\nTEST DATA")
print(test_pd.Label.value_counts())
display(test_pd.head(10))

#individual classes
class_wise_train_data = {}

class_wise_test_data = {}

for i in range(1,4):
    class_wise_train_data[i] = train_pd[(train_pd['Label_num']==i)]
    class_wise_test_data[i] = test_pd[(test_pd['Label_num']==i)]
    print(class_wise_train_data[i].Label.value_counts())
    display(class_wise_train_data[i].head(10))
    print(class_wise_test_data[i].Label.value_counts())
    display(class_wise_test_data[i].head(10))





TEST DATA
NAG    630
OAG    144
CAG    142
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
156,What will be done to the money we have ?,NAG,1
211,Unchange the rapo rate could lead the stagnate...,NAG,1
13,PK Movie Bhagawan Shiv Ko Aapman Kiya.. I Ha...,NAG,1
798,Thousands of people have died due to bandhs an...,CAG,2
640,Worst F.M ever.what about 5 lakhs tax limit? ...,NAG,1
568,Pak army rape modi daughter and go back. Now m...,NAG,1
321,I am clean without cash as transactions are th...,NAG,1
119,I have 5000 shares of Pnb @75.90 please tell m...,NAG,1
820,these bhagwa terrorists can't digest their mea...,OAG,3
721,Worst Decision and very worst implementation. ...,OAG,3


NAG    6285
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
5090,"Sonia I am holding Rel cap 430, please suggest...",NAG,1
7272,Car is good.. bt i must say.. i only heard 'aa...,NAG,1
11474,till now India is stuck wheather the muslims a...,NAG,1
6979,Thank God ...what about navigation system and ...,NAG,1
391,BJP running MCD for ten years. If people stil...,NAG,1
4029,What are the prospects for the Auto and Auto A...,NAG,1
4314,"How soon do you think, the 5G will go live in ...",NAG,1
2428,Ask smith??? He said india will never gonna wi...,NAG,1
11012,Well we should keep in mind Baba's word Saurab...,NAG,1
2916,the first industrial revolution in india was b...,NAG,1


NAG    630
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
156,What will be done to the money we have ?,NAG,1
211,Unchange the rapo rate could lead the stagnate...,NAG,1
13,PK Movie Bhagawan Shiv Ko Aapman Kiya.. I Ha...,NAG,1
640,Worst F.M ever.what about 5 lakhs tax limit? ...,NAG,1
568,Pak army rape modi daughter and go back. Now m...,NAG,1
321,I am clean without cash as transactions are th...,NAG,1
119,I have 5000 shares of Pnb @75.90 please tell m...,NAG,1
524,"Sir, I am looking for the this programme on TV...",NAG,1
365,Instead of watching this crap people shud show...,NAG,1
424,Name change won't change fortune of state. Was...,NAG,1


CAG    5297
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
2930,Focus on making cash available then only peo...,CAG,2
6769,Showing everything and saying bold...,CAG,2
10305,friends we have to understand the ground reali...,CAG,2
45,Bad...........,CAG,2
5374,Third rate dog,CAG,2
10762,All those who kill their wives or abandon them...,CAG,2
5611,His father did the right thing but the root ca...,CAG,2
5948,Poison hasare,CAG,2
2036,Yes - it's a cash based economy so the steps d...,CAG,2
10261,B coz he is with B J P And lots of money.........,CAG,2


CAG    142
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
798,Thousands of people have died due to bandhs an...,CAG,2
797,Wow miracle of modi Manmohan started speaking,CAG,2
253,Yes yes ..traffic population pollution unlivab...,CAG,2
707,F. PM Manmohan Singh had 10 years now he has n...,CAG,2
684,"Hello karki , How are you?A troll paid by BJP....",CAG,2
906,When asked to speak in Parliament ran away. Sp...,CAG,2
895,"Pak Army intruded into Indian terrirtory, kill...",CAG,2
842,Modi is really tiger made them even dump once ...,CAG,2
751,The minister should also give swords to brideg...,CAG,2
731,How will it reduce black money ??? can anyone ...,CAG,2


OAG    3419
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
5103,She's so ignorant Megha Mukherji,OAG,3
9074,why dont u make ur room sound proof..simple,OAG,3
140,Then what happens in pantry coach dedicated fo...,OAG,3
1756,We should respect every religion. May be he wa...,OAG,3
3168,It's like a devil think .. what tha hell is th...,OAG,3
9367,"India is a fake, Fragile and unatural union bu...",OAG,3
2796,Police did the right thing . Indian express ha...,OAG,3
249,Cheap MLA..,OAG,3
4173,Triple Talak must be ban in india ......,OAG,3
2621,Corrupted old man.. ....one word for ANNA,OAG,3


OAG    144
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
820,these bhagwa terrorists can't digest their mea...,OAG,3
721,Worst Decision and very worst implementation. ...,OAG,3
819,Pakistanis are not human being. They are just ...,OAG,3
755,"The most useless PM India ever seen, he could ...",OAG,3
748,Mr. Manmohan Singh...You are such a highly lea...,OAG,3
585,"Omer Beigh you mean to say ,then they can talk...",OAG,3
688,Feeling proud to be a citizen of developing an...,OAG,3
412,..Who is she anyway..her nonsense gets recogni...,OAG,3
693,Islam is not a religion and Allah is a big pie...,OAG,3
686,"Even if modi ji farts, bhakts be like wow what...",OAG,3


In [5]:
def something(topics):
    lda_vect = LDA(n_topics=topics, n_iter=100)
    count_vect = CountVectorizer()
    tfidf_vect = TfidfVectorizer()

    count_train = count_vect.fit_transform(train_pd['Data'])
    lda_train = lda_vect.fit_transform(count_train)
    tfidf_train = tfidf_vect.fit_transform(train_pd['Data'])
    X_train = hstack([tfidf_train, lda_train])
    y_train = train_pd['Label_num']

    X_test = hstack([tfidf_vect.transform(test_pd['Data']), lda_vect.transform(count_vect.transform(test_pd['Data']))])
    y_test = test_pd['Label_num']

    svm = SVC(kernel='linear')
    svm.fit(X_train, train_pd['Label_num'])

    y_pred = svm.predict(X_test)

    print(topics, accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='weighted'),
         confusion_matrix(y_test, y_pred))
    
from multiprocessing import Pool
process_pool = Pool(50)
process_pool.map(something, list(range(50, 301, 10)))
process_pool.close()
process_pool.join()
# something(20)

INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 180
INFO:lda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 210
INFO:lda:n_iter: 100
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 130
INFO:lda:n_iter: 100
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 50
INFO:lda:n_iter: 100
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 110
INFO:lda:n_iter: 100
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_documents: 15001
INFO:lda:n_topics: 170
INFO:lda:n_documents: 15001
INFO:lda:vocab_size: 24374
INFO:lda:n_iter: 100
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_words: 354658
INFO:lda:n_topics: 280
INFO:lda:n_topics:

  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:n_documents: 15001
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:vocab_size: 24374
INFO:lda:n_words: 354658
INFO:lda:n_topics: 250
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -5105602
INFO:lda:<0> log likelihood: -5257511
INFO:lda:<0> log likelihood: -4937476
INFO:lda:<0> log likelihood: -5232

INFO:lda:<20> log likelihood: -3409071
INFO:lda:<80> log likelihood: -3031018
INFO:lda:<50> log likelihood: -3121958
INFO:lda:<30> log likelihood: -3268669
INFO:lda:<30> log likelihood: -3242329
INFO:lda:<20> log likelihood: -3388788
INFO:lda:<40> log likelihood: -3190296
INFO:lda:<40> log likelihood: -3170517
INFO:lda:<80> log likelihood: -3051297
INFO:lda:<50> log likelihood: -3135191
INFO:lda:<30> log likelihood: -3254130
INFO:lda:<60> log likelihood: -3085179
INFO:lda:<90> log likelihood: -3024462
INFO:lda:<30> log likelihood: -3296080
INFO:lda:<20> log likelihood: -3416280
INFO:lda:<20> log likelihood: -3445343
INFO:lda:<20> log likelihood: -3428499
INFO:lda:<90> log likelihood: -3043979
INFO:lda:<99> log likelihood: -3017974
INFO:lda:<20> log likelihood: -3445520
INFO:lda:<60> log likelihood: -3107161
INFO:lda:<70> log likelihood: -3074916
INFO:lda:<20> log likelihood: -3460192
INFO:lda:<30> log likelihood: -3283514
INFO:lda:<50> log likelihood: -3149160
INFO:lda:<40> log likelih

50 0.5611353711790393 0.6002687057611727 [[361 218  51]
 [ 36  86  20]
 [ 25  52  67]]


INFO:lda:<60> log likelihood: -3342790
INFO:lda:<80> log likelihood: -3250875
INFO:lda:<99> log likelihood: -3188600
INFO:lda:<80> log likelihood: -3262800
INFO:lda:<70> log likelihood: -3305089
INFO:lda:<60> log likelihood: -3348743
INFO:lda:<99> log likelihood: -3203445
INFO:lda:<80> log likelihood: -3269930
INFO:lda:<70> log likelihood: -3313958
INFO:lda:<99> log likelihood: -3217316
INFO:lda:<90> log likelihood: -3231540
INFO:lda:<80> log likelihood: -3286472
INFO:lda:<90> log likelihood: -3243885


60 0.5633187772925764 0.6033123461243413 [[362 221  47]
 [ 40  83  19]
 [ 23  50  71]]


INFO:lda:<90> log likelihood: -3252166
INFO:lda:<80> log likelihood: -3293514
INFO:lda:<70> log likelihood: -3330243
INFO:lda:<90> log likelihood: -3260520
INFO:lda:<99> log likelihood: -3227235
INFO:lda:<80> log likelihood: -3302330
INFO:lda:<70> log likelihood: -3339393
INFO:lda:<90> log likelihood: -3277669
INFO:lda:<99> log likelihood: -3240555


70 0.5753275109170306 0.6127054529261785 [[370 211  49]
 [ 36  86  20]
 [ 26  47  71]]


INFO:lda:<99> log likelihood: -3248677
INFO:lda:<99> log likelihood: -3253737
INFO:lda:<90> log likelihood: -3286352
INFO:lda:<80> log likelihood: -3318582


80 0.5687772925764192 0.6073274173032666 [[363 218  49]
 [ 32  90  20]
 [ 25  51  68]]


INFO:lda:<90> log likelihood: -3294742
INFO:lda:<99> log likelihood: -3272966
INFO:lda:<80> log likelihood: -3328690
INFO:lda:<99> log likelihood: -3280245
INFO:lda:<90> log likelihood: -3310508
INFO:lda:<99> log likelihood: -3287119


90 0.5644104803493449 0.6027784366619292 [[360 221  49]
 [ 33  89  20]
 [ 27  49  68]]


INFO:lda:<90> log likelihood: -3322436


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

INFO:lda:<99> log likelihood: -3306270


110 0.5567685589519651 0.5958809365250254 [[356 223  51]
 [ 34  87  21]
 [ 27  50  67]]


INFO:lda:<99> log likelihood: -3314037


130 0.5611353711790393 0.6003646554263183 [[361 216  53]
 [ 39  82  21]
 [ 24  49  71]]
140 0.5622270742358079 0.6027769201836191 [[356 227  47]
 [ 36  86  20]
 [ 22  49  73]]
160 0.5513100436681223 0.5899359870596186 [[352 228  50]
 [ 33  89  20]
 [ 30  50  64]]
170 0.5600436681222707 0.5991123585224161 [[364 216  50]
 [ 39  83  20]
 [ 26  52  66]]
180 0.5622270742358079 0.6010760779125209 [[363 216  51]
 [ 36  85  21]
 [ 26  51  67]]
190 0.5491266375545851 0.5879646334114175 [[352 224  54]
 [ 34  87  21]
 [ 29  51  64]]
200 0.5600436681222707 0.5998154438077892 [[361 218  51]
 [ 37  84  21]
 [ 24  52  68]]
210 0.5622270742358079 0.6025036278084566 [[356 225  49]
 [ 31  91  20]
 [ 22  54  68]]
230 0.5644104803493449 0.6035819801268687 [[359 223  48]
 [ 35  87  20]
 [ 25  48  71]]
240 0.5545851528384279 0.5935124173280635 [[353 228  49]
 [ 31  90  21]
 [ 29  50  65]]
220 0.5622270742358079 0.6020467540550084 [[360 221  49]
 [ 35  88  19]
 [ 23  54  67]]
250 0.5589519650655022 0.5983432