# Imports

In [1]:
#all imports
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from IPython.display import display
from nltk import TweetTokenizer
import pickle
import os

pd.options.display.max_rows = 100

# label conversion dictionaries: text to num, num to text

In [2]:
dic_aggression_level = {
    'NAG' : 1,
    'CAG' : 2,
    'OAG' : 3
}

dic_reverse_aggression_level = {}
for i in dic_aggression_level:
    dic_reverse_aggression_level[dic_aggression_level[i]] = i
    
print(dic_aggression_level, '\n', dic_reverse_aggression_level)

{'CAG': 2, 'OAG': 3, 'NAG': 1} 
 {1: 'NAG', 2: 'CAG', 3: 'OAG'}


PREPARING DATA WITH PANDAS
----

In [3]:
#train data
train_pd = shuffle(pd.concat((pd.read_csv("train.csv")[['Data', 'Label']], pd.read_csv("valid.csv")[['Data', 'Label']])), random_state=20)
# train_pd['Label'].replace('CAG', 'OAG', inplace=True)
train_pd['Label_num'] = train_pd.Label.map(dic_aggression_level)

#test data
# test_fb_pd = shuffle(pd.read_csv("test_fb.csv")[['Data', 'Label']], random_state=20)
# test_fb_pd['Label_num'] = test_fb_pd.Label.map(dic_aggression_level)
# test_tw_pd = shuffle(pd.read_csv("test_tw.csv")[['Data', 'Label']], random_state=20)
# test_tw_pd['Label_num'] = test_tw_pd.Label.map(dic_aggression_level)

#test data
test_pd = pd.read_csv("test_fb.csv")
test_pd.drop('ID',1,inplace=True)
test_pd = shuffle(test_pd, random_state = 20)

# merge binary classification (CAG -> OAG)
# test_pd['Label'].replace('CAG', 'OAG', inplace=True)

test_pd['Label_num'] = test_pd.Label.map(dic_aggression_level)


print("TRAIN DATA")
print(train_pd.Label.value_counts())
display(train_pd.head(10))

print("\n\n\nTEST DATA")
print(test_pd.Label.value_counts())
display(test_pd.head(10))

TRAIN DATA
NAG    6285
CAG    5297
OAG    3419
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
2930,Focus on making cash available then only peo...,CAG,2
5103,She's so ignorant Megha Mukherji,OAG,3
5090,"Sonia I am holding Rel cap 430, please suggest...",NAG,1
9074,why dont u make ur room sound proof..simple,OAG,3
6769,Showing everything and saying bold...,CAG,2
140,Then what happens in pantry coach dedicated fo...,OAG,3
1756,We should respect every religion. May be he wa...,OAG,3
7272,Car is good.. bt i must say.. i only heard 'aa...,NAG,1
10305,friends we have to understand the ground reali...,CAG,2
45,Bad...........,CAG,2





TEST DATA
NAG    630
OAG    144
CAG    142
Name: Label, dtype: int64


Unnamed: 0,Data,Label,Label_num
156,What will be done to the money we have ?,NAG,1
211,Unchange the rapo rate could lead the stagnate...,NAG,1
13,PK Movie Bhagawan Shiv Ko Aapman Kiya.. I Ha...,NAG,1
798,Thousands of people have died due to bandhs an...,CAG,2
640,Worst F.M ever.what about 5 lakhs tax limit? ...,NAG,1
568,Pak army rape modi daughter and go back. Now m...,NAG,1
321,I am clean without cash as transactions are th...,NAG,1
119,I have 5000 shares of Pnb @75.90 please tell m...,NAG,1
820,these bhagwa terrorists can't digest their mea...,OAG,3
721,Worst Decision and very worst implementation. ...,OAG,3


In [4]:
max_features = 20000

feature_dict = {
    'unigram' : TfidfVectorizer(max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'bigram'  : TfidfVectorizer(ngram_range=(2,2), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'trigram'  : TfidfVectorizer(ngram_range=(3,3), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'quadgram'  : TfidfVectorizer(ngram_range=(4,4), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'fivegram'  : TfidfVectorizer(ngram_range=(5,5), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-2_gram'  : TfidfVectorizer(ngram_range=(1, 2), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-3_gram'  : TfidfVectorizer(ngram_range=(1, 3), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-4_gram'  : TfidfVectorizer(ngram_range=(1, 4), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '2-3_gram'  : TfidfVectorizer(ngram_range=(2, 3), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '2-4_gram'  : TfidfVectorizer(ngram_range=(2, 4), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '3-4_gram'  : TfidfVectorizer(ngram_range=(3, 4), max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'unigram_without_stopwords' : TfidfVectorizer(stop_words='english', 
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'bigram_without_stopwords'  : TfidfVectorizer(ngram_range=(2,2), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'trigram_without_stopwords'  : TfidfVectorizer(ngram_range=(3,3), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'quadgram_without_stopwords'  : TfidfVectorizer(ngram_range=(4,4), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    'fivegram_without_stopwords'  : TfidfVectorizer(ngram_range=(5,5), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-2_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(1, 2), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-3_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(1, 3), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-4_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(1, 4), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '2-3_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(2, 3), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '2-4_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(2, 4), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '3-4_gram_without_stopwords'  : TfidfVectorizer(ngram_range=(3, 4), stop_words='english',
                                                  max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
    '1-1_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,1), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-2_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,2), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-3_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,3), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-4_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,4), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-5_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,5), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'1-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(1,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-2_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,2), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-3_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,3), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-4_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,4), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-5_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,5), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'2-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(2,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-3_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,3), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-4_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,4), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-5_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,5), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'3-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(3,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'4-4_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(4,4), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'4-5_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(4,5), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'4-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(4,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'4-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(4,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'4-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(4,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'5-5_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(5,5), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'5-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(5,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'5-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(5,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'5-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(5,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'6-6_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(6,6), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'6-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(6,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'6-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(6,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'7-7_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(7,7), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'7-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(7,8), 
                                    max_features=max_features, min_df=3, tokenizer=TweetTokenizer().tokenize),
	'8-8_char_gram'  : TfidfVectorizer(analyzer='char', ngram_range=(8,8), 
                                    max_features=max_features, min_df=3)
}

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from nltk import TweetTokenizer

word_count_dict = {}
word_set = set()
tweeter = TweetTokenizer()

#NAG
for row_number in range(len(train_pd)):
    row = train_pd.iloc[row_number]
    for word in set(tweeter.tokenize(row.Data)):
        if word not in word_set:
            word_set.add(word)
            word_count_dict[word] = [0, 0, 0]
        word_count_dict[word][row.Label_num-1] += 1

from functools import reduce
def entropy(x):
    if x == 0:
        return -0
    return x*np.log(x)

word_entropy_dict = {}
_good = []
_bad = []
_all = []

for word in word_count_dict:
    total_words = np.sum(word_count_dict[word])
    word_entropy_dict[word] = np.abs(np.sum(list(map(lambda x: entropy(x/total_words), np.array(word_count_dict[word])))))
    
# conversion_ratio = len(word_entropy_dict.items())/np.sum(list(zip(*word_entropy_dict.items()))[1])
entropy_list = list(zip(*word_entropy_dict.items()))[1]
max_entropy = max(entropy_list)
min_entropy = min(entropy_list)
count_list = np.log(np.sum(np.array(list(zip(*word_count_dict.items()))[1]), axis=1))
count_list = count_list[count_list != 0]
count_list = 1/count_list
max_count = max(count_list)
min_count = min(count_list)
print(max_entropy, min_entropy, max_count, min_count)
del entropy_list
del count_list
def map_count_to_entropy_range(x):
    return min_entropy + (x - min_count) * (max_entropy - min_entropy) / (max_count - min_count)
    
count_weight = 7
for word in word_count_dict:
    _all.append((word, word_count_dict[word], (word_entropy_dict[word]) + count_weight*map_count_to_entropy_range(1/np.log(np.sum(word_count_dict[word])))))
    word_entropy_dict[word] += count_weight*map_count_to_entropy_range(1/np.log(np.sum(word_count_dict[word])))
#     if word_entropy_dict[word] + 1/np.log(np.sum(word_count_dict[word])) < 1.0986122886681096 and total_words > 3:
#         _good.append((word, word_count_dict[word], word_entropy_dict[word]+ conversion_ratio/np.log(np.sum(word_count_dict[word]))))
#     else:
#         _bad.append((word, word_count_dict[word], word_entropy_dict[word]+ conversion_ratio/np.log(np.sum(word_count_dict[word]))))
        

1.0986122886681096 0.0 1.4426950408889634 0.1143182336230831




In [11]:
display(pd.DataFrame(_all, columns=['word', 'count', 'entropy']).sort_values('entropy').head(12500).tail(100))

Unnamed: 0,word,count,entropy
20928,sheep,"[1, 1, 0]",5.545177
19937,emerged,"[1, 1, 0]",5.545177
848,Pvt,"[1, 1, 0]",5.545177
7178,peopl,"[1, 1, 0]",5.545177
881,92,"[1, 1, 0]",5.545177
18592,RAPE,"[1, 1, 0]",5.545177
28137,insects,"[1, 1, 0]",5.545177
1340,Mahatma,"[1, 1, 0]",5.545177
1339,rejecting,"[1, 1, 0]",5.545177
3235,FEW,"[1, 1, 0]",5.545177


In [75]:
word_search = 'grounds'
count = 0

for sentence in train_pd['Data']:
    if word_search in tweeter.tokenize(sentence):
        print(count, '\n', sentence, end='\n\n\n')
        count += 1

0 
 Dear Indian Express,
I feel sorry to write that your journalists are out there to misinterpret events and present them in a distorted manner just to make some sensational headlines. The news about upcoming singer Nahid Afreen has been prepared without proper research and investigation. It was very sad to see that your journalists don't even know the difference between fatwa and appeal. 
A fatwa is a non-binding legal opinion issued only by a qualified jurist (Mufti) on a request by someone who is unable to understand religious guidelines whereas the leaflet that is being circulated and presented as Fatwa is just a hard copy of an appeal signed by 46 men who believe in religious practices. The merit of appeal can be discussed on many grounds and in many dimensions but first and foremost, an appeal can not be called a legal opinion (Fatwa). 
We expect apology from your side for such a mistake.
http://indianexpress.com/article/india/fatwa-issued-against-reality-singing-star-nahid-

11 
 http://indianexpress.com/article/india/fatwa-issued-against-reality-singing-star-nahid-afrin-by-42-clerics-4569825/

Dear Indian Express,
I feel sorry to write that your journalists are out there to misinterpret events and present them in a distorted manner just to make some sensational headlines. The news about upcoming singer Nahid Afreen has been prepared without proper research and investigation. It was very sad to see that your journalists don't even know the difference between fatwa and appeal. 
A fatwa is a non-binding legal opinion issued only by a qualified jurist (Mufti) on a request by someone who is unable to understand religious guidelines whereas the leaflet that is being circulated and presented as Fatwa is just a hard copy of an appeal signed by 46 men who believe in religious practices. The merit of appeal can be discussed on many grounds and in many dimensions but first and foremost, an appeal can not be called a legal opinion (Fatwa). 
We expect apology fro

27 
 Cow ban - no forced religiousness 
An old man beaten to deat on doubtful grounds by cow vigilants - no forced religiousness.
Day and might concerts with loud speakers : no forced religiousness. Use of loud speakers on navratre/ganesh chathurthi/holi/ jagran : no forced religiousness.
1 min azan : forced religiousness 
We muslims can give up cow meat for our hindu friends!! No big deal, but such horrendous statements by hindu celebrities are disgrace to secular status***


28 
 Dear Indian Express,
I feel sorry to write that your journalists are out there to misinterpret events and present them in a distorted manner just to make some sensational headlines. The news about upcoming singer Nahid Afreen has been prepared without proper research and investigation. It was very sad to see that your journalists don't even know the difference between fatwa and appeal. 
A fatwa is a non-binding legal opinion issued only by a qualified jurist (Mufti) on a request by someone who is unabl

47 
 Dear Indian Express,
I feel sorry to write that your journalists are out there to misinterpret events and present them in a distorted manner just to make some sensational headlines. The news about upcoming singer Nahid Afreen has been prepared without proper research and investigation. It was very sad to see that your journalists don't even know the difference between fatwa and appeal. 
A fatwa is a non-binding legal opinion issued only by a qualified jurist (Mufti) on a request by someone who is unable to understand religious guidelines whereas the leaflet that is being circulated and presented as Fatwa is just a hard copy of an appeal signed by 46 men who believe in religious practices. The merit of appeal can be discussed on many grounds and in many dimensions but first and foremost, an appeal can not be called a legal opinion (Fatwa). 
We expect apology from your side for such a mistake.


48 
 http://indianexpress.com/article/india/fatwa-issued-against-reality-singing-star

In [None]:
word = 'hate'
print(word_count_dict[word], word_entropy_dict[word] + 1/np.log(np.sum(word_count_dict[word])))

In [6]:
def gimme_words_over_a_certain_threshold(threshold, min_df):
    vocabulary = []
    for word in word_entropy_dict:
        if word_entropy_dict[word] < threshold and sum(word_count_dict[word]) > min_df:
            vocabulary.append(word)

    print(threshold, len(vocabulary))
    return vocabulary
    
    
def something(stop_word_threshold, min_df = 3):
    vocabulary_for_train_data = gimme_words_over_a_certain_threshold(stop_word_threshold, min_df)
    if len(vocabulary_for_train_data) < 5:
        return

    dummy_tfidf = TfidfVectorizer(tokenizer=tweeter.tokenize,
                                vocabulary=vocabulary_for_train_data)

    X_train = dummy_tfidf.fit_transform(train_pd.Data)
    y_train = train_pd.Label_num
    X_test = dummy_tfidf.transform(test_pd.Data)
    y_test = test_pd.Label_num
#     count = 10
#     for bleh in dummy_tfidf.vocabulary_:
#         print(bleh, end=', ')
#         count -= 1
#         if count == 0:
#             break
    print()
    print(stop_word_threshold, "Shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
#     X_train, y_train, X_test, y_test = remove_irrelevant_samples(X_train.toarray(), y_train, X_test.toarray(), y_test)

    svm_classifier = SVC(kernel='linear')
    %time svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    print(stop_word_threshold, accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='weighted'))
    print(stop_word_threshold, confusion_matrix(y_test, y_pred))
    print('\n\n')
    
from multiprocessing import Pool
process_pool = Pool(processes=50)

for j in range(1, 560, 5):
    process_pool.apply_async(something, args=(j/100, 0))
# something(8.5)

process_pool.close()
process_pool.join()
print("Done")

0.06 0
0.21 0
0.16 0
0.01 0
0.41 0
0.81 1
0.36 0
0.26 0
0.86 2
0.11 0
0.31 0
1.01 7
0.96 4
1.11 16
1.36 145
1.61 542
0.46 0
1.06 8
0.71 1
1.16 29
1.46 282
2.26 2264
1.66 637
2.31 2392
1.96 1342
2.41 2732
0.76 1
0.91 4
1.51 345
2.11 1784
2.86 4027
1.81 973
2.51 3017
1.56 442
1.31 102
2.16 1960
0.51 0
0.56 0
2.71 3620
2.81 3900
2.46 2851
2.91 4163
1.26 68
2.56 3143
1.41 204
1.21 48
3.16 4867
3.06 4731
3.01 4487
2.61 3319
2.36 2594
1.76 867
1.86 1086
1.71 748
2.01 1491
2.66 3465
2.06 1654
2.21 2096
1.91 1210
2.76 3785
2.96 4380
3.26 5122
3.11 4867
0.61 0
0.66 1
3.36 5368
3.31 5321
3.21 5122
3.41 5448
3.46 5744

2.71 Shape:  (15001, 3620) (15001,) (916, 3620) (916,)

1.61 Shape:  (15001, 542) (15001,) (916, 542) (916,)

1.96 Shape:  (15001, 1342) (15001,) (916, 1342) (916,)


2.31 Shape:  (15001, 2392) (15001,) (916, 2392) (916,)
2.86 Shape:  (15001, 4027) (15001,) (916, 4027) (916,)

2.16 Shape:  (15001, 1960) (15001,) (916, 1960) (916,)


2.76 Shape:  (15001, 3785) (15001,) (916, 3785) (

  'precision', 'predicted', average, warn_for)


1.06 0.6877729257641921 0.5605393832230803
1.06 [[630   0   0]
 [142   0   0]
 [144   0   0]]



3.51 5744
CPU times: user 3.23 s, sys: 1.25 s, total: 4.48 s
Wall time: 7.82 s


  'precision', 'predicted', average, warn_for)


1.01 0.6877729257641921 0.5605393832230803
1.01 [[630   0   0]
 [142   0   0]
 [144   0   0]]



3.56 5960

3.51 Shape:  (15001, 5744) (15001,) (916, 5744) (916,)

3.56 Shape:  (15001, 5960) (15001,) (916, 5960) (916,)
CPU times: user 8.45 s, sys: 1.42 s, total: 9.87 s
Wall time: 15.1 s


  'precision', 'predicted', average, warn_for)


1.11 0.537117903930131 0.5231388628485344
1.11 [[437 193   0]
 [ 87  55   0]
 [ 85  59   0]]



3.61 6400

3.61 Shape:  (15001, 6400) (15001,) (916, 6400) (916,)
CPU times: user 15.5 s, sys: 2.36 s, total: 17.9 s
Wall time: 26.4 s


  'precision', 'predicted', average, warn_for)


1.16 0.517467248908297 0.5109484534486924
1.16 [[410 220   0]
 [ 78  64   0]
 [ 81  63   0]]



3.66 6400
CPU times: user 20.8 s, sys: 1.94 s, total: 22.8 s
Wall time: 30 s

3.66 Shape:  (15001, 6400) (15001,) (916, 6400) (916,)


  'precision', 'predicted', average, warn_for)


1.21 0.47707423580786024 0.4824395505228182
1.21 [[368 262   0]
 [ 73  69   0]
 [ 71  73   0]]



3.71 6442

3.71 Shape:  (15001, 6442) (15001,) (916, 6442) (916,)
CPU times: user 29.7 s, sys: 1.46 s, total: 31.2 s
Wall time: 44.5 s


  'precision', 'predicted', average, warn_for)


1.26 0.527292576419214 0.517277323900262
1.26 [[416 214   0]
 [ 75  67   0]
 [ 86  58   0]]



3.76 6442

3.76 Shape:  (15001, 6442) (15001,) (916, 6442) (916,)
CPU times: user 1min 14s, sys: 1.5 s, total: 1min 16s
Wall time: 1min 51s


  'precision', 'predicted', average, warn_for)


1.31 0.49890829694323147 0.5001572756996882
1.31 [[377 253   0]
 [ 62  80   0]
 [ 71  73   0]]



3.81 6442

3.81 Shape:  (15001, 6442) (15001,) (916, 6442) (916,)
CPU times: user 1min 45s, sys: 1.68 s, total: 1min 46s
Wall time: 2min 32s
1.36 0.509825327510917 0.515767931750866
1.36 [[381 246   3]
 [ 58  83   1]
 [ 64  77   3]]



3.86 6442

3.86 Shape:  (15001, 6442) (15001,) (916, 6442) (916,)
CPU times: user 2min 20s, sys: 1.95 s, total: 2min 22s
Wall time: 3min
1.41 0.5131004366812227 0.5218682512011191
1.41 [[381 244   5]
 [ 57  84   1]
 [ 61  78   5]]



3.91 6596

3.91 Shape:  (15001, 6596) (15001,) (916, 6596) (916,)
CPU times: user 3min 8s, sys: 1.71 s, total: 3min 10s
Wall time: 3min 59s
CPU times: user 3min 15s, sys: 2.44 s, total: 3min 18s
Wall time: 4min 4s
1.56 0.5393013100436681 0.5740776554857956
1.56 [[368 239  23]
 [ 49  90   3]
 [ 35  73  36]]



3.96 6596

3.96 Shape:  (15001, 6596) (15001,) (916, 6596) (916,)
1.71 0.5436681222707423 0.5787609741077961
1.71 [[368 2


5.36 Shape:  (15001, 9796) (15001,) (916, 9796) (916,)
CPU times: user 3min 57s, sys: 1.73 s, total: 3min 59s

Wall time: 5min 56s
5.46 Shape:  (15001, 9796) (15001,) (916, 9796) (916,)
2.56 0.5600436681222707 0.5973684982660223
2.56 [[366 212  52]
 [ 43  83  16]
 [ 28  52  64]]



5.56 9796

5.51 Shape:  (15001, 9796) (15001,) (916, 9796) (916,)
2.26 0.5698689956331878 0.6044473288264998
2.26 [[373 199  58]
 [ 41  85  16]
 [ 30  50  64]]




5.56 Shape:  (15001, 9796) (15001,) (916, 9796) (916,)
CPU times: user 4min 7s, sys: 2.1 s, total: 4min 9s
Wall time: 6min 1s
CPU times: user 4min 9s, sys: 1.94 s, total: 4min 11s
Wall time: 6min 3s
CPU times: user 4min 5s, sys: 1.82 s, total: 4min 6s
Wall time: 6min 1s
CPU times: user 4min 13s, sys: 2.01 s, total: 4min 15s
Wall time: 6min 2s
2.46 0.5665938864628821 0.6023101215632166
2.46 [[371 202  57]
 [ 43  85  14]
 [ 27  54  63]]



CPU times: user 4min 8s, sys: 2.53 s, total: 4min 10s
Wall time: 5min 31s
CPU times: user 4min 9s, sys: 1.89 s

5.11 [[370 206  54]
 [ 43  81  18]
 [ 23  56  65]]



4.91 0.5633187772925764 0.601949759621957
4.91 [[370 206  54]
 [ 43  81  18]
 [ 23  56  65]]



4.96 0.5633187772925764 0.601949759621957
4.96 [[370 206  54]
 [ 43  81  18]
 [ 23  56  65]]



5.16 0.5633187772925764 0.601949759621957
5.16 [[370 206  54]
 [ 43  81  18]
 [ 23  56  65]]



5.06 0.5633187772925764 0.601949759621957
5.06 [[370 206  54]
 [ 43  81  18]
 [ 23  56  65]]



CPU times: user 4min 13s, sys: 2.88 s, total: 4min 16s
Wall time: 4min 38s
5.41 0.5644104803493449 0.6028649904389555
5.41 [[371 207  52]
 [ 43  81  18]
 [ 24  55  65]]



Done


from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score


# #NAG
# for row_number in range(len(train_pd)):
#     row = train_pd.iloc[row_number]
#     for word in set(tweeter.tokenize(row.Data)):
#         if word not in word_set:
#             word_set.add(word)
#             word_count_dict[word] = [0, 0, 0]
#         word_count_dict[word][row.Label_num-1] += 1

from functools import reduce
def entropy(x):
    if x == 0:
        return 0
    return x*np.log(x)

# word_entropy_dict = {}
# a = []
# b = []    


word_count_dict = {}
word_set = set()
word_entropy_dict = {}

def populate_word_count(vect, data, classy):
    global word_count_dict, word_set, word_entropy_dict
    vocab = vect.vocabulary_
#     ordered_dict = {}
    feature_count_list = vect.transform(data).toarray().sum(axis=0)
    for char in vocab:
        if char not in word_set:
            word_set.add(char)
            word_count_dict[char] = [0, 0, 0]
        word_count_dict[char][classy-1] += feature_count_list[vocab[char]]
#         vocab[char] = feature_count_list[vocab[char]]
#     for i in range(feature_count_list.shape[0]):
#         maxy = max(vocab.values())
#         character_for_maxy = reverse_search_dictionary(vocab, maxy)
#         vocab[character_for_maxy] = -1
#         ordered_dict[character_for_maxy] = i
# #         max_index = np.argmax(feature_count_list)
# #         ordered_dict[reverse_search_dictionary(vocab, max_index)] = max_index
# #         feature_count_list[max_index] = -1
#     return ordered_dict
    

def populate_entropy_dict(count_vect):
    global word_entropy_dict, word_count_dict, word_set
    from copy import deepcopy
    #declare CountVectorizer() 
    class_1_vectorizer = deepcopy(count_vect)
    class_2_vectorizer = deepcopy(count_vect)
    class_3_vectorizer = deepcopy(count_vect)
    
    #fit
    class_1_vectorizer.fit(class_wise_train_data[1].Data)
    class_2_vectorizer.fit(class_wise_train_data[2].Data)
    class_3_vectorizer.fit(class_wise_train_data[3].Data)
    
    #get the features in a ordered manner
    word_set = set()
    word_count_dict = {}
    word_entropy_dict = {}
    populate_word_count(class_1_vectorizer, class_wise_train_data[1].Data, 1)
    populate_word_count(class_2_vectorizer, class_wise_train_data[2].Data, 2)
    populate_word_count(class_3_vectorizer, class_wise_train_data[3].Data, 3)
    
    for word in word_count_dict:
        total_words = np.sum(word_count_dict[word])
        word_entropy_dict[word] = np.abs(np.sum(list(map(lambda x: entropy(x/total_words), np.array(word_count_dict[word])))))


def get_vocabulary_without_stopwords_for_training_data(vect, threshold):
    #get order of features
    populate_entropy_dict(vect)
#     vocabulary = []
#     for word in word_entropy_dict:
#         if word_entropy_dict[word] < threshold and sum(word_count_dict[word]) > min_df:
#             vocabulary.append(word)
    listy = []
    for word in word_entropy_dict:
        if word_entropy_dict[word] < threshold and sum(word_count_dict[word]) > min_df:# and word_entropy_dict[word] > 0:
            listy.append( (word, word_entropy_dict[word]))
    return pd.DataFrame(listy, columns=['words', 'entropy']).sort_values('entropy').head(max_features)['words']
    
    
def remove_irrelevant_samples(X_train, y_train, X_test, y_test):
    thresh = 0
    train_bool_array = (np.sum(X_train, axis=1) > thresh)
    test_bool_array = (np.sum(X_test, axis=1) > thresh)
#     print(train_bool_array)
    return X_train[train_bool_array], y_train[train_bool_array], X_test[test_bool_array], y_test[test_bool_array]

#parameters
ngram_range = (4, 4)
# token_pattern = '[^\w+\s+]' #punctuation
# token_pattern = '[A-Z][A-Z]'
#TfidfVectorizer(max_features=max_features, min_df=3),
min_df = 0
max_features = 20000
tokenizer = TweetTokenizer().tokenize

#lda info
# from lda import LDA
# from scipy.sparse import hstack
# lda_vect = LDA(n_topics=190, n_iter=100)
# count_vect = CountVectorizer()

# count_train = count_vect.fit_transform(train_pd['Data'])
# lda_train = lda_vect.fit_transform(count_train)
# lda_test = lda_vect.transform(count_vect.transform(test_pd['Data']))


def something(stop_word_threshold):
    vocabulary_for_train_data = get_vocabulary_without_stopwords_for_training_data(

                            CountVectorizer(tokenizer=tokenizer,
                                            ), 
                            threshold=stop_word_threshold)


    tfidf_vect = TfidfVectorizer(vocabulary=vocabulary_for_train_data)
    X_train = tfidf_vect.fit_transform(train_pd['Data'])
    y_train = train_pd['Label_num']

    X_test = tfidf_vect.transform(test_pd['Data'])
    y_test = test_pd['Label_num']

    svm = SVC(kernel='linear')
    svm.fit(X_train, train_pd['Label_num'])

    y_pred = svm.predict(X_test)
    print(stop_word_threshold,len(word_set), "-->", X_train.shape, ":", accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='weighted'))
    print(stop_word_threshold, confusion_matrix(y_test, y_pred))
    print('\n\n')
    
from multiprocessing import Pool
process_pool = Pool(processes=40)

# for j in range(1000, 1200, 5):
#     process_pool.apply_async(something, args=(j*0.001,))
something(8.5)

process_pool.close()
process_pool.join()
print("Done")