In [111]:
import pandas as pd
from utils import read_file, clean_twitter, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import spacy
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import pickle

In [112]:
# Notebook-wide pandas display settings: never truncate cell text, and show
# up to 100 rows before pandas abbreviates a frame.
_display_options = {
    'display.max_colwidth': None,
    'display.max_rows': 100,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [115]:
# Load the pickled 3-tuple (docs, labels, vocab) from the working directory.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only open pseudodocs.pkl if it comes from a trusted source.
with open('pseudodocs.pkl', 'rb') as pkl_file:
    _payload = pickle.load(pkl_file)
docs, labels, vocab = _payload

In [116]:
# Rebuild each document's text from its sequence of indices into `vocab`.
# A value of 0 ends the document (presumably padding -- confirm against
# whatever produced the sequences); every index before the first 0 is looked
# up in `vocab` and the words are joined with single spaces.
eng_docs = []
for word_indices in docs:
    text = []
    for word_index in word_indices:
        # Iterating instead of indexing until a 0 turns up also handles a
        # sequence that contains no 0 at all; the previous while-loop ran off
        # the end of such a sequence with an IndexError.
        if word_index == 0:
            break
        text.append(vocab[word_index])
    eng_docs.append(" ".join(text))

In [117]:
# Reduce `labels` to one class id per row by taking the argmax along axis 1
# (presumably rows are documents and columns are classes -- confirm).
# Guarded on dimensionality so the cell can be re-run safely: once `labels`
# is already 1-D, a second argmax(axis=1) would raise an axis error.
if np.ndim(labels) > 1:
    labels = labels.argmax(axis=1)

In [118]:
# One row per document: the reconstructed text and its class id.
gendf = pd.DataFrame({'text': eng_docs, 'label': labels})

In [119]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open('stopwords.txt', 'r') as f:
    lines = list(f)
stopwords = list(map(str.strip, lines))

In [120]:
# Unigram bag-of-words vectorizer over raw strings: accents are folded to
# ASCII and the words loaded from stopwords.txt are excluded.
_vectorizer_kwargs = {
    'input': 'content',
    'analyzer': 'word',
    'strip_accents': 'ascii',
    'ngram_range': (1, 1),
    'stop_words': stopwords,
}
count_vectorizer = CountVectorizer(**_vectorizer_kwargs)

In [121]:
# Learn the vocabulary and build the sparse document-term matrix of raw counts.
freq = count_vectorizer.fit_transform(gendf['text'])

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back only on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    features = np.array(count_vectorizer.get_feature_names_out(), dtype=str)
else:
    features = np.array(count_vectorizer.get_feature_names())

# `freq` keeps the raw term counts; `count` is a 0/1 copy that only records
# whether a term occurs in a document at all.
count = freq.copy()
count[count > 0] = 1

In [154]:
# Per-term statistics restricted to the documents of one class.
label = 0

# Positional row numbers of the class's documents. The sparse matrices are
# indexed by position, so derive positions from the boolean mask rather than
# from gendf's index labels (the two only coincide for a default RangeIndex).
class_rows = np.flatnonzero((gendf['label'] == label).to_numpy())
class_docs = count[class_rows]
n_class_docs = class_docs.shape[0]

# Fraction of the class's documents that contain each term.
rel_doc_freq = np.asarray(class_docs.sum(axis=0) / n_class_docs).ravel()
# Mean number of occurrences of each term per document of the class.
avg_freq = np.asarray(freq[class_rows].sum(axis=0) / n_class_docs).ravel()

In [155]:
# Number of documents (across all classes) that contain each term; `count`
# holds 0/1 entries, so a column sum is a document frequency.
_doc_freq = np.asarray(count.sum(axis=0)).ravel()

# One row per vocabulary term with its class-level statistics and the
# corpus-level log inverse document frequency, log(N / df).
rankingdf = pd.DataFrame({
    'word': features,
    'rel_doc_freq': rel_doc_freq,
    'avg_freq': avg_freq,
    'idf': np.log(count.shape[0] / _doc_freq),
})

In [156]:
# Rescale the three statistics with MinMaxScaler so they are comparable, then
# combine them as the cube root of their product.
_scaled_cols = ['rel_doc_freq', 'idf', 'avg_freq']
scaler = MinMaxScaler()
rankingdf[_scaled_cols] = scaler.fit_transform(rankingdf[_scaled_cols])

_score_product = rankingdf['rel_doc_freq'] * rankingdf['idf'] * rankingdf['avg_freq']
rankingdf['comb'] = np.cbrt(_score_product)

In [157]:
# Show the 50 rows with the highest combined score, best first.
rankingdf.sort_values('comb', ascending=False).iloc[:50]

Unnamed: 0,word,rel_doc_freq,avg_freq,idf,comb
5060,vibe,1.0,1.0,0.257086,0.635857
1171,counts,0.777778,0.78,0.300635,0.567105
1707,excitement,0.688889,0.72,0.329818,0.546914
4092,sarcasm,0.666667,0.7,0.336218,0.539356
1053,communicate,0.622222,0.6,0.349682,0.507291
1610,embrace,0.533333,0.6,0.379767,0.495323
2203,hashtag,0.555556,0.5,0.364145,0.465934
3199,niall,0.511111,0.48,0.388073,0.456622
2301,homework,0.488889,0.5,0.388073,0.45607
1529,drown,0.466667,0.52,0.379767,0.451692


In [None]:
# Hard-coded integer sequences keyed by 0-3 (presumably vocabulary indices of
# one sample document per class label -- confirm against the generator).
# Each key is kept on a single physical line: a previous hard wrap split two
# integers in the middle (47738 under key 1 and 63082 under key 3), which made
# the literal a syntax error.
doc = {0:[75090,13237,55418,189,75423,67638,49895,19745,10135,78408,3576,44459,56369,67677,3597,34072,79736,60433,68724,58987,77517,23840,16766,82829,35506,58248,13555,47210,12596,13284,7696,77068,55782,42362,60657,4888,51719,82120,80544,68433,10832,48758,18714,31147,38874,32697,81695,30591,6653,57215,47192,21751,15820,58936,47664,58515,63034,23311,33041,12917,4787,80142,20602,68801,9722,55025,73528,6159,48527,75503,41458,23051,51231,36599,36516,76701,75065,11918,44142,50752,42354,72965,34207,15816,29852,64406,45658,44277,32615,12587,68341,49408,69515,43752,25564,34988,15615,1215,29909,59854,77224,19433,67855,35518,20880,26230,7597,62793,36403,37524,18034,22048,78974,59904,41366,57864,76792,63797,64793,51832,82372,74736,7005,36880,18885,20674,60313,23981,14122,67676,62835,32780,71113,70739,31176,17200,41985,31679,67055,40653,61205,41683,13177,70220,15794,15993,60435,76019,26423,77204,44927,17100,54029,24147,75916,33790,32195,75958,64989,52245,45339,63732,68527,23802,34567,74759,28913,67128,27035,8275,50192,29842,24381,4987,12111,67622,4921,79651,69700,15463,139,46144,63601,69912,2522,47947,44897,25784,76803,55967,46168,29346,66477,22076,63113,49663,49693,21523,61157,69076,78294,14196,34935,10344,31934,35803,79479,70688,37732,8914,31664,57449,80497,42571,5619,12757,65963,75233,18295,47921,3363,66839,47685,52257,43874,79113,33608,69185,423,21731,28161,52170,46036,22982,33366,15816,77166,79974,826,63448,74223,74500,36054,81430,17963,40067,10793,66655,1636,28542,41580,53461,24251,8584,47279,59771,16616,5280,45663,82124,82393,37497,51157,20905,82038,3383,17514,1537,35742,55040,48937,45238,10786,51913,22469,59459,42570,8855,11236,32263,18042,6680,67129,24493,61805,80675,44373,61044,15736,60352,59176,59629,61406,33562,73424,45204,78733,62609,61707,49297,80631,7201,13836,76588,73285,17361,81105,29221,9930,1084,74701,33354,75320,41070,1383,64054,54883,69462,72665,371,41488,55781,4411,57495,73258,44086,41933,43723,57898,18877,8281,13083,65519,73013,25629,77650,11907,276,41497,8581,83246,137,31826,65254,61046,61478,31937,46109,3842,54373,32483,40516,18235,37005,64884,83492,14135,21532,60482,82888,27607,80838,66682,17885,35791,37142,11416,18963,74039,71378,60345,68959,40709,82508,53258,29368,54732,18040,9940,57584,67479,14185,28383,71696,21302,76503,81935,29736,58955,63831,2103,47677,720,50130,37012,54958,54997,46771,16086,9256,62609,73313,55996,17779,46103,7537,52440,620,60881,78943,5005,79792,50854,16439,42699,27241,56800,13607,80566,20680,78620,13505,43126,30767,68467,7354,44574,23803,57091,80663,20606,25530,3162,41957,70858,52142,49204,35575,8131,23032,75956,18079,37097,58598,20457,27175,77776,29915,61570,36000,9679,11373,49190,69773,70802,54552,7272,48258,49727,54709,33716,83492,60792,69582,65145,44669,62743,70095,45339,7098,66483,10458,57971,40445,45333,61585,16742,32512,19535,29520,23978,66400,54526,20229,21523,47263,69502,75425,66919,52543,57723,62294,13951,77900,64783,32614,20451,71478,36170,31203],
1:[11616,11188,48955,52959,53000,40212,51821,74634,61852,78700,1640,9976,50635,35777,71198,32630,66752,2197,61479,4428,43573,78566,32459,42228,641,17107,78871,70728,5557,23074,24895,66397,64309,1201,52567,45937,55194,11552,71126,63623,16329,35111,30846,75173,62769,52015,83072,26621,27587,13133,35743,4700,44397,74080,11745,82946,10453,77611,37482,62225,44002,33419,68400,20782,11967,39989,24775,26265,9259,80984,63796,66874,47737,13291,51170,71955,13366,71320,35696,72542,12254,10020,15467,67183,64706,22447,38079,32431,75285,55470,74887,33649,38796,26773,20277,74046,41898,65156,82333,33879,46265,44532,50973,12057,30146,24987,4047,38438,37991,65825,56934,29054,20603,56824,63172,58307,4569,57189,75731,68378,32190,41285,51533,39102,11183,14036,82612,4280,68986,69223,2478,69101,9147,21526,37349,21659,50419,66981,80933,29742,79153,58397,79196,26237,23569,74469,55292,33952,26818,57663,51324,20125,73511,3590,26821,57241,14806,9691,72169,21763,23135,51494,58883,82830,41571,67718,5963,25682,6520,47703,77619,52010,17108,69919,5167,69647,10812,30652,24033,42575,21415,69521,78776,59876,55483,75147,56541,3719,59668,78642,53245,76678,72111,16886,46672,19154,47645,55938,37540,55315,81790,35743,31038,45664,62094,47891,42090,36182,65784,59044,45723,68503,10174,703,3495,39768,10443,40589,26710,38842,38202,17993,31700,34995,60800,46831,46799,23037,44737,19749,30625,38967,77265,46317,57706,15429,31532,88,77472,71712,27867,83265,59387,46222,30181,54157,66712,47510,40672,19684,44229,57102,45309,39740,64796,29488,2351,77577,72502,19691,79212,28051,68627,57706,48934,60528,17485,72922,62576,64268,51051,82574,28320,31928,83235,3285,31951,52960,33121,34886,2289,80984,61736,3744,49027,23627,73577,49840,49407,51725,66694,22998,53189,29979,82656,14983,36796,15429,6935,77828,48343,3506,54760,56202,54017,33919,34563,16254,81357,5180,79782,58535,44002,41728,8739,79105,12924,27893,4926,72922,50271,29291,61928,13659,13369,70847,27141,10719,2243,50427,36721,64928,77393,4710,69174,26802,5675,1148,40078,47738,65117,33797,14179,82528,57010,42193,47148,10132,52074,48453,46384,55030,54542,13526,5062,43089,54704,55998,77827,65610,28596,30775,6538,2190,36483,33909,13593,22492,63518,15356,63268,13617,64599,58345,45641,4182,1974,56489,34924,42560,55445,15983,52576,19635,54346,13082,64521,38294,70475,62470,2351,57179,69084,71031,8145,55977,33510,26199,18349,46515,34406,63712,5752,12410,43945,15561,45415,48343,54311,41850,23586,21954,41513,65654,16734,65233,58649,34071,6231,78457,54404,8276,13649,41344,82946,40068,62725,64446,65942,63816,435,71943,15998,4185,72361,25559,30240,47249,64241,33697,34794,29501,5125,8488,70080,17149,72448,23652,82760,80519,70131,1726,37154,67738,15119,39143,2995,60319,32932,42946,20283,54550,35907,21299,13659,48317,68039,60096,9747,62165,79559,67474,61414,43065,24021,77969,61939,69679,45980,80535,8298,25491,74622,81112,60550,81419,20173,20864,21391,9644,82632,7552,48677,2904,30319,31130,48584,54406,32294,69928],
2:[62344,72452,67908,46490,21930,13012,79511,7336,29775,47052,54171,70099,38055,17289,20974,65114,65368,21826,2615,73280,73427,56054,21717,19867,79062,17236,48757,65324,58429,7891,83185,50695,67774,27758,49142,72496,36083,40203,18925,4813,18155,73489,51791,19733,81531,52182,42230,5104,24454,6610,36833,35646,81074,55771,42668,82899,78452,48746,43623,25801,44501,36689,71286,37803,33216,28979,76671,43196,6012,52492,21335,23337,5662,67110,2387,12695,68096,66349,8165,60336,40841,42632,53918,16402,10295,74437,10409,5163,32124,11042,29197,23409,25623,79093,63853,54437,40071,14510,77005,45338,21484,42508,64615,50182,1746,65995,14919,63837,23474,42441,23765,59856,20353,56856,51798,72990,4595,15648,63124,56830,2401,59185,11707,65101,19357,79437,48775,55919,1675,37693,72689,30152,78740,77672,80026,82257,51349,75622,48265,52095,56335,31259,33239,30367,33932,6565,63401,48768,77038,30600,53518,25972,37978,1682,42569,48614,76074,36131,9243,52241,31344,61261,53889,14102,22417,35192,44062,9274,25803,32325,33448,23276,66411,68192,9953,58972,72238,49626,67980,4597,32660,50040,62463,20185,31126,23019,4904,78841,23636,82678,8499,73809,43248,5873,2412,58549,52432,79978,24087,7586,81210,59210,9563,17238,64743,1436,57558,9193,70192,22450,54860,72259,16735,9194,20094,9617,5638,73675,54661,8900,62040,77466,54010,81994,70790,51303,14285,30379,21485,18053,39542,45933,54530,11138,7773,58604,3426,75411,61352,45053,34447,14841,39098,8092,83372,44871,78278,57753,53964,70393,30384,49335,9435,81909,50931,76052,38804,48126,25600,73505,5073,76140,78311,16619,37425,64442,17598,63947,44214,68568,44991,35502,59748,79973,36550,58866,39072,51068,41576,41545,5354,33875,49905,29817,73704,29794,67110,2443,38804,57553,42619,36690,78481,66481,71644,10805,21312,10314,49261,69702,9800,12028,27648,6188,36791,12833,59170,5119,8360,7386,48614,4589,54412,19957,67069,42029,58201,58242,5671,53529,11539,31103,33672,39511,22814,23474,37062,23579,36874,11281,60558,67499,3465,76222,75592,56805,38081,76636,49643,64129,38869,43179,78804,22780,2401,26681,14183,57796,77154,11463,15987,67772,37637,35546,7001,36688,15726,42801,69385,52642,20233,23814,10248,12219,72939,71164,52847,83073,32045,39686,7680,73657,13841,40522,29323,81991,5696,45231,80442,61389,4639,39569,30862,54210,38785,24736,39643,60340,56774,40117,55699,9812,3691,80824,38251,31701,76695,47880,34553,19651,46178,20195,54835,70109,7286,15008,8143,82457,27506,43560,7301,54813,8199,50295,77340,56924,670,13008,17019,59463,14752,27734,53856,30416,60786,45437,74864,25032,4474,17047,59936,77582,20316,5757,65090,5643,61603,6405,27622,13708,75733,51800,51347,6969,79384,12681,688,48010,28928,69120,29028,4019,62662,16317,66089,29195,20395,53666,22008,63569,34597,52787,46284,29431,42238,61409,66818,61295,29280,73394,2806,64387,19891,57452,75717,74850,5175,1944,13619,72698,35338,73712,309,67785,40169,50649,35322,7219,24639,6065,62804,7884,40954,57952,4659,56408,25836,49199,18001,59956],
3:[23420,31397,26065,66204,11960,22420,62207,36851,11517,61405,20898,23420,72592,15995,23791,2889,56201,74150,46883,48466,5668,83411,56181,82055,63879,60391,82452,15756,29628,35583,10624,34582,75741,45996,29862,77215,2969,82178,3398,50155,61800,68003,44276,20217,23521,36298,28290,46845,36735,37836,75740,28138,32137,13025,2850,70801,29591,497,13759,20010,61633,3398,25871,62888,16828,5470,8991,71111,25349,54352,47366,9113,31397,51553,9466,60278,1077,12810,38591,23429,17828,40780,45299,59438,32311,39957,33036,33199,57758,14024,66243,55642,3752,77170,67442,25189,56177,16882,55041,9593,8208,41255,30144,37469,49736,2711,16595,43701,52547,44704,63496,50657,66097,15479,61482,76153,65681,2804,8208,8751,74019,36068,59372,31687,79322,72592,73566,21785,57903,68591,19102,32580,9248,17848,53338,50004,73997,62477,55999,3937,32535,68798,1053,66870,33618,61239,14441,37603,81130,14186,81172,44285,73326,47678,20217,26487,63512,38519,71196,20522,51853,36748,40654,53561,70512,55642,16237,74880,60391,17527,7914,79584,6875,60278,53338,47906,62197,45899,28778,62904,11397,4869,77953,1848,75345,77940,49330,40404,49055,54717,28860,46149,82128,31310,51560,29180,15123,18273,81499,71572,13315,66522,4115,51484,80854,6995,15479,23907,8852,59201,33496,6883,60656,47678,44663,57575,74423,48806,80089,80089,14633,39939,33946,41159,54972,37603,2371,83407,8365,68239,67655,77953,37453,53257,45880,75740,75374,27235,40850,15717,45990,35876,42453,50137,25252,39417,53393,7955,47736,65064,51250,70483,46883,79905,46180,76816,57722,77718,33036,45193,8877,5322,3705,50501,40404,47808,50137,29217,51484,37546,59689,62207,23310,28804,55540,30548,18767,27140,47366,52575,562,80024,25959,74975,1321,21320,11614,27510,11372,51393,55997,3956,40936,50875,51907,64093,27262,2958,17527,49704,4115,17848,82521,2488,32590,72618,23429,25959,57181,29437,58802,30297,46256,76158,27597,51958,47585,46120,59161,21798,63321,53542,36009,29628,17674,28946,34726,23907,44159,7913,65694,80558,32209,7914,82039,7805,59597,82128,79776,59558,63082,76271,8852,34726,81738,2464,71507,63577,13872,70678,61544,62760,83319,51853,40261,1601,35157,23448,23137,63737,4018,65898,59204,57810,7327,24590,57437,57435,50884,47021,28534,52036,25210,12923,25306,70218,52226,40293,26526,74280,73995,23008,75386,56620,35465,24004,7071,5668,25804,3297,16215,14645,2380,72093,19179,66364,5123,10054,56475,52226,5102,22403,66522,55075,63392,6803,26847,36009,41780,47152,47021,74627,62143,30144,18174,75003,72569,9249,497,46725,53920,69268,79375,68060,78904,81372,81966,21952,69616,54728,60212,13721,36068,80199,17980,58356,31422,76153,37274,35489,78591,57639,33512,62014,65305,19102,67287,41972,77070,6022,4178,8365,36450,47475,36071,55041,42453,7771,8854,45464,71507,44925,24951,74077,61713,30548,62742,3469,67496,66132,33199,7443,3062,83433,28128,71752,19294,74813,43650,54272,55804,18695,7539,44672,51992,52334,3063,46725,71977,55659,35164,79664,58802,33454,22344,13759,72731,17489,10054,18563]}


In [None]:
# Keep only the first 50 entries of every list stored in `doc`.
# The loop iterates over a snapshot of the keys and binds no second loop
# variable: the earlier form named it `docs`, which silently replaced the
# `docs` object loaded from pseudodocs.pkl with one of these lists and broke
# any later re-run of the cells that read `docs`.
for k in list(doc):
    doc[k] = doc[k][:50]

In [None]:
# Print the list currently stored under key 3 of `doc`.
print(doc[3])