In [4]:
import pandas as pd
import sklearn

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
df = pd.read_csv("lemmatized_hp_sample.csv")

In [39]:
df.columns.tolist()

['Unnamed: 0', 'target', 'country', 'wordCount', 'content_lem']

In [40]:
print(f'df dimensions: {df.shape}')
print(f'Distinct countries: {df.country.unique().shape[0]}')
print(f'average word count per article: {df.wordCount.mean():.1f}')
print(f'min word count: {df.wordCount.min()}, max word count: {df.wordCount.max()}')

df dimensions: (209900, 5)
Distinct countries: 10
average word count per article: 902.3
min word count: -32092, max word count: 30324


In [41]:
df[df.content_lem.isnull()]

Unnamed: 0.1,Unnamed: 0,target,country,wordCount,content_lem
7556,7556,1,Australia,1,
12403,12403,1,Australia,1,
14907,14907,1,Australia,39,
22873,22873,1,Australia,1,
29163,29163,1,Australia,1,
34684,34684,1,Australia,1,
35348,35348,1,Australia,1,
41099,41099,1,Australia,1,
41710,41710,1,Australia,3,
42968,42968,1,Australia,14,


In [42]:
df = df.dropna()

In [43]:
df.isna().sum()

Unnamed: 0     0
target         0
country        0
wordCount      0
content_lem    0
dtype: int64

In [44]:
countv = CountVectorizer()
cv_text = countv.fit_transform(df.content_lem)
vocab = {v: k for k, v in countv.vocabulary_.items()}

In [64]:
vocab[1000]

'accountex'

In [45]:
#Sample encodings
print(list(countv.vocabulary_.keys())[:10])
print(list(countv.vocabulary_.values())[:10])

print(f'Observations: {cv_text.shape[0]}')
print(f'Vocab size: {cv_text.shape[1]}')

['monthly', 'sector', 'report', 'emerge', 'company', 'index', 'strengthen', 'above', 'move', 'average']
[112611, 158363, 148205, 56107, 36726, 87164, 170843, 545, 113482, 13462]
Observations: 209864
Vocab size: 204227


In [47]:
import numpy as np

In [48]:
#XXX Sanity check: word counts according to count vectorizer do not seem to agree with the word counts provided. 
#This is expected to some degree, and in most cases it looks small, but it does have some extreme values
cv_counts = np.array(cv_text.sum(axis = 1)).reshape(-1) #sum counts of all words in each article
wc_sanity = pd.concat([labels['wordCount'],pd.Series(cv_counts, name = 'cv_wordCount')], axis = 1) #pair with wordCount given in csv
wc_sanity['diff'] = wc_sanity['wordCount'] - wc_sanity['cv_wordCount']
print(f"Average difference between count vector sum and wordCount: {wc_sanity['diff'].mean():.2f}")
print(f"Max difference between count vector sum and wordCount: {wc_sanity['diff'].max()}")
print(f"Min difference between count vector sum and wordCount: {wc_sanity['diff'].min()}")


wc_sanitycheck = lambda k: print(f"The count vector sum is is within {int(k)} of the wordCount for {wc_sanity[np.abs(wc_sanity['diff']) < k].shape[0]/wc_sanity.shape[0]*100:.2f}% of the articles")

wc_sanitycheck(1e2)
wc_sanitycheck(1e3)
wc_sanitycheck(50)

Average difference between count vector sum and wordCount: 297.44
Max difference between count vector sum and wordCount: 29983.0
Min difference between count vector sum and wordCount: -32405.0
The count vector sum is is within 100 of the wordCount for 15.82% of the articles
The count vector sum is is within 1000 of the wordCount for 75.63% of the articles
The count vector sum is is within 50 of the wordCount for 8.31% of the articles


In [65]:
top_words = np.array(cv_text.sum(axis=0).argsort()).reshape(-1)

In [81]:
# pull indices with 300 largest numbers
# pass indices to reverse dictionary to see what words go with the indices

In [66]:
top_words[:300]

array([204226, 159093,  77042,  77037,  77036,  77035, 159099, 159100,
       159105,  77028, 159090,  77027, 159110,  77022, 159111,  77018,
        77017,  77016, 159113, 159114,  77011, 159109,  77048,  77050,
        77052,  77087,  77085, 159068,  77079,  77078,  77075,  77074,
        77072, 159073, 159076,  77067,  77066,  77065,  77064,  77063,
        77061,  77060, 159078, 159086,  77055,  77053,  77010,  77009,
        77008,  77007,  76966,  76965,  76964,  76963,  76962,  76960,
       159141, 159142, 159144, 159146,  76952,  76951,  76950,  76948,
        76945,  76944, 159149, 159153,  76938,  76937,  76936,  76969,
       159066,  76971,  76973,  77006,  77005,  77004,  77003,  77002,
        77001,  76997,  76996, 159118,  76994, 159119,  76989,  76987,
        76984,  76983, 159128, 159131,  76978,  76977,  76976, 159132,
        76972,  76934,  77089,  77091,  77187,  77184,  77183,  77180,
        77177, 159026, 159027, 159028,  77172, 159021,  77171,  77168,
      

In [75]:
word_counts_array = np.array(cv_text.sum(axis=0)).reshape(-1)

In [80]:
[(vocab[k], word_counts_array[k]) for k in list(top_words[-300:])][::-1]

[('the', 7593586),
 ('of', 5431564),
 ('be', 4318159),
 ('in', 3823069),
 ('to', 3790491),
 ('and', 3418395),
 ('year', 1731798),
 ('on', 1489292),
 ('for', 1440827),
 ('have', 1354287),
 ('price', 1194266),
 ('from', 1058661),
 ('with', 1002181),
 ('at', 938169),
 ('by', 821423),
 ('share', 805168),
 ('as', 746400),
 ('stock', 722502),
 ('section', 707690),
 ('total', 700305),
 ('month', 641325),
 ('up', 635800),
 ('average', 633748),
 ('it', 630854),
 ('asset', 617496),
 ('that', 614845),
 ('ago', 570747),
 ('this', 546433),
 ('day', 524803),
 ('market', 521265),
 ('past', 484694),
 ('change', 482795),
 ('return', 452268),
 ('volume', 451561),
 ('its', 437242),
 ('time', 433828),
 ('week', 426604),
 ('an', 423976),
 ('or', 413035),
 ('down', 412922),
 ('high', 401367),
 ('say', 401212),
 ('last', 379163),
 ('index', 374668),
 ('will', 364995),
 ('value', 363928),
 ('he', 360862),
 ('we', 358670),
 ('low', 349975),
 ('director', 349318),
 ('all', 327066),
 ('than', 323038),
 ('sector'