In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import save_npz, load_npz
import json

In [2]:
# Read the lemmatized article sample; the first CSV column becomes the index.
data = pd.read_csv('lemmatized_ln.csv', index_col=0)
# Size as reported by sys.getsizeof, scaled by 1e6.
print(f"Total dataframe size: {sys.getsizeof(data) / 1e6:.2f} mb")

# Rows whose `content_lem` value is missing are treated as empty articles.
empty_articles = data.loc[data['content_lem'].isna()]
print(f"Number of empty_articles from sample: {len(empty_articles)}")
print(f"Average reported word length of empty articles: {empty_articles['wordCount'].mean():.2f}")

Total dataframe size: 818.676156
Number of empty_articles from sample: 36
Average reported word length of empty articles: 6.58


Count vectorize and save:

In [3]:
# Keep only the rows that actually have lemmatized text.
x = data[data.content_lem.notnull()]  # strip out empty articles

# Fit the vectorizer on the lemmatized text and persist the resulting sparse
# document-term count matrix in scipy's .npz format.
countv = CountVectorizer()
cv_text = countv.fit_transform(x.content_lem)
save_npz('cv_text_ln_sample.npz', cv_text)  # export as npz

# Invert the fitted vocabulary (term -> column index) into
# column index -> term. The index is cast to a built-in int because json
# raises TypeError on numpy-integer dict keys, which some scikit-learn
# versions may store in `vocabulary_`.
vocab_swap = {int(v): k for k, v in countv.vocabulary_.items()}
with open('vocab_ln_sample.txt', 'w') as f:
    # JSON object keys are always strings, so the integer indices are
    # written as "0", "1", ... in the file.
    json.dump(vocab_swap, f)

# Sample encodings
print(list(countv.vocabulary_.keys())[:10])
print(list(countv.vocabulary_.values())[:10])

# Matrix dimensions: one row per article, one column per vocabulary term.
print(f'Observations: {cv_text.shape[0]}')
print(f'Vocab size: {cv_text.shape[1]}')

In [20]:
# Write the `target` and `country` columns (plus the index) of the non-empty
# articles to CSV.
x.loc[:, ['target', 'country']].to_csv('country_labels_ln_sample.csv')

Test loads:

In [8]:
# Read back each exported artifact to confirm it can be loaded.
cv_text = load_npz('cv_text_ln_sample.npz')  # load sparse
with open('vocab_ln_sample.txt', 'r') as f:
    # JSON object keys are always strings, so the column indices come back
    # as '0', '1', ... rather than ints; use int(key) for integer lookups.
    vocab = json.load(f)  # load vocab dict
labels = pd.read_csv('country_labels_ln_sample.csv', index_col=0)  # load labels

# Verify the three artifacts agree with each other: one label row per matrix
# row and one vocabulary entry per matrix column.
assert cv_text.shape[0] == len(labels), "label rows do not match matrix rows"
assert cv_text.shape[1] == len(vocab), "vocab entries do not match matrix columns"

# `data.nbytes` covers only the stored values array held in memory; it is not
# the size of the .npz file on disk.
print(f"Sparse matrix data size in memory: {cv_text.data.nbytes /1e6:.2f} mb")

Sparse matrix file size: 341.03 mb
