In [79]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import save_npz, load_npz
import json
import scipy as sp

In [25]:
cv_text = load_npz('cv_text_ln_sample.npz') #load sparse
with open('vocab_ln_sample.txt', 'r') as f:
    vocab = json.loads(f.read()) #load vocab dict
labels = pd.read_csv('country_labels_ln_sample.csv', index_col = 0).reset_index(drop=True) #load labels
    
print(f"Sparse matrix file size: {cv_text.data.nbytes /1e6:.2f} mb")
print(f"Sparse matrix dimensions: {cv_text.shape}")

Sparse matrix file size: 341.03 mb
Sparse matrix dimensions: (217164, 207500)


# Create bootstrapped sample

In [41]:
def create_bs(label, bs_size = 50):
    bs_sample = np.random.choice(labels[labels.country == label].index, size = bs_size)

    for index in bs_sample:
        sample_vectors = list()
        sample_vectors.append(cv_text[index,:])
    return sum(sample_vectors)

In [129]:
countries = labels.country.unique()
bs_data = dict()
for country in countries:
    bs_data[country] = [create_bs(country) for k in range(1000)] #create 1000 bs samples for each country

#create bs cv dataframe and labels
bs_cv = sp.sparse.vstack([sp.sparse.vstack(bs_data[country]) for country in countries])
bs_labels = np.array([x for nested in [1000 * [country] for country in countries] for x in nested])

# Test as country classifier

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [200]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(bs_cv,bs_labels,
                                                test_size = .3, stratify = bs_labels)

rfc = RandomForestClassifier()
rfc.fit(Xtrain, Ytrain.reshape(-1))

correct = (rfc.predict(Xtest) == Ytest).sum()

# prob = lambda k,total: sp.special.comb(total,k)*((.1)**(k))*((.9)**(total-k))

print(f"Number of correct classifications: {correct} out of {Xtest.shape[0]}")
print()
print(f"The probability of getting at least {correct} out of {Xtest.shape[0]} correct, \n assuming binomial:  {1 - sp.stats.binom.cdf(correct,Xtest.shape[0], .1)}")

Number of correct classifications: 2482 out of 3000

The probability of getting at least 2482 out of 3000 correct, 
 assuming binomial:  0.0


In [201]:
[vocab[str(k)] for k in rfc.feature_importances_.argsort()[20::-1]]

['phaseof',
 'phaseobserveintangible',
 'phasein',
 'phasefinde',
 'phasedown',
 'phased',
 'pharynx',
 'pharyngitis',
 'pharyngeal',
 'pharos',
 'pharmerge',
 'pharonic',
 'pharo',
 'pharnext',
 'pharmtech',
 'pharmswellbio',
 'pharmstandard',
 'pharmsintez',
 'pharmore',
 'pharmion',
 '____________________________________________person']

## Repeat with tfidf

In [122]:
from sklearn.feature_extraction.text import TfidfTransformer

In [165]:
tfidf = TfidfTransformer()
bs_tfidf = tfidf.fit_transform(bs_cv)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(bs_tfidf,bs_labels,
                                                test_size = .3, stratify = bs_labels)

rfc = RandomForestClassifier()
rfc.fit(Xtrain, Ytrain.reshape(-1))

correct = (rfc.predict(Xtest) == Ytest).sum()

print(f"Number of correct classifications: {correct} out of {Xtest.shape[0]}")
print()
print(f"The probability of getting at least {correct} out of {Xtest.shape[0]} correct, \n assuming binomial:  {1 - sp.stats.binom.cdf(correct,Xtest.shape[0], .1)}")

Number of correct classifications: 2463 out of 3000

The probability of getting at least 2463 out of 3000 correct, 
 assuming binomial:  0.0


In [197]:
[vocab[str(k)] for k in rfc.feature_importances_.argsort()[20::-1]]

['phillipa',
 'phillie',
 'philli',
 'phill',
 'philjame',
 'philistinism',
 'philistine',
 'philipsen',
 'philippus',
 'philippino',
 'phili',
 'philippinesbecause',
 'philippines',
 'philippine',
 'philipine',
 'philiphave',
 'philine',
 'phililppine',
 'philidor',
 'philic',
 '____________________________________________person']

In [190]:
vocab_swap = {k: v for v, k in vocab.items()}

In [198]:
vocab_swap[vocab['135257']]

'135257'

Still mostly identifying rare words; should filter out when doing CV