## Importing statistical modeling and word count vectorizing libraries

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_predict, cross_val_score
import matplotlib.pyplot as plt


In [3]:
# import all csvs

kjv = pd.read_csv('./my_datasets/kjv_genre_text.csv')
asv = pd.read_csv('./my_datasets/asv_genre_text.csv')
bbe = pd.read_csv('./my_datasets/bbe_genre_text.csv')
wbt = pd.read_csv('./my_datasets/wbt_genre_text.csv')
web = pd.read_csv('./my_datasets/web_genre_text.csv')
ylt = pd.read_csv('./my_datasets/ylt_genre_text.csv')

In [4]:
# start with kjv

kjv.head()

Unnamed: 0,genre,text
0,1,In the beginning God created the heaven and th...
1,1,"And the earth was without form, and void; and ..."
2,1,"And God said, Let there be light: and there wa..."
3,1,"And God saw the light, that it was good: and G..."
4,1,"And God called the light Day, and the darkness..."


# Baseline class percentage 

In [5]:
# Checking if the classes are balanced  

kjv['genre'].value_counts(normalize=True, ascending=True)

8    0.012989
6    0.032376
1    0.049288
7    0.088995
5    0.152333
3    0.181462
4    0.217921
2    0.264637
Name: genre, dtype: float64

# Set X and y 

In [6]:
# assign X and y 

X = kjv['text']
y = kjv['genre']

print(X.shape)
print(y.shape)

(31103,)
(31103,)


# Train/test split 


In [7]:
# train, test, split 

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=51419,
                                                   stratify=y)

In [8]:
# check the shape distribution of the x train and x test


X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23327,), (7776,), (23327,), (7776,))

# Simple TFIDF and Logistic Regression score 

In [9]:
tf = TfidfVectorizer(stop_words = 'english')


In [10]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ])

In [11]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 



0.6965313787235135

In [12]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [13]:
pipe.score(X_train, y_train)

0.7824409482573842

In [14]:
pipe.score(X_test, y_test) 

0.7136059670781894

# Tfidf parameter tuening 

In [15]:
# instantiate Tfidif

def scale_model_evaluate(X, y, model_name='lr', 
                         tokenizer_name='tfidf', 
                         tokenizer = TfidfVectorizer(),
                       model_type=LogisticRegression(), 
                       parameters={'tfidf__max_df': [.1, .2],
                                   'tfidf__min_df': [.09, 1],
                                   'tfidf__ngram_range': (1.3, 1),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__n_jobs': [-2]
                                  }
                        ):
 
    # Pipeline for feature engineering and instantiating model
    pipe=Pipeline(memory=None,
         steps=[(tokenizer_name,tokenizer),
                (model_name,model_type)])
                         
    # Fit the model using parameters for grid search
    grid = GridSearchCV(pipe, param_grid=parameters, cv=5)
    grid = grid.fit(X, y) 
    
    
    
    # Print best attributes
    print(f"For model: {model_type}")
    print(f"The best parameters are: {grid.best_params_}")
    print(f"The best score is: {grid.best_score_:.2f}")
    return grid


In [16]:
scale_model_evaluate(X_train, y_train, 
                     model_name='lr', 
                     tokenizer_name='tfidf',
                     tokenizer = TfidfVectorizer(),
                     parameters={'tfidf__max_df': [.1, .2],
                                 'tfidf__min_df': [.09, 1],
                                 'tfidf__ngram_range':[(1, 1), (1, 2)],
                                 'lr__penalty': ['l1', 'l2'],
                                 'lr__n_jobs': [-1]
                                } )

  " = {}.".format(effective_n_jobs(self.n_jobs)))


For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
The best parameters are: {'lr__n_jobs': -1, 'lr__penalty': 'l2', 'tfidf__max_df': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
The best score is: 0.70


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__max_df': [0.1, 0.2], 'tfidf__min_df': [0.09, 1], 'tfidf__ngram_range': [(1, 1), (1, 2)], 'lr__penalty': ['l1', 'l2'], 'lr__n_jobs': [-1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
# JH : how do I extract the most important features to graph ? 

In [17]:
# list(sorted(zip(grid.best_estimator_.named_steps['lr'].coef_[0], grid.best_estimator_.named_steps[
#     'cv'].get_feature_names()), reverse=True))[:20] 

# Multinomial NB and Tfidf 

In [None]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ])

In [18]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 



0.6965313787235135

In [19]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [20]:
pipe.score(X_train, y_train)

0.7824409482573842

In [21]:
pipe.score(X_test, y_test) 

0.7136059670781894

In [42]:
kjv_w  = pd.DataFrame(tvec.fit_transform(kjv).toarray(),
                   columns=tvec.get_feature_names())
kjv_w.head()

Unnamed: 0,genre,text
0,1.0,0.0
1,0.0,1.0


## Doc-2-vec 

Source : https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [30]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import stopwords 

In [31]:
kjv.head()

Unnamed: 0,genre,text
0,1,In the beginning God created the heaven and th...
1,1,"And the earth was without form, and void; and ..."
2,1,"And God said, Let there be light: and there wa..."
3,1,"And God saw the light, that it was good: and G..."
4,1,"And God called the light Day, and the darkness..."


In [32]:
train, test = train_test_split(kjv, test_size=0.3, random_state=1519)

In [33]:
text = kjv['text']

In [34]:
genre = kjv['genre']

In [35]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)

In [37]:
train_tagged.values[100]


TaggedDocument(words=['then', 'said', 'absalom', 'if', 'not', 'pray', 'thee', 'let', 'my', 'brother', 'amnon', 'go', 'with', 'us', 'and', 'the', 'king', 'said', 'unto', 'him', 'why', 'should', 'he', 'go', 'with', 'thee'], tags=[2])

In [38]:
# multi processing 

In [39]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [40]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2402293.60it/s]


In [41]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha


100%|██████████| 21772/21772 [00:00<00:00, 2368276.84it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3101425.98it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3068597.29it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3311396.70it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3188602.49it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3304207.65it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3301818.23it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2708297.84it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3244222.92it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3304327.21it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3023887.77it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3284479.61it/s]
100%|██████████| 21772/21772 [00:00<00:00, 1804711.20it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3487298.05it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3314401.38it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3405496.43it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3341324.06it/

CPU times: user 41.4 s, sys: 8.17 s, total: 49.6 s
Wall time: 26.2 s


# Buliding the final vector feature for the classifier

In [42]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [43]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [44]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)



In [45]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.28389240167184654
Testing F1 score: 0.2417116060678538


  'precision', 'predicted', average, warn_for)


# Distributed Memory with Averaging

In [46]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2313966.82it/s]


In [47]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 21772/21772 [00:00<00:00, 2536340.04it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2887172.74it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2515519.44it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3194737.85it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3506581.16it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3321030.90it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3034035.04it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3429283.36it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3109028.55it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3207868.29it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3333031.12it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3466777.52it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3233653.92it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3425938.35it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3062628.26it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3295265.11it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3274938.56it/

CPU times: user 1min 2s, sys: 18.3 s, total: 1min 20s
Wall time: 40.4 s


# Train Logistic Regression 

In [48]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.45472082306290856
Testing F1 score: 0.46786233736427385


In [49]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [56]:
!pip install testfixtures

Collecting testfixtures
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1e/d2b91046bc796c009b670b15b567f8580be924c1904efaf5fbf065799874/testfixtures-6.8.2-py2.py3-none-any.whl (85kB)
[K    100% |████████████████████████████████| 92kB 346kB/s ta 0:00:01
[?25hInstalling collected packages: testfixtures
Successfully installed testfixtures-6.8.2


In [57]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [58]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [59]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [60]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.43960990247561893
Testing F1 score: 0.45962329928338486


In [None]:
# df  = pd.DataFrame(tvec.fit_transform(texts).toarray(),
#                    columns=tvec.get_feature_names())
# df.head()

In [3]:
# Doc-2-vec on each row (verse)= each row is a vector DOESN'T HAVE A METRIC 

# what is the probability that each row is connected to each genre 

# genre as a target variable 

# stop words and lemmatize 

# TARGET = lables 

# cosign similarity 

In [4]:
# split in train () and test 

In [None]:
cnt_pro = df['Product'].value_counts()
plt.figure(figsize=(12,4))
sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Product', fontsize=12)
plt.xticks(rotation=90)
plt.show();

In [5]:
# train the model on the dataset 

# knn, classifier
# logistic reg  classifier 
# random forest classifier 
# adaboost classifier 
# naive beys classifier 

# hyper parameter tuening 

# Misclassification rate, f1 score, misclassification 

In [6]:
# then do the same preprocessing thing on the new data, after manually labeling by genre 
# All i need is text and genre 
# features = text 
# target = predicting genre 

In [7]:
# testing under supervised 
# subreddit on genres 

# ORRRR

# testing on another version ( check shape and heads to see punctuation (similar translation or not?))


In [8]:
# blogs before this step 

# scoring on model ? 


# f1 score, sensitivity, sensitivity, misclassification, accuracy 