In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
bbe = pd.read_csv('./my_datasets/bbe_genre_text.csv')

In [3]:
bbe.head()

Unnamed: 0,genre,text
0,1,At the first God made the heaven and the earth.
1,1,And the earth was waste and without form; and ...
2,1,"And God said, Let there be light: and there wa..."
3,1,"And God, looking on the light, saw that it was..."
4,1,"Naming the light, Day, and the dark, Night. An..."


# Baseline class percentage 

In [4]:
bbe['genre'].value_counts(normalize=True, ascending=True)

8    0.012989
6    0.032376
1    0.049288
7    0.088995
5    0.152333
3    0.181462
4    0.217921
2    0.264637
Name: genre, dtype: float64

# Train/test split 

In [5]:

X = bbe['text']
y = bbe['genre']

print(X.shape)
print(y.shape)

(31103,)
(31103,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=51419,
                                                   stratify=y)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23327,), (7776,), (23327,), (7776,))

## Simple TFIDF and Logistic Regression baseline score before parameter tuening 

In [8]:
tf = TfidfVectorizer(stop_words = 'english')


In [9]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ])

In [10]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 



0.6681113449139917

In [11]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [12]:
pipe.score(X_train, y_train)

0.7354138980580444

In [13]:
pipe.score(X_test, y_test) 

0.6716820987654321

# Tfidf parameter tuening 

In [14]:
def scale_model_evaluate(X, y, model_name='lr', 
                         tokenizer_name='tfidf', 
                         tokenizer = TfidfVectorizer(),
                       model_type=LogisticRegression(), 
                       parameters={'tfidf__max_df': [.1, .2],
                                   'tfidf__min_df': [.09, 1],
                                   'tfidf__ngram_range': (1.3, 1),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__n_jobs': [1], 
                                  }
                        ):
 
    
    pipe=Pipeline(memory=None,
         steps=[(tokenizer_name,tokenizer),
                (model_name,model_type)])
                         
 
    grid = GridSearchCV(pipe, param_grid=parameters, cv=5)
    grid = grid.fit(X, y) 
    
    
    
    
    print(f"For model: {model_type}")
    print(f"The best parameters are: {grid.best_params_}")
    print(f"The best score is: {grid.best_score_:.2f}")
    return grid


In [15]:
scale_model_evaluate(X_train, y_train, 
                     model_name='lr', 
                     tokenizer_name='tfidf',
                     tokenizer = TfidfVectorizer(),
                     parameters={'tfidf__max_df': [.1, .2],
                                 'tfidf__min_df': [.09, 1],
                                 'tfidf__ngram_range':[(1, 1), (1, 2)],
                                 'lr__penalty': ['l1', 'l2'],
                                 'lr__n_jobs': [1]
                                } )



For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
The best parameters are: {'lr__n_jobs': 1, 'lr__penalty': 'l2', 'tfidf__max_df': 0.2, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
The best score is: 0.68


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__max_df': [0.1, 0.2], 'tfidf__min_df': [0.09, 1], 'tfidf__ngram_range': [(1, 1), (1, 2)], 'lr__penalty': ['l1', 'l2'], 'lr__n_jobs': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

# Multinomial NB and Tfidf 

In [16]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ])

In [17]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 

0.5999072065370052

In [18]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
pipe.score(X_train, y_train)

0.6492047841556994

In [20]:
pipe.score(X_test, y_test) 

0.6006944444444444

## Doc-2-vec 

Source : https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [21]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import stopwords 

In [23]:
bbe.head()

Unnamed: 0,genre,text
0,1,At the first God made the heaven and the earth.
1,1,And the earth was waste and without form; and ...
2,1,"And God said, Let there be light: and there wa..."
3,1,"And God, looking on the light, saw that it was..."
4,1,"Naming the light, Day, and the dark, Night. An..."


## Train/test split 

In [26]:
train, test = train_test_split(bbe, test_size=0.3, random_state=1619)

In [27]:
text = bbe['text']
genre = bbe['genre']

##  Tokenize text 

In [28]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)

In [29]:
train_tagged.values[200]

TaggedDocument(words=['for', 'the', 'lord', 'has', 'pleasure', 'in', 'his', 'people', 'he', 'gives', 'the', 'poor', 'in', 'spirit', 'crown', 'of', 'salvation'], tags=[3])

## Assigning multi processing cores 

In [30]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [31]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2574451.99it/s]


In [32]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 21772/21772 [00:00<00:00, 1921157.65it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2892476.85it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3062628.26it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2995125.67it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3197086.67it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3276936.40it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3172760.29it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3236519.11it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2990515.68it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3407148.22it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3351625.44it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3386552.45it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3236060.34it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3362981.02it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3364591.82it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3359269.67it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3490897.46it/

CPU times: user 40.4 s, sys: 8.49 s, total: 48.9 s
Wall time: 24.9 s


## Buliding the final vector feature for the classifier 

In [33]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors 

In [34]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [35]:
lr = LogisticRegression(n_jobs=1, C=1e5)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [36]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.27081770442610653
Testing F1 score: 0.2199671753809871


# Distributed Memory with Averaging

In [37]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2552361.41it/s]


In [38]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 21772/21772 [00:00<00:00, 2120625.76it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3301221.41it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2958830.53it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3248030.83it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3267088.36it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3493167.57it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3363352.61it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3411093.60it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3271067.33it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3387054.88it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3305284.01it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3319340.87it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3340835.10it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2991887.38it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2953280.51it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2989047.39it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3445066.84it/

CPU times: user 1min 3s, sys: 19.3 s, total: 1min 22s
Wall time: 43.8 s


# Train Logistic Regression 

In [39]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.4847283249383775
Testing F1 score: 0.4883086950347334


In [40]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [41]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

## Building feature vectors 

In [42]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [43]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [44]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.4589004393955632
Testing F1 score: 0.46462330028460125
