In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns 

## Import asv 

In [2]:
asv = pd.read_csv('./my_datasets/asv_genre_text.csv')


In [3]:
asv.head()

Unnamed: 0,genre,text
0,1,In the beginning God created the heavens and t...
1,1,And the earth was waste and void; and darkness...
2,1,"And God said, Let there be light: and there wa..."
3,1,"And God saw the light, that it was good: and G..."
4,1,"And God called the light Day, and the darkness..."


# Baseline class percentage 

In [4]:
asv['genre'].value_counts(normalize=True, ascending=True)

8    0.012989
6    0.032376
1    0.049288
7    0.088995
5    0.152333
3    0.181462
4    0.217921
2    0.264637
Name: genre, dtype: float64

In [5]:
X = asv['text']
y = asv['genre']

print(X.shape)
print(y.shape)

(31103,)
(31103,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=51619,
                                                   stratify=y)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23327,), (7776,), (23327,), (7776,))

## Simple TFIDF and Logistic Regression baseline score before parameter tuening 

In [8]:
tf = TfidfVectorizer(stop_words = 'english')


In [9]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ])

In [10]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 



0.6996206136054091

In [11]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [12]:
pipe.score(X_train, y_train)

0.7845415184121405

In [13]:
pipe.score(X_test, y_test) 

0.7164351851851852

# Tfidf parameter tuening 

In [14]:
def scale_model_evaluate(X, y, model_name='lr', 
                         tokenizer_name='tfidf', 
                         tokenizer = TfidfVectorizer(),
                       model_type=LogisticRegression(), 
                       parameters={'tfidf__max_df': [.1, .2],
                                   'tfidf__min_df': [.09, 1],
                                   'tfidf__ngram_range': (1.3, 1),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__penalty': ['l1', 'l2'],
                                   'lr__C': np.logspace(0, 5, 10),
                                   'lr__n_jobs': [1], 
                                  }
                        ):
 
    
    pipe=Pipeline(memory=None,
         steps=[(tokenizer_name,tokenizer),
                (model_name,model_type)])
                         
 
    grid = GridSearchCV(pipe, param_grid=parameters, cv=5)
    grid = grid.fit(X, y) 
    
    
    
    
    print(f"For model: {model_type}")
    print(f"The best parameters are: {grid.best_params_}")
    print(f"The best score is: {grid.best_score_:.2f}")
    return grid

In [15]:
scale_model_evaluate(X_train, y_train, 
                     model_name='lr', 
                     tokenizer_name='tfidf',
                     tokenizer = TfidfVectorizer(),
                     parameters={'tfidf__max_df': [.1, .2],
                                 'tfidf__min_df': [.09, 1],
                                 'tfidf__ngram_range':[(1, 1), (1, 2)],
                                 'lr__penalty': ['l1', 'l2'],
                                 'lr__n_jobs': [1]
                                } )



For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
The best parameters are: {'lr__n_jobs': 1, 'lr__penalty': 'l2', 'tfidf__max_df': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
The best score is: 0.71


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__max_df': [0.1, 0.2], 'tfidf__min_df': [0.09, 1], 'tfidf__ngram_range': [(1, 1), (1, 2)], 'lr__penalty': ['l1', 'l2'], 'lr__n_jobs': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

# Multinomial NB and Tfidf 

In [16]:
pipe = Pipeline([
        ('tf', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ])

In [17]:
cross_val_score(pipe, X_train, y_train, cv=5).mean() 

0.6174832997520394

In [18]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
pipe.score(X_train, y_train)

0.6895443048827539

In [20]:
pipe.score(X_test, y_test) 

0.6293724279835391

## Doc-2-vec 

Source : https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [22]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import stopwords 

## Train/test split 

In [26]:
train, test = train_test_split(asv, test_size=0.3, random_state=1619)

In [28]:
text = asv['text']
genre = asv['genre']

##  Tokenize text 

In [29]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.genre]), axis=1)

In [30]:
train_tagged.values[50]

TaggedDocument(words=['and', 'myself', 'will', 'fight', 'against', 'you', 'with', 'an', 'outstretched', 'hand', 'and', 'with', 'strong', 'arm', 'even', 'in', 'anger', 'and', 'in', 'wrath', 'and', 'in', 'great', 'indignation'], tags=[4])

## Assigning multi processing cores 

In [31]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [32]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2421596.04it/s]


In [33]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 21772/21772 [00:00<00:00, 2365454.91it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3110723.08it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3111677.06it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3187155.75it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3277524.47it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3207417.61it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3210687.95it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3220198.42it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3022586.61it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3304327.21it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3270481.58it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3434571.49it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3235487.06it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3311756.97it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3331571.93it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3284243.36it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3240883.94it/

CPU times: user 40.3 s, sys: 8.16 s, total: 48.4 s
Wall time: 24.6 s


# Buliding the final vector feature for the classifier

In [34]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [35]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [36]:
lr = LogisticRegression(n_jobs=1, C=1e5)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [37]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.3101489658128818
Testing F1 score: 0.26045382059050676


  'precision', 'predicted', average, warn_for)


# Distributed Memory with Averaging

In [38]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 21772/21772 [00:00<00:00, 2286045.83it/s]


In [39]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 21772/21772 [00:00<00:00, 2556648.94it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3340346.28it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3083934.57it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3288856.40it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2631123.02it/s]
100%|██████████| 21772/21772 [00:00<00:00, 2452026.92it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3076972.39it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3242379.87it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3388563.09it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3407783.96it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3052289.15it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3266971.48it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3407656.79it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3488097.28it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3528667.52it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3147065.05it/s]
100%|██████████| 21772/21772 [00:00<00:00, 3205503.60it/

CPU times: user 1min 3s, sys: 18 s, total: 1min 21s
Wall time: 42.1 s


# Train Logistic Regression 

In [40]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.4442181974064945
Testing F1 score: 0.45308591707944296


In [41]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [42]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

## Building feature vectors 

In [43]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [44]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [46]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.44314650091094204
Testing F1 score: 0.4572145187788457
