In [759]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
import sklearn.metrics as metrics
import pickle
import warnings
warnings.filterwarnings('ignore')

import catboost as cb
from catboost import CatBoostClassifier

from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score

from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

# df = pd.read_csv('../data/cleaned/final_df.csv', index_col=0)


# GridSearch

In [1644]:
df = pd.read_csv('final_book_df3.csv', index_col=0)

In [1705]:
# df.to_csv('final_book_df3.csv')

In [1698]:
df[df.duplicated(subset=['text'])][50:]

Unnamed: 0,date,info,text,target
482,1850,"['FreedomPamphlet.', 'PRICEONEPENNY.', 'THERIG...",those from whom it proceeds are the masters of...,2
483,1861,"[""Lincoln'sFirstInauguralAddress"", 'March4,186...",by some action not provided for in the instrum...,2
489,1841,"['BARNABYRUDGE', ""ATALEOFTHERIOTSOF'EIGHTY"", '...",edith in a speech in parliament on frequent ex...,2
498,1854,"['HIMALAYANJOURNALS', 'or', 'NOTESOFANATURALIS...",dgeworthia crab apple chameleon and porcupine ...,2
564,1911,"['THETALEOF', 'TIMMYTIPTOES', '[Illustration]'...",goody tiptoes but where is chippy hackee my hu...,4
578,1920,Farcical History of Richard Greenow,t is possible for those who desire it incredib...,4
642,1920,The Death of Lully,he young man returned to his couch under the a...,4
688,1880,The Revolutionary's Handbook,great rulers cannot do codes and religions can...,2
729,1922,The DIamond as big as the ritz,john s first two years there passed pleasantly...,4
955,1985,the handmaids tale,falling in love we said i fell for him we were...,5


In [1708]:
df.shape

(962, 4)

In [1707]:
df = df.drop_duplicates(subset=['text'], keep='first', inplace=False)

In [1709]:
lemmatizer = WordNetLemmatizer()
st = df['text'].tolist()


def clean_text(raw_text):
    raw_text = str(raw_text)
    lower_case = raw_text.lower()
    retokenizer = RegexpTokenizer(r'[a-z]+')
    words = retokenizer.tokenize(lower_case)
    
    return(lemmatizer.lemmatize(" ".join(words)))

num_excerpts = df['text'].size

clean_text_excerpts = []

for i in range(0, num_excerpts):
     clean_text_excerpts.append( clean_text( st[i] ))


df['text'] = clean_text_excerpts

In [1710]:
bin_list = [
    [0, 1670, 1800, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1960, np.inf],
    [0, 1670, 1830, 1890, 1920, 1945, np.inf],
    [0, 1670, 1830, 1890, 1920, 1950, np.inf],
    [0, 1670, 1830, 1890, 1910, 1945, np.inf],
    [0, 1670, 1830, 1890, 1930, 1975, np.inf],
    [0, 1700, 1800, 1870, 1910, 1945, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf],
    [0, 1700, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1975, np.inf],
    [0, 1670, 1830, 1890, 1920, 1975, np.inf],
    [0, 1600, 1700, 1800, 1900, 1950, np.inf],
    [0, 1670, 1830, 1920, 1950, 1990, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf]
]

In [1711]:
cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 3),
                        strip_accents='unicode')

tvec = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 3),
                        encoding='utf-8')

In [1712]:
df['date'] = df['date'].astype(int)

In [1713]:
def make_targets(bin_list, model, vectorizer, df=df):
    for b in bin_list:
        bins = b
        bin_names = range(0, 6)
        df['target'] = pd.cut(df['date'], bins, labels=bin_names)
        df.groupby('target').count()

        #train test split
        x = df['text']
        y = df['target']
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)
        
        #using countvectorizer on the x_train and x_test
        train_data = vectorizer.fit_transform(x_train.apply(lambda x: np.str_(x)))
        test_data = vectorizer.transform(x_test.apply(lambda x: np.str_(x)))
        
        
        #instantiating, fitting, and scoring the model
        model = model
        model.fit(train_data, y_train)
        score = model.score(test_data, y_test)
        print(f' Accuracy of Bin {bins}: {score}')

In [1714]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=cvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.6683937823834197
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.6424870466321243
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.6528497409326425
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.6839378238341969
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.6735751295336787
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.6735751295336787
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.6424870466321243
 Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.689119170984456
 Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.6787564766839378
 Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.6735751295336787
 Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.6683937823834197
 Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.6217616580310881
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1975, inf]: 0.616580310880829
 Accuracy of B

In [1490]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=cvec) 

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.5365853658536586
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.5756097560975609
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.624390243902439
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.5902439024390244
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.5560975609756098
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.5560975609756098
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.6292682926829268
 Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.5853658536585366
 Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.5951219512195122
 Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.5414634146341464
 Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.5609756097560976
 Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.6048780487804878
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1975, inf]: 0.6097560975609756
 Accuracy of 

In [373]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=tvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.7912621359223301
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.7669902912621359
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.7669902912621359
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.7864077669902912
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.7087378640776699
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.7621359223300971
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.7524271844660194
 Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.7524271844660194
 Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.7038834951456311
 Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.7427184466019418
 Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.7669902912621359
 Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.7912621359223301
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1975, inf]: 0.6941747572815534
 Accuracy of

In [374]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=tvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.8543689320388349
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.8349514563106796
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.8155339805825242
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.8398058252427184
 Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.8106796116504854
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.8252427184466019
 Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.8349514563106796
 Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.8058252427184466
 Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.7281553398058253
 Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.8446601941747572
 Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.7912621359223301
 Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.8203883495145631
 Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1975, inf]: 0.7669902912621359
 Accuracy of

In [17]:
# df.to_csv('rebalanced_df.csv')

# Binning & Cleaning

In [1722]:
bins = [0, 1670, 1800, 1870, 1920, 1945, np.inf]
names = [0, 1, 2, 3, 4, 5]

df['target'] = pd.cut(df['date'], bins, labels=names)


df.groupby('target').count()



Unnamed: 0_level_0,date,info,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,147,147,147
1,130,130,130
2,156,156,156
3,201,201,201
4,149,149,149
5,60,60,60
6,119,119,119


In [1723]:
x = df['text']
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)



cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 3),
                        strip_accents='unicode')

x_train = cvec.fit_transform(x_train.apply(lambda x: np.str_(x)))



x_test = cvec.transform(x_test.apply(lambda x: np.str_(x)))


In [512]:
lr = LogisticRegression()

lr_params = {
    'solver': ['lbfgs', 'sag'],
    'class_weight': ['balanced'],
    'multi_class': ['multinomial', 'auto', 'warn'],
    'penalty': ['none', 'l2']
}

gs = GridSearchCV(lr, param_grid=lr_params)
gs.fit(x_train, y_train)
print(gs.best_score_)
gs.best_params_

0.5107212475633528


{'class_weight': 'balanced',
 'multi_class': 'warn',
 'penalty': 'l2',
 'solver': 'lbfgs'}

In [446]:
# df.to_csv('../data/cleaned/final_df.csv')

# Imbalance Learn Techniques

In [1724]:
def rebalance_train_test_logreg(x, y, rebalance_alg, algorithm_name):

    
    # Rebalance train data
    rebalance = rebalance_alg
    x_reb, y_reb = rebalance.fit_sample(x_train, y_train)

    # Train a Logistic Regression model on resampled data
    logreg = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')
    logreg.fit(x_reb, y_reb)

    # Generate predictions
    y_pred = logreg.predict(x_test)

    # Print out metrics
    print(f' Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f' Precision Score: {precision_score(y_test, y_pred, average = None)}')
    print(f' Recall Score: {recall_score(y_test, y_pred, average = None)}')

    return y_pred

In [1725]:
rebalance_train_test_logreg(x_train, y_train, SMOTE(), 'SMOTE')

 Accuracy Score: 0.5233160621761658
 Precision Score: [0.65714286 0.59259259 0.4516129  0.75862069 0.44827586 0.15789474
 0.43478261]
 Recall Score: [0.76666667 0.61538462 0.4516129  0.55       0.43333333 0.25
 0.41666667]


array([3, 1, 4, 0, 2, 6, 1, 1, 2, 1, 1, 3, 5, 3, 5, 0, 3, 6, 0, 4, 2, 0,
       0, 2, 4, 3, 5, 0, 0, 0, 0, 3, 4, 6, 3, 5, 1, 4, 2, 3, 3, 1, 1, 2,
       1, 4, 6, 4, 3, 1, 1, 4, 2, 0, 5, 5, 0, 2, 2, 0, 3, 2, 0, 6, 4, 3,
       2, 4, 3, 3, 4, 4, 1, 0, 0, 2, 1, 4, 3, 6, 5, 3, 2, 5, 5, 1, 1, 2,
       6, 1, 0, 6, 0, 0, 1, 1, 6, 5, 3, 6, 4, 6, 5, 0, 0, 2, 2, 4, 4, 3,
       2, 0, 0, 2, 5, 5, 2, 4, 6, 0, 4, 1, 4, 6, 5, 2, 2, 0, 4, 3, 2, 1,
       1, 3, 3, 4, 3, 6, 3, 3, 4, 2, 0, 4, 3, 3, 1, 4, 6, 4, 2, 6, 0, 5,
       6, 6, 0, 1, 2, 0, 1, 2, 3, 1, 4, 0, 5, 0, 2, 6, 6, 2, 0, 2, 1, 6,
       5, 0, 2, 4, 0, 4, 0, 1, 0, 5, 3, 5, 6, 6, 2, 3, 4])

In [1726]:
rebalance_train_test_logreg(x_train, y_train, ADASYN(), 'ADASYN')

ValueError: No samples will be generated with the provided ratio settings.

In [1727]:
rebalance_train_test_logreg(x_train, y_train, BorderlineSMOTE(), 'BorderlineSMOTE')

 Accuracy Score: 0.538860103626943
 Precision Score: [0.62162162 0.57692308 0.61111111 0.75       0.53571429 0.13636364
 0.38461538]
 Recall Score: [0.76666667 0.57692308 0.35483871 0.675      0.5        0.25
 0.41666667]


array([3, 1, 4, 0, 3, 6, 1, 0, 3, 0, 2, 3, 1, 1, 5, 6, 1, 4, 0, 4, 3, 0,
       0, 3, 4, 3, 5, 0, 0, 0, 0, 3, 4, 6, 3, 5, 1, 4, 3, 3, 3, 1, 1, 0,
       1, 4, 6, 4, 4, 1, 1, 4, 2, 4, 5, 5, 6, 2, 2, 2, 3, 3, 0, 0, 0, 4,
       1, 6, 3, 2, 4, 5, 1, 0, 0, 2, 1, 6, 3, 6, 5, 3, 6, 5, 5, 1, 1, 3,
       6, 1, 0, 6, 0, 0, 1, 3, 6, 5, 3, 5, 5, 4, 5, 0, 0, 2, 2, 4, 4, 3,
       2, 0, 3, 2, 5, 5, 4, 0, 6, 0, 4, 1, 2, 2, 5, 4, 3, 6, 4, 3, 4, 1,
       2, 3, 4, 5, 3, 6, 3, 3, 4, 6, 0, 4, 3, 3, 2, 2, 6, 3, 1, 5, 0, 5,
       6, 6, 0, 0, 0, 0, 1, 4, 3, 1, 6, 0, 6, 0, 2, 6, 6, 3, 3, 1, 1, 6,
       5, 0, 2, 4, 0, 4, 0, 1, 0, 6, 3, 5, 5, 6, 0, 3, 4])

In [1728]:
rebalance_train_test_logreg(x_train, y_train, SMOTETomek(), 'SMOTETomek')

 Accuracy Score: 0.5751295336787565
 Precision Score: [0.66666667 0.56666667 0.64       0.72727273 0.61538462 0.15789474
 0.5       ]
 Recall Score: [0.66666667 0.65384615 0.51612903 0.6        0.53333333 0.25
 0.625     ]


array([3, 1, 4, 6, 3, 6, 1, 0, 2, 2, 1, 3, 5, 1, 5, 6, 3, 6, 0, 4, 2, 0,
       5, 3, 4, 3, 5, 0, 0, 0, 0, 3, 4, 6, 3, 6, 1, 4, 2, 3, 3, 6, 1, 0,
       1, 4, 6, 4, 3, 1, 1, 4, 2, 0, 5, 5, 0, 2, 2, 2, 3, 2, 0, 6, 4, 4,
       1, 4, 3, 3, 4, 5, 1, 0, 6, 2, 1, 4, 3, 6, 5, 3, 0, 2, 5, 6, 1, 0,
       6, 1, 0, 6, 0, 0, 1, 3, 6, 5, 3, 2, 6, 6, 5, 0, 0, 2, 2, 4, 4, 3,
       2, 0, 2, 2, 6, 6, 4, 1, 6, 0, 4, 1, 1, 6, 5, 0, 3, 0, 4, 3, 2, 1,
       1, 3, 3, 4, 3, 6, 3, 3, 1, 4, 0, 4, 3, 3, 1, 2, 6, 4, 1, 6, 0, 5,
       6, 6, 1, 1, 2, 1, 1, 3, 3, 2, 0, 5, 5, 5, 4, 4, 5, 2, 3, 1, 2, 6,
       5, 0, 2, 1, 0, 4, 0, 1, 0, 6, 3, 5, 6, 6, 2, 3, 4])