In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
import sklearn.metrics as metrics
import pickle
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score

from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

df = pd.read_csv('../data/cleaned/final_df.csv', index_col=0)

Using TensorFlow backend.


In [2]:
df.target.value_counts()

3    281
4    271
5    179
2    128
1    103
0     69
Name: target, dtype: int64

# GridSearch

In [3]:
df = df.drop(columns=['target'])

In [4]:
df.tail()


Unnamed: 0,date,info,text
49,1998,The Hours,and here she is herself clarissa not mrs dallo...
56,1985,Perfume: The Story of a Murderer,when they finally did dare it at first with st...
59,1999,White Teeth,archie for one watched the mouse he watched it...
61,2002,Any Human Heart,my personal rollercoaster not so much a roller...
63,2008,The Outcast,he didn t think about it he went straight to a...


In [5]:
bin_list = [
    [0, 1670, 1800, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1960, np.inf],
    [0, 1670, 1830, 1890, 1920, 1945, np.inf],
    [0, 1670, 1830, 1890, 1920, 1950, np.inf],
    [0, 1670, 1830, 1890, 1910, 1945, np.inf],
    [0, 1670, 1830, 1890, 1930, 1975, np.inf],
    [0, 1700, 1800, 1870, 1910, 1945, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf],
    [0, 1700, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1975, np.inf],
    [0, 1670, 1830, 1890, 1920, 1975, np.inf],
    [0, 1600, 1700, 1800, 1900, 1950, np.inf],
    [0, 1670, 1830, 1920, 1950, 1990, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf]
]

In [6]:
cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 4),
                        strip_accents='unicode')

tvec = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 4),
                        encoding='utf-8')

In [7]:
def make_targets(bin_list, model, vectorizer, df=df):
    for b in bin_list:
        bins = b
        bin_names = range(0, 6)
        df['target'] = pd.cut(df['date'], bins, labels=bin_names)
        df.groupby('target').count()

        #train test split
        x = df['text']
        y = df['target']
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)
        
        #using countvectorizer on the x_train and x_test
        train_data = vectorizer.fit_transform(x_train.apply(lambda x: np.str_(x)))
        test_data = vectorizer.transform(x_test.apply(lambda x: np.str_(x)))
        
        
        #instantiating, fitting, and scoring the model
        model = model
        model.fit(train_data, y_train)
        score = model.score(test_data, y_test)
        print(f' Accuracy of Bin {bins} with {vectorizer}: {score}')

In [None]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=cvec) 

In [8]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=cvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.8502415458937198
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.82125

In [None]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=tvec)


In [9]:
make_targets(bin_list, model=LogisticRegression(), vectorizer=tvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf] with TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None): 0.8260869565217391
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf] with TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accent

 Accuracy of Bin [0, 1670, 1830, 1920, 1950, 1990, inf] with TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None): 0.6859903381642513
 Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf] with TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accent

# Binning & Cleaning

In [11]:
bins = [0, 1670, 1830, 1870, 1910, 1945, np.inf]
names = [0, 1, 2, 3, 4, 5]

df['target'] = pd.cut(df['date'], bins, labels=names)


df.groupby('target').count()


Unnamed: 0_level_0,date,info,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,150,150,150
1,188,188,188
2,162,162,159
3,145,145,145
4,205,205,205
5,181,181,181


In [12]:
df.head()

Unnamed: 0,date,info,text,target
242,1528,The book of the Courtier,then the soul freed from vice purged by studie...,0
188,1569,Planine,his goodly frame the earth seems to me a steri...,0
189,1603,Hamlet,f one lives where all suffer and starve one ac...,0
188,1569,Planine,his goodly frame the earth seems to me a steri...,0
237,1623,macbeth,mine eyes are made the fools o the other sense...,0


In [13]:
lemmatizer = WordNetLemmatizer()
st = df['text'].tolist()


def clean_text(raw_text):
    raw_text = str(raw_text)
    lower_case = raw_text.lower()
    retokenizer = RegexpTokenizer(r'[a-z]+')
    words = retokenizer.tokenize(lower_case)
    
    return(lemmatizer.lemmatize(" ".join(words)))

num_excerpts = df['text'].size

clean_text_excerpts = []

for i in range(0, num_excerpts):
     clean_text_excerpts.append( clean_text( st[i] ))


df['text'] = clean_text_excerpts

In [14]:
df.to_csv('../data/cleaned/final_df.csv')

# Imbalance Learn Techniques

In [None]:
x = df['text']
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)

cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 5),
                        strip_accents='unicode')

x_train = cvec.fit_transform(x_train.apply(lambda x: np.str_(x)))



x_test = cvec.transform(x_test.apply(lambda x: np.str_(x)))


In [None]:
def rebalance_train_test_logreg(X, y, rebalance_alg, rebalancing_title):

    
    # Rebalance train data
    rebalance = rebalance_alg
    x_reb, y_reb = rebalance.fit_sample(x_train, y_train)

    # Train a Logistic Regression model on resampled data
    logreg = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')
    logreg.fit(x_reb, y_reb)

    # Generate predictions
    y_pred = logreg.predict(x_test)

    # Print out metrics
    print(f' Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f' Precision Score: {precision_score(y_test, y_pred, average = None)}')
    print(f' Recall Score: {recall_score(y_test, y_pred, average = None)}')

    return y_pred

In [None]:
rebalance_train_test_logreg(x_train, y, SMOTE(), 'SMOTE')

In [None]:
rebalance_train_test_logreg(x_train, y, ADASYN(), 'ADASYN')

In [None]:
rebalance_train_test_logreg(x_train, y, BorderlineSMOTE(), 'BorderlineSMOTE')

In [None]:
rebalance_train_test_logreg(x_train, y, SMOTETomek(), 'SMOTETomek')

In [None]:
rebalance_train_test_logreg(x_train, y, SMOTEENN(), 'SMOTEENN')

In [None]:
rebalance_train_test_logreg(x_train, y, RandomUnderSampler(), 'Random')

In [None]:
rebalance_train_test_logreg(x_train, y, CondensedNearestNeighbour(), 'CNN')

In [None]:
rebalance_train_test_logreg(x_train, y, EditedNearestNeighbours(), 'CNN')

In [None]:
rebalance_train_test_logreg(x_train, y, RepeatedEditedNearestNeighbours(), 'CNN')