In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
import sklearn.metrics as metrics
import pickle
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score

from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

df = pd.read_csv('../data/processed/balanced_data.csv', index_col=0)

Using TensorFlow backend.


In [2]:
df.target.value_counts()

3    200
2    200
5    181
4    150
1    150
0    150
Name: target, dtype: int64

# GridSearch

In [3]:
df = df.drop(columns=['target'])

In [4]:
df.head()


Unnamed: 0,date,info,text
235,1623,King JOhn,es that i will and wherefore will i do it i to...
242,1528,The book of the Courtier,then the soul freed from vice purged by studie...
238,1611,the tempest,why as i told thee tis a custom with him i th ...
180,1550,Great sonnets,t me not to the marriage of true minds admit i...
197,1600,A midsummer nights drea,overs and madmen have such seething brains suc...


In [5]:
bin_list = [
    [0, 1670, 1800, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1960, np.inf],
    [0, 1670, 1830, 1890, 1920, 1945, np.inf],
    [0, 1670, 1830, 1890, 1920, 1950, np.inf],
    [0, 1670, 1830, 1890, 1910, 1945, np.inf],
    [0, 1670, 1830, 1890, 1930, 1975, np.inf],
    [0, 1700, 1800, 1870, 1910, 1945, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf],
    [0, 1700, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1975, np.inf],
    [0, 1670, 1830, 1890, 1920, 1975, np.inf],
    [0, 1600, 1700, 1800, 1900, 1950, np.inf],
    [0, 1670, 1830, 1920, 1950, 1990, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf]
]

In [6]:
cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 4),
                        strip_accents='unicode')

tvec = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 4),
                        encoding='utf-8')

In [7]:
def make_targets(bin_list, model, vectorizer, df=df):
    for b in bin_list:
        bins = b
        bin_names = range(0, 6)
        df['target'] = pd.cut(df['date'], bins, labels=bin_names)
        df.groupby('target').count()

        #train test split
        x = df['text']
        y = df['target']
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)
        
        #using countvectorizer on the x_train and x_test
        train_data = vectorizer.fit_transform(x_train.apply(lambda x: np.str_(x)))
        test_data = vectorizer.transform(x_test.apply(lambda x: np.str_(x)))
        
        
        #instantiating, fitting, and scoring the model
        model = model
        model.fit(train_data, y_train)
        score = model.score(test_data, y_test)
        print(f' Accuracy of Bin {bins} with {vectorizer}: {score}')

In [None]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=cvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.5652173913043478
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.60386

In [57]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=cvec)

 Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 5), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.5329341317365269
 Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf] with CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 5), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None): 0.53293

In [None]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=tvec)

In [None]:
make_targets(bin_list, model=LogisticRegression(), vectorizer=tvec)

# Binning & Cleaning

In [13]:
bins = [0, 1670, 1800, 1870, 1920, 1945, np.inf]
names = [0, 1, 2, 3, 4, 5]

df['target'] = pd.cut(df['date'], bins, labels=names)


df.groupby('target').count()


Unnamed: 0_level_0,date,info,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,71,71,71
1,83,83,83
2,326,326,326
3,953,953,953
4,97,97,97
5,181,181,181


In [14]:
df.head()

Unnamed: 0,date,info,target,text
0,1868,"['BABBALLADSANDSAVOYSONGS', 'by', 'W.H.GILBERT...",2,"['ces handed round on trays.', '', ' Then cur..."
2,1900,"['BonniePrinceCharlie', 'ATaleofFontenoyandCul...",3,['n fixed in their bed. Of course we have not ...
3,1894,"['ArmsandtheMan', 'byGeorgeBernardShaw', 'INTR...",3,['tiful in his physical impotence but strong i...
5,1902,"['MARCHINGONNIAGARA', 'ORTHESOLDIERBOYSOFTHEOL...",3,"[' of', 'his trading post.""', '', '""I\'m afrai..."
6,1914,"['NIGHTWATCHES', 'byW.W.Jacobs', 'KEEPINGWATCH...",3,"['pper, swelling his chest and looking', ""roun..."


In [15]:
lemmatizer = WordNetLemmatizer()
st = df['text'].tolist()


def clean_text(raw_text):
    raw_text = str(raw_text)
    lower_case = raw_text.lower()
    retokenizer = RegexpTokenizer(r'[a-z]+')
    words = retokenizer.tokenize(lower_case)
    
    return(lemmatizer.lemmatize(" ".join(words)))

num_excerpts = df['text'].size

clean_text_excerpts = []

for i in range(0, num_excerpts):
     clean_text_excerpts.append( clean_text( st[i] ))


df['text'] = clean_text_excerpts

In [17]:
# df.to_csv('finaldf_imbalanced_targets.csv')

# Balancing

In [None]:
count_class_0, count_class_1, count_class_2, count_class_3, count_class_4, count_class_5 = df.target.value_counts()

In [None]:
df.target.value_counts()

In [None]:

count_class_0, count_class_1, count_class_2, count_class_3, count_class_4, count_class_5 = df.target.value_counts()

df_class_0 = df[df['target'] == 3] 
df_class_1 = df[df['target'] == 4] 
df_class_2 = df[df['target'] == 2]
df_class_3 = df[df['target'] == 5]
df_class_4 = df[df['target'] == 1]
df_class_5 = df[df['target'] == 0]

df_class_2 = df_class_2.sample(181, replace=True)
df_class_0 = df_class_0.sample(181, replace=True)
df_class_1 = df_class_1.sample(181, replace=True)
df_class_4 = df_class_4.sample(181, replace=True)
df_class_5 = df_class_5.sample(181, replace=True)



In [None]:



df_text_over = pd.concat([df_class_0, df_class_1, df_class_2, df_class_3, df_class_4, df_class_5])
print(df.target.value_counts())
print(df_text_over.target.value_counts())

df = df_text_over

In [None]:
df.shape

In [None]:
x = df['text']
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)



In [None]:
cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 5),
                        strip_accents='unicode')

train_data_features = cvec.fit_transform(x_train.apply(lambda x: np.str_(x)))



test_data_features = cvec.transform(x_test.apply(lambda x: np.str_(x)))



In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [None]:
knn.fit(train_data_features, y_train)

In [None]:
knn.score(test_data_features, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix 

y_pred = knn.predict(test_data_features)

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.ensemble import BaggingClassifier

# bag = BaggingClassifier(n_estimators=1000, max_samples=0.9, max_features=0.5, bootstrap=True)
# bag.fit(train_data_features, y_train)
# bag.score(test_data_features, y_test)

# Imbalance Learn Techniques

In [58]:
x = df['text']
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)

cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 5),
                        strip_accents='unicode')

x_train = cvec.fit_transform(x_train.apply(lambda x: np.str_(x)))



x_test = cvec.transform(x_test.apply(lambda x: np.str_(x)))


In [65]:
def rebalance_train_test_logreg(X, y, rebalance_alg, rebalancing_title):

    
    # Rebalance train data
    rebalance = rebalance_alg
    x_reb, y_reb = rebalance.fit_sample(x_train, y_train)

    # Train a Logistic Regression model on resampled data
    logreg = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')
    logreg.fit(x_reb, y_reb)

    # Generate predictions
    y_pred = logreg.predict(x_test)

    # Print out metrics
    print(f' Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f' Precision Score: {precision_score(y_test, y_pred, average = None)}')
    print(f' Recall Score: {recall_score(y_test, y_pred, average = None)}')

    return y_pred

In [66]:
rebalance_train_test_logreg(x_train, y, SMOTE(), 'SMOTE')

 Accuracy Score: 0.38922155688622756
 Precision Score: [0.3030303  0.30769231 0.44444444 0.28571429 0.4        0.51515152]
 Recall Score: [0.58823529 0.34782609 0.3902439  0.21052632 0.32258065 0.47222222]


array([5, 0, 2, 5, 2, 0, 4, 1, 0, 1, 2, 1, 4, 4, 0, 4, 1, 1, 5, 4, 0, 2,
       1, 4, 4, 0, 4, 5, 0, 4, 0, 0, 4, 2, 2, 0, 3, 1, 3, 5, 0, 5, 0, 0,
       5, 0, 0, 2, 4, 5, 0, 5, 1, 4, 0, 5, 5, 1, 4, 0, 0, 2, 1, 1, 2, 4,
       1, 2, 5, 1, 0, 0, 1, 5, 3, 5, 1, 4, 0, 2, 5, 2, 3, 0, 4, 3, 2, 5,
       4, 4, 0, 3, 2, 2, 0, 4, 3, 5, 0, 3, 4, 0, 5, 2, 4, 0, 0, 2, 2, 1,
       1, 5, 1, 1, 1, 4, 2, 5, 5, 2, 4, 5, 5, 5, 0, 5, 2, 3, 3, 2, 0, 3,
       0, 1, 4, 2, 2, 2, 1, 2, 5, 2, 5, 2, 5, 2, 2, 1, 2, 5, 2, 0, 4, 3,
       5, 2, 2, 5, 2, 5, 2, 1, 5, 1, 3, 1, 3])

In [None]:
rebalance_train_test_logreg(x_train, y, ADASYN(), 'ADASYN')

In [None]:
rebalance_train_test_logreg(x_train, y, BorderlineSMOTE(), 'BorderlineSMOTE')

In [61]:
rebalance_train_test_logreg(x_train, y, SMOTETomek(), 'SMOTETomek')

 Accuracy Score: 0.38323353293413176
 Precision Score: [0.22580645 0.38461538 0.42424242 0.14285714 0.4        0.57575758]
 Recall Score: [0.41176471 0.43478261 0.34146341 0.10526316 0.38709677 0.52777778]


array([5, 0, 2, 5, 2, 0, 4, 1, 4, 5, 4, 2, 4, 4, 0, 4, 1, 1, 5, 4, 0, 2,
       1, 2, 4, 0, 2, 4, 0, 4, 0, 0, 4, 2, 3, 0, 3, 0, 3, 5, 5, 5, 1, 5,
       5, 1, 0, 2, 3, 5, 2, 5, 5, 0, 3, 5, 0, 1, 4, 4, 0, 2, 2, 1, 3, 4,
       1, 2, 5, 1, 5, 0, 1, 0, 1, 5, 3, 1, 0, 2, 5, 2, 2, 0, 4, 3, 1, 4,
       4, 4, 1, 3, 2, 1, 0, 4, 2, 5, 4, 2, 4, 0, 0, 4, 3, 0, 0, 4, 4, 1,
       1, 0, 4, 0, 1, 4, 2, 3, 5, 2, 4, 5, 0, 5, 5, 5, 2, 3, 3, 4, 5, 2,
       0, 1, 1, 2, 2, 4, 5, 1, 5, 1, 5, 2, 5, 2, 2, 0, 2, 0, 2, 0, 4, 2,
       5, 2, 1, 5, 2, 5, 2, 1, 5, 0, 3, 1, 5])

In [None]:
rebalance_train_test_logreg(x_train, y, SMOTEENN(), 'SMOTEENN')

In [None]:
rebalance_train_test_logreg(x_train, y, RandomUnderSampler(), 'Random')

In [62]:
rebalance_train_test_logreg(x_train, y, CondensedNearestNeighbour(), 'CNN')

 Accuracy Score: 0.4311377245508982
 Precision Score: [0.31428571 0.5        0.41666667 0.5        0.55       0.5       ]
 Recall Score: [0.64705882 0.26086957 0.73170732 0.05263158 0.35483871 0.36111111]


array([0, 5, 2, 0, 2, 0, 4, 1, 2, 2, 2, 2, 2, 4, 0, 4, 2, 5, 2, 4, 0, 2,
       2, 4, 4, 5, 1, 5, 2, 4, 0, 0, 4, 2, 2, 0, 2, 0, 3, 4, 5, 0, 0, 5,
       2, 0, 0, 2, 2, 4, 0, 5, 2, 0, 0, 4, 0, 2, 2, 0, 0, 2, 2, 1, 2, 4,
       0, 2, 2, 2, 0, 0, 2, 5, 2, 2, 2, 1, 0, 2, 5, 1, 2, 0, 4, 2, 2, 5,
       4, 2, 0, 2, 2, 2, 5, 4, 2, 5, 5, 2, 4, 0, 5, 2, 4, 5, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 5, 2, 4, 2, 2, 0, 0, 2, 2, 2, 3, 2, 0, 2,
       0, 1, 4, 2, 2, 2, 5, 1, 1, 2, 5, 2, 5, 1, 2, 5, 2, 2, 2, 5, 4, 2,
       0, 2, 1, 5, 2, 5, 2, 1, 5, 0, 5, 1, 5])

In [None]:
rebalance_train_test_logreg(x_train, y, EditedNearestNeighbours(), 'CNN')

In [None]:
rebalance_train_test_logreg(x_train, y, RepeatedEditedNearestNeighbours(), 'CNN')