In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time
# Folder that holds train.csv / test.csv.  The original machine-specific path
# is kept as the default so existing setups keep working; set the
# MDST_NLP_DATA_DIR environment variable to point elsewhere without editing
# the notebook.
path_to_datafolder = os.environ.get(
    'MDST_NLP_DATA_DIR',
    'C:/Users/mjdom/source/repos/mdst_nlp_2021/data',
)
# Listing the folder also fails fast if the path does not exist.
print(os.listdir(path_to_datafolder))

['test.csv', 'train.csv']


In [3]:
# Read the labelled training file from the data folder and preview the first rows.
train_csv_path = path_to_datafolder + '/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Raw sentences are the model inputs; copied so later edits never touch `df`.
X = df["text"].copy()

authors = df["author"].copy()

# Single source of truth for the class order used by both label encodings.
AUTHOR_TO_INDEX = {"EAP": 0, "HPL": 1, "MWS": 2}

author_codes = authors.map(AUTHOR_TO_INDEX)
if author_codes.isna().any():
    # A chain of independent `if` tests would skip such rows without warning,
    # leaving the label arrays shorter than (and misaligned with) X.
    unknown_authors = sorted(authors[author_codes.isna()].astype(str).unique())
    raise ValueError(f"Unexpected author label(s): {unknown_authors}")

# Integer class ids: 0 = EAP, 1 = HPL, 2 = MWS.
y_one_vector = author_codes.to_numpy(dtype=int)

# One-hot labels: row i of the identity matrix is the one-hot vector for class i.
y = np.eye(len(AUTHOR_TO_INDEX), dtype=int)[y_one_vector]

In [5]:
# Hold out 20% of the sentences (with their one-hot labels) for evaluation;
# random_state is fixed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Text-vectorisation layer with default settings; it is handed to the CNN
# (via create_model) and added as the first layer of the LSTM model below.
encoder = tf.keras.layers.TextVectorization()
# The vocabulary is built from every sentence in X, i.e. training and
# held-out rows alike.
encoder.adapt(X)


In [7]:
# Record the TensorFlow version in the cell output.  A bare `tf.__version__`
# that is not the last line of a cell is evaluated and then discarded, so it
# is printed explicitly instead.
print("TensorFlow version:", tf.__version__)
# Vocabulary learned by `encoder` when it was adapted to the text.
vocab = encoder.get_vocabulary()

### CNN Model

In [8]:
class CNN1d(tf.keras.Model):
    """Two-branch 1-D convolutional classifier over raw text.

    ``call`` runs the input through ``encoder``, embeds the resulting token
    ids, applies two ``Conv1D`` layers to the same embedded sequence in
    parallel, concatenates their outputs along the last axis, max-pools over
    the sequence axis, and finishes with a ReLU dense layer followed by a
    softmax dense layer.

    Args:
        conv1_filters: Number of filters in the first convolution branch.
        conv1_size: Kernel size of the first convolution branch.
        conv2_filters: Number of filters in the second convolution branch.
        conv2_size: Kernel size of the second convolution branch.
        dense1: Number of units in the hidden dense layer.
        encoder: Layer applied to the raw input first; it must provide
            ``get_vocabulary()``, which is used to size the embedding table.
        embedding_dim: Width of each token embedding. Defaults to 64, the
            value that used to be hard-coded.
        num_classes: Number of softmax outputs. Defaults to 3, the value that
            used to be hard-coded.
    """

    def __init__(self, conv1_filters, conv1_size, conv2_filters, conv2_size, dense1, encoder,
                 embedding_dim=64, num_classes=3):
        super().__init__()

        self.encoder = encoder

        # One embedding row per token known to the encoder.
        vocab_size = len(encoder.get_vocabulary())
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                   output_dim=embedding_dim,
                                                   mask_zero=True)

        # Both branches use padding="same" so their outputs keep the input
        # sequence length and can be concatenated channel-wise in call().
        self.conv1 = tf.keras.layers.Conv1D(filters=conv1_filters,
                                            kernel_size=conv1_size,
                                            padding="same",
                                            activation="relu",
                                            data_format="channels_last",
                                            )
        self.conv2 = tf.keras.layers.Conv1D(filters=conv2_filters,
                                            kernel_size=conv2_size,
                                            padding="same",
                                            activation="relu",
                                            data_format="channels_last",
                                            )
        self.global_pool = tf.keras.layers.GlobalMaxPool1D()
        self.dense1 = tf.keras.layers.Dense(dense1, activation='relu')
        self.dense2 = tf.keras.layers.Dense(num_classes, activation="softmax")

    def call(self, x, training=False):
        """Return the softmax class probabilities for a batch of inputs.

        Args:
            x: Batch of inputs accepted by ``self.encoder``.
            training: Accepted for Keras API compatibility; it is not used by
                any operation in this method.

        Returns:
            The output of the final softmax dense layer.
        """
        emb = self.encoder(x)
        emb = self.embedding(emb)
        # The two convolutions read the same embedded sequence (parallel
        # branches, not a stack).
        conv1 = self.conv1(emb)
        conv2 = self.conv2(emb)
        z = tf.concat([conv1, conv2], axis=2)
        z = self.global_pool(z)
        z = self.dense1(z)
        z = self.dense2(z)
        return z

In [9]:
def create_model(conv1_filters, conv1_size, conv2_filters, conv2_size, dense1):
    """Build and compile a ``CNN1d`` around the notebook-level ``encoder``.

    Args:
        conv1_filters: Filter count of the first convolution branch.
        conv1_size: Kernel size of the first convolution branch.
        conv2_filters: Filter count of the second convolution branch.
        conv2_size: Kernel size of the second convolution branch.
        dense1: Unit count of the hidden dense layer.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam with a
        learning rate of 1e-3, accuracy metric).
    """
    cnn_model = CNN1d(conv1_filters, conv1_size, conv2_filters, conv2_size, dense1, encoder)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    adam = tf.keras.optimizers.Adam(1e-3)
    cnn_model.compile(loss=loss_fn, optimizer=adam, metrics=['accuracy'])
    return cnn_model

In [10]:
callbacks = [
    # Stop once val_loss has gone 3 epochs without improving.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
    # Write weights to "CNN_weights" only when val_accuracy reaches a new maximum.
    tf.keras.callbacks.ModelCheckpoint(
        filepath="CNN_weights",
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True),
]
with tf.device('/device:GPU:0'):
    cnn = create_model(128, 6, 128, 5, 128)
    # validation_steps is intentionally not passed.  Keras documents it for
    # tf.data validation inputs; with the in-memory data used here it would
    # cap validation at 30 batches, so val_loss / val_accuracy (which the
    # callbacks above monitor) would cover only part of the hold-out set.
    history = cnn.fit(X_train, y_train, epochs=1,
                      validation_data=(X_test, y_test),
                      callbacks=callbacks)



In [11]:
# Score the trained CNN on the held-out sentences.
with tf.device('/device:GPU:0'):
    cnn.evaluate(X_test, y_test)



### N-gram model

In [12]:
# Upper bound on the n-gram vocabulary size.
max_features = 1000000
# Same layer class as `encoder`, reached through the stable `tf.keras.layers`
# path that the notebook already uses instead of the `experimental.preprocessing`
# alias.  Configured here for TF-IDF output with ngrams=2.
Vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='tf-idf', ngrams=2)
with tf.device('/device:CPU:0'):
    # Vocabulary and TF-IDF statistics are learned from every sentence in X.
    Vectorizer.adapt(X)
# NOTE: this rebinds the notebook-level `vocab` to the n-gram vocabulary; any
# later cell that reads `vocab` sees this one, not the vocabulary of `encoder`.
vocab = Vectorizer.get_vocabulary()

In [13]:
# N-gram classifier: TF-IDF vectoriser -> small ReLU layer -> dropout -> softmax over 3 authors.
ngram_layers = [
    Vectorizer,
    tf.keras.layers.Dense(25, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax'),
]
model_ngram = tf.keras.Sequential(ngram_layers)

model_ngram.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

In [14]:
with tf.device('/device:GPU:0'):
    # validation_steps is intentionally not passed.  Keras documents it for
    # tf.data validation inputs; with in-memory data it would stop validation
    # after 10 batches, so the reported val_* metrics would cover only part
    # of the hold-out set.
    model_ngram.fit(X_train, y_train, epochs=1, batch_size=64,
                    validation_data=(X_test, y_test))

# Class probabilities for every sentence in the training file.
pred = model_ngram.predict(df['text'])



### LSTM model

In [15]:
# Bidirectional-LSTM classifier over the token ids produced by `encoder`.
LSTM = tf.keras.Sequential()
LSTM.add(encoder)
# Size the embedding table from the vocabulary of the layer that feeds it.
# The notebook-level `vocab` was rebound to the n-gram Vectorizer's vocabulary
# in the n-gram section, so len(vocab) no longer describes `encoder`'s token
# ids and would allocate a table sized for the wrong vocabulary.
LSTM.add(tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                                   output_dim=64,
                                   mask_zero=True))

# return_sequences=True keeps one output per time step so the pooling layer
# below can take the maximum over the sequence.
LSTM.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2, return_sequences=True)))

LSTM.add(tf.keras.layers.GlobalMaxPool1D())

LSTM.add(tf.keras.layers.Dropout(0.2))

LSTM.add(tf.keras.layers.Dense(3, activation='softmax'))

LSTM.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
             optimizer=tf.keras.optimizers.Adam(1e-3),
             metrics=['accuracy'])


In [16]:
with tf.device('/device:GPU:0'):
    # validation_steps is intentionally not passed.  Keras documents it for
    # tf.data validation inputs; with in-memory data it would stop validation
    # after 10 batches, so the reported val_* metrics would cover only part
    # of the hold-out set.
    LSTM.fit(X_train, y_train, epochs=2, batch_size=64,
             validation_data=(X_test, y_test))

Epoch 1/2
Epoch 2/2


In [17]:
# Score the trained LSTM on the held-out sentences; the returned list is the cell output.
LSTM.evaluate(X_test, y_test)



[0.43835070729255676, 0.8296731114387512]

### Collect predictions

In [18]:
# Class probabilities from each level-one model for every sentence in `df`.
# This covers the rows the models were fitted on as well as the held-out rows.
cnn_pred = cnn.predict(df['text'])
ngram_pred = model_ngram.predict(df['text'])
LSTM_pred = LSTM.predict(df['text'])

In [19]:
# One row per sentence: its id, the three class probabilities from each
# level-one model, and the true author.
level_one_preds = {"ngram": ngram_pred, "cnn": cnn_pred, "lstm": LSTM_pred}
class_names = ("EAP", "HPL", "MWS")

ensemble_columns = {"id": df["id"]}
for model_tag, probs in level_one_preds.items():
    for class_pos, class_name in enumerate(class_names):
        # Column k of each prediction array belongs to class_names[k].
        ensemble_columns[f"{class_name}_{model_tag}"] = probs[:, class_pos]
ensemble_columns["actual_author"] = df['author']

ensemble_df = pd.DataFrame(ensemble_columns)
ensemble_df

Unnamed: 0,id,EAP_ngram,HPL_ngram,MWS_ngram,EAP_cnn,HPL_cnn,MWS_cnn,EAP_lstm,HPL_lstm,MWS_lstm,actual_author
0,id26305,0.995004,0.001591,0.003404,0.992398,0.000624,0.006978,0.973642,0.022725,0.003634,EAP
1,id17569,0.025892,0.951124,0.022984,0.658131,0.248983,0.092886,0.041964,0.946103,0.011933,HPL
2,id11008,0.999130,0.000017,0.000852,0.983048,0.013373,0.003579,0.947245,0.050048,0.002706,EAP
3,id27763,0.000770,0.000140,0.999090,0.002356,0.000979,0.996664,0.001704,0.000341,0.997956,MWS
4,id12958,0.000006,0.999993,0.000001,0.619594,0.100870,0.279536,0.047649,0.936279,0.016072,HPL
...,...,...,...,...,...,...,...,...,...,...,...
19574,id17718,0.948939,0.024967,0.026094,0.827273,0.074793,0.097934,0.902995,0.071810,0.025195,EAP
19575,id08973,0.945483,0.013374,0.041143,0.914230,0.007256,0.078513,0.888047,0.013352,0.098602,EAP
19576,id05267,0.993636,0.001886,0.004478,0.987036,0.006523,0.006441,0.991580,0.003177,0.005243,EAP
19577,id17513,0.265845,0.666056,0.068099,0.469771,0.188130,0.342100,0.500659,0.319762,0.179580,EAP


In [20]:
# Level-two features: the nine class probabilities produced by the three level-one models.
feature_columns_final = ['EAP_ngram', 'HPL_ngram', 'MWS_ngram',
                         'EAP_cnn', 'HPL_cnn', 'MWS_cnn',
                         'EAP_lstm', 'HPL_lstm', 'MWS_lstm']
X_final = ensemble_df[feature_columns_final].copy()
authors_final = ensemble_df["actual_author"].copy()

# One-hot labels in the fixed class order EAP, HPL, MWS.
author_to_index_final = {"EAP": 0, "HPL": 1, "MWS": 2}
author_codes_final = authors_final.map(author_to_index_final)
if author_codes_final.isna().any():
    # A chain of independent `if` tests would skip such rows without warning,
    # leaving y_final shorter than (and misaligned with) X_final.
    unknown_authors_final = sorted(authors_final[author_codes_final.isna()].astype(str).unique())
    raise ValueError(f"Unexpected author label(s): {unknown_authors_final}")

# Row i of the identity matrix is the one-hot vector for class i.
y_final = np.eye(len(author_to_index_final), dtype=int)[author_codes_final.to_numpy(dtype=int)]
X_final

Unnamed: 0,EAP_ngram,HPL_ngram,MWS_ngram,EAP_cnn,HPL_cnn,MWS_cnn,EAP_lstm,HPL_lstm,MWS_lstm
0,0.995004,0.001591,0.003404,0.992398,0.000624,0.006978,0.973642,0.022725,0.003634
1,0.025892,0.951124,0.022984,0.658131,0.248983,0.092886,0.041964,0.946103,0.011933
2,0.999130,0.000017,0.000852,0.983048,0.013373,0.003579,0.947245,0.050048,0.002706
3,0.000770,0.000140,0.999090,0.002356,0.000979,0.996664,0.001704,0.000341,0.997956
4,0.000006,0.999993,0.000001,0.619594,0.100870,0.279536,0.047649,0.936279,0.016072
...,...,...,...,...,...,...,...,...,...
19574,0.948939,0.024967,0.026094,0.827273,0.074793,0.097934,0.902995,0.071810,0.025195
19575,0.945483,0.013374,0.041143,0.914230,0.007256,0.078513,0.888047,0.013352,0.098602
19576,0.993636,0.001886,0.004478,0.987036,0.006523,0.006441,0.991580,0.003177,0.005243
19577,0.265845,0.666056,0.068099,0.469771,0.188130,0.342100,0.500659,0.319762,0.179580


In [21]:
# Split the level-two features with the same test_size and random_state as the
# split of the raw sentences.
# NOTE(review): the two splits presumably select the same rows because both
# inputs have one row per entry of `df` - confirm.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_final, y_final, test_size=0.2, random_state=42)


### Ensemble (MLP)

In [22]:
# Level-two MLP (sized for three level-one models): ReLU layer -> dropout -> softmax over 3 authors.
ensemble = tf.keras.Sequential([
    tf.keras.layers.Dense(36, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax'),
])
ensemble.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

# Model containing only an input layer, compiled with the same loss and metric.
# Later cells call its evaluate() on already-computed probability arrays.
pass_same = tf.keras.Sequential([tf.keras.layers.InputLayer()])
pass_same.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

In [23]:
# validation_steps is intentionally not passed.  Keras documents it for
# tf.data validation inputs; with in-memory data it would stop validation
# after 10 batches, so the reported val_* metrics would cover only part of
# the hold-out set.
ensemble.fit(X_train_final, y_train_final, epochs=2, batch_size=128,
             validation_data=(X_test_final, y_test_final))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x230760c51c0>

In [24]:
# Evaluate the level-two MLP and the n-gram model on their held-out data.
# Only the return value of the last line is shown as the cell output.
ensemble.evaluate(X_test_final,y_test_final)
model_ngram.evaluate(X_test,y_test)



[0.39023157954216003, 0.8539325594902039]

In [25]:
# Held-out evaluation results of each level-one model and of the level-two MLP.
ngram_results = model_ngram.evaluate(X_test, y_test)
LSTM_results = LSTM.evaluate(X_test, y_test)
cnn_results = cnn.evaluate(X_test, y_test)

ensemble_results = ensemble.evaluate(X_test_final, y_test_final)

# One column per model; the rows follow the order of the values evaluate() returns.
results_by_model = {
    "ngram": ngram_results,
    "cnn": cnn_results,
    "LSTM": LSTM_results,
    "ensemble": ensemble_results,
}
df_results = pd.DataFrame(results_by_model)



In [26]:
df_results

Unnamed: 0,ngram,cnn,LSTM,ensemble
0,0.390232,0.478633,0.438351,0.380303
1,0.853933,0.808989,0.829673,0.861338


### Average of level one classifiers

In [27]:
# Unweighted mean of the three level-one probability arrays.
all_ave = (cnn_pred+ngram_pred +LSTM_pred )/3
# Split the averaged probabilities with the same test_size and random_state as
# the other splits in this notebook.
X_train_ave, X_test_ave, y_train_ave, y_test_ave = train_test_split(all_ave, y_final, test_size=0.2, random_state=42)



# Loss and accuracy of the averaged probabilities on the held-out rows.
# The labels passed here are y_test_final, not the y_test_ave created above.
# NOTE(review): both come from splitting y_final with identical arguments, so
# they are presumably equal - confirm.
pass_same.evaluate(X_test_ave,y_test_final)



[0.3813556134700775, 0.8539325594902039]

In [28]:
# Add the averaged-probabilities results as a new column of the comparison table
# (this repeats the evaluate() call from the previous cell).
df_results['raw_ave'] = pass_same.evaluate(X_test_ave,y_test_final)
df_results



Unnamed: 0,ngram,cnn,LSTM,ensemble,raw_ave
0,0.390232,0.478633,0.438351,0.380303,0.381356
1,0.853933,0.808989,0.829673,0.861338,0.853933


### Linear Regression of predictions

In [29]:
from sklearn.linear_model import LinearRegression


In [30]:
# Least-squares combiner of the nine level-one probabilities, fitted against the
# one-hot labels with no intercept and with coefficients constrained to be non-negative.
lin_reg = LinearRegression(fit_intercept=False, positive= True).fit(X_train_final, y_train_final)


In [31]:
# score() is scikit-learn's R^2 (coefficient of determination) on the held-out rows.
lin_reg_test_score = lin_reg.score(X_test_final, y_test_final)
print(lin_reg_test_score)


0.6717982644468995


In [32]:
# Raw regression outputs for the held-out rows.  They are not constrained to
# [0, 1]; the displayed output contains values slightly above 1.
lin_reg.predict(X_test_final)


array([[1.03342784e+00, 2.72527873e-03, 3.35339277e-03],
       [3.36418228e-01, 1.34634396e-01, 5.52629422e-01],
       [1.94467509e-01, 3.57983076e-01, 4.67787955e-01],
       ...,
       [6.36424985e-04, 1.01052731e+00, 2.40648124e-04],
       [2.14953871e-02, 9.82143349e-01, 9.24399727e-03],
       [9.85691314e-01, 4.25081793e-02, 5.14714167e-03]])

In [33]:
# Loss and accuracy of the regression outputs against the held-out one-hot labels.
lin_reg_test_pred = lin_reg.predict(X_test_final)
pass_same.evaluate(lin_reg_test_pred, y_test_final)



[0.38899120688438416, 0.8539325594902039]

In [34]:
# Add the linear-regression combiner's results as a new column of the comparison table.
lin_reg_metrics = pass_same.evaluate(lin_reg.predict(X_test_final), y_test_final)
df_results['Linear_reg'] = lin_reg_metrics
df_results



Unnamed: 0,ngram,cnn,LSTM,ensemble,raw_ave,Linear_reg
0,0.390232,0.478633,0.438351,0.380303,0.381356,0.388991
1,0.853933,0.808989,0.829673,0.861338,0.853933,0.853933


### EDA on level-one predictions

In [35]:
# Compare, on the held-out rows, which level-one models misclassify which sentences.
true_class = np.argmax(y_test_final, axis=1)
level_one_probs = X_test_final.values

# Columns 0-2 hold the n-gram probabilities, 3-5 the CNN's, 6-8 the LSTM's.
ngram_wrong = np.argmax(level_one_probs[:, :3], axis=1) != true_class
ind_ngram = np.flatnonzero(ngram_wrong)
cnn_wrong = np.argmax(level_one_probs[:, 3:6], axis=1) != true_class
ind_cnn = np.flatnonzero(cnn_wrong)
lstm_wrong = np.argmax(level_one_probs[:, 6:9], axis=1) != true_class
ind_lstm = np.flatnonzero(lstm_wrong)

# Positions (within the held-out split) each model got wrong, as sets.
ngram_miss = set(ind_ngram)
cnn_miss = set(ind_cnn)
lstm_miss = set(ind_lstm)

all_wrong = cnn_miss & ngram_miss & lstm_miss
cnn_lstm_wrong = cnn_miss & lstm_miss
ngram_lstm_wrong = ngram_miss & lstm_miss
cnn_ngram_wrong = cnn_miss & ngram_miss

print("number all wrong", len(all_wrong))
print("number ngram wrong", len(ind_ngram))

print("number cnn wrong", len(ind_cnn))
print("number lstm wrong", len(ind_lstm))
print('')
print('')

# Exactly one model right: the other two are wrong and the named one is not.
print('ngram right, cnn/lstm wrong', len(cnn_lstm_wrong - ngram_miss))
print('cnn right, ngram/lstm wrong', len(ngram_lstm_wrong - cnn_miss))
print('lstm right, cnn/ngram wrong', len(cnn_ngram_wrong - lstm_miss))
print('')
print('')

# Exactly one model wrong: the named one is wrong and neither of the others is.
print('ngram/cnn right, lstm wrong', len(lstm_miss - (ngram_miss | cnn_miss)))
print('cnn/lstm right, ngram wrong', len(ngram_miss - (lstm_miss | cnn_miss)))
print('ngram/lstm right, cnn wrong', len(cnn_miss - (ngram_miss | lstm_miss)))

# Fraction of held-out sentences that all three models misclassify.
print(len(all_wrong) / len(y_test_final))

number all wrong 301
number ngram wrong 572
number cnn wrong 749
number lstm wrong 667


ngram right, cnn/lstm wrong 164
cnn right, ngram/lstm wrong 57
lstm right, cnn/ngram wrong 64


ngram/cnn right, lstm wrong 145
cnn/lstm right, ngram wrong 150
ngram/lstm right, cnn wrong 220
0.07686414708886619


In [36]:
# Show the held-out sentences that every level-one model misclassified.
# `all_wrong` holds positions within X_test_final and is used here to index X_test.
# NOTE(review): this presumably relies on both splits selecting the same rows
# in the same order - confirm.
X_test.iloc[list(all_wrong)]

10639                               No delay was to ensue.
9504     She had spoken also of the Black Man, of her o...
14161    There are surely other worlds than this other ...
11277    His chief amusements were gunning and fishing,...
19231    I was not so sanguine as she as to the result ...
                               ...                        
1227     Suddenly these manifestations they ceased, and...
10509    Close by is a tomb, once beautiful with the su...
2325     I had gazed with wonder, mixed with fear and e...
8658     He evidently feared the physical effect of vio...
1664     Were not the mightiest men of the olden times ...
Name: text, Length: 301, dtype: object