In [43]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [44]:
# Load the labelled training sentences; the preview below shows the
# `id`, `text` and `author` columns.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [45]:
# Model input: the raw sentence text.
X = df["text"].copy()

authors = df["author"].copy()

# Label data: one-hot rows whose columns are ordered (EAP, HPL, MWS).
# pd.Categorical assigns code -1 to anything outside the listed categories
# (including missing values). Such rows are rejected explicitly, because
# silently skipping them would leave `y` shorter than, and misaligned with, `X`.
AUTHOR_CLASSES = ["EAP", "HPL", "MWS"]
author_codes = pd.Categorical(authors, categories=AUTHOR_CLASSES).codes
if (author_codes < 0).any():
    unexpected = sorted(set(authors[author_codes < 0].astype(str)))
    raise ValueError(f"Unexpected author label(s): {unexpected}")

# Row i of the identity matrix is the one-hot vector for class i.
y = np.eye(len(AUTHOR_CLASSES), dtype=int)[author_codes]

In [46]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [47]:
# Text-to-token-id layer with default settings; its vocabulary is learned
# from the training split only.
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(data=X_train)

In [48]:
class CNN1d(tf.keras.Model):
    """Text classifier built from two parallel 1-D convolutions.

    The forward pass runs the input through ``encoder``, embeds the result
    into 64-dimensional vectors, applies two convolution branches to the same
    embedding, concatenates the branch outputs along axis 2, max-pools
    globally, and finishes with a ReLU dense layer and a 3-unit softmax.

    Args:
        conv1_filters: Number of filters in the first convolution branch.
        conv1_size: Kernel size of the first convolution branch.
        conv2_filters: Number of filters in the second convolution branch.
        conv2_size: Kernel size of the second convolution branch.
        dense1: Number of units in the hidden dense layer.
        encoder: Layer applied to the raw input before the embedding; it must
            expose ``get_vocabulary()``, which sizes the embedding table.
    """

    def __init__(self, conv1_filters, conv1_size, conv2_filters, conv2_size, dense1, encoder):
        super().__init__()

        self.encoder = encoder

        # One embedding row per entry of the encoder's vocabulary.
        vocabulary_size = len(encoder.get_vocabulary())
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=64,
            mask_zero=True,
        )

        # Both branches share everything except filter count and kernel size.
        shared_conv_options = dict(
            padding="same",
            activation="relu",
            data_format="channels_last",
        )
        self.conv1 = tf.keras.layers.Conv1D(
            filters=conv1_filters,
            kernel_size=conv1_size,
            **shared_conv_options,
        )
        self.conv2 = tf.keras.layers.Conv1D(
            filters=conv2_filters,
            kernel_size=conv2_size,
            **shared_conv_options,
        )

        self.global_pool = tf.keras.layers.GlobalMaxPool1D(keepdims=False)
        self.dense1 = tf.keras.layers.Dense(dense1, activation="relu")
        self.dense2 = tf.keras.layers.Dense(3, activation="softmax")

    def call(self, x, training=False):
        """Return the 3-way softmax output for the batch ``x``.

        ``training`` is accepted for Keras API compatibility and is not used
        by any layer call here.
        """
        token_ids = self.encoder(x)
        embedded = self.embedding(token_ids)

        # Both branches read the same embedding; their outputs are joined
        # along axis 2.
        branch_outputs = [self.conv1(embedded), self.conv2(embedded)]
        merged = tf.concat(branch_outputs, axis=2)

        pooled = self.global_pool(merged)
        hidden = self.dense1(pooled)
        return self.dense2(hidden)

In [49]:
def create_model(conv1_filters, conv1_size, conv2_filters, conv2_size, dense1):
    """Build a ``CNN1d`` around the notebook-level ``encoder`` and compile it.

    Args:
        conv1_filters: Filter count of the first convolution branch.
        conv1_size: Kernel size of the first convolution branch.
        conv2_filters: Filter count of the second convolution branch.
        conv2_size: Kernel size of the second convolution branch.
        dense1: Number of units in the hidden dense layer.

    Returns:
        The model, compiled with categorical cross-entropy loss, Adam at a
        learning rate of 1e-3, and an accuracy metric.
    """
    network = CNN1d(
        conv1_filters=conv1_filters,
        conv1_size=conv1_size,
        conv2_filters=conv2_filters,
        conv2_size=conv2_size,
        dense1=dense1,
        encoder=encoder,
    )

    compile_options = {
        "loss": tf.keras.losses.CategoricalCrossentropy(),
        "optimizer": tf.keras.optimizers.Adam(1e-3),
        "metrics": ["accuracy"],
    }
    network.compile(**compile_options)
    return network

In [50]:
# Stop training once validation loss has failed to improve for 3 epochs, and
# write the weights of the epoch with the highest validation accuracy to
# "CNN_weights".
# NOTE(review): the checkpoint is written but never loaded back in the cells
# visible here, so `cnn` keeps the last-epoch weights. Confirm that is intended.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
    tf.keras.callbacks.ModelCheckpoint(
        filepath="CNN_weights",
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True),
]
with tf.device('/device:GPU:0'):
    cnn = create_model(conv1_filters=128, conv1_size=6,
                       conv2_filters=128, conv2_size=5,
                       dense1=128)
    # `validation_steps` is deliberately not passed. The validation data is an
    # in-memory array, and a step limit would evaluate only the first few
    # batches, so early stopping and checkpointing would follow a fraction of
    # the hold-out set instead of all of it.
    history = cnn.fit(X_train, y_train, epochs=15,
                      validation_data=(X_test, y_test),
                      callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


In [51]:
# TF-IDF n-gram vectoriser (ngrams=2) capped at `max_features` vocabulary entries.
max_features = 1000000
Vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='tf_idf', ngrams=2)
with tf.device('/device:CPU:0'):
    # Fit the vocabulary and IDF weights on the training split only, as is
    # done for `encoder`, so that held-out sentences do not leak into the
    # features of the model they are later used to validate.
    Vectorizer.adapt(X_train)
vocab = Vectorizer.get_vocabulary()

In [52]:
# N-gram classifier: TF-IDF features -> Dense(25, relu) -> Dropout(0.2) -> 3-way softmax.
model = tf.keras.Sequential([
    Vectorizer,
    tf.keras.layers.Dense(25, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation="softmax"),
])

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=["accuracy"],
)

In [53]:
# Train the n-gram model for one epoch. `validation_steps` is intentionally
# omitted: with in-memory validation data a step limit would score only the
# first few batches rather than the whole hold-out split.
model.fit(X_train, y_train, epochs=1, batch_size=64,
          validation_data=(X_test, y_test))

# Class probabilities for every sentence in the full data frame.
pred = model.predict(df['text'])



In [54]:
# Per-author probabilities from both base models for every sentence in `df`;
# these become the inputs of the ensemble table built next.
cnn_pred = cnn.predict(x=df["text"])
ngram_pred = model.predict(x=df["text"])

In [57]:
# Collect each base model's three probability columns next to the sentence id
# and the true author. Columns of the prediction arrays are taken in the
# order EAP, HPL, MWS.
ngram_names = ("EAP_ngram", "HPL_ngram", "MWS_ngram")
cnn_names = ("EAP_cnn", "HPL_cnn", "MWS_cnn")

ensemble_columns = {"id": df["id"]}
# Iterating over the transpose yields one prediction column per name.
ensemble_columns.update(zip(ngram_names, ngram_pred.T))
ensemble_columns.update(zip(cnn_names, cnn_pred.T))
ensemble_columns["actual_author"] = df["author"]

ensemble_df = pd.DataFrame(ensemble_columns)
ensemble_df

Unnamed: 0,id,EAP_ngram,HPL_ngram,MWS_ngram,EAP_cnn,HPL_cnn,MWS_cnn,actual_author
0,id26305,0.997641,0.001860,0.000499,9.999982e-01,1.628095e-06,1.062047e-07,EAP
1,id17569,0.080363,0.888638,0.030999,2.242347e-04,9.995772e-01,1.985183e-04,HPL
2,id11008,0.998898,0.001085,0.000017,9.998996e-01,1.003015e-04,1.073481e-07,EAP
3,id27763,0.002344,0.001729,0.995927,1.198767e-08,1.155771e-08,1.000000e+00,MWS
4,id12958,0.001371,0.997999,0.000630,3.885653e-05,9.999548e-01,6.358903e-06,HPL
...,...,...,...,...,...,...,...,...
19574,id17718,0.986627,0.010192,0.003181,9.999247e-01,7.504811e-05,2.222216e-07,EAP
19575,id08973,0.922833,0.053497,0.023670,9.999982e-01,4.381722e-08,1.829797e-06,EAP
19576,id05267,0.999413,0.000335,0.000251,9.999993e-01,2.906222e-07,5.317214e-07,EAP
19577,id17513,0.450641,0.440184,0.109175,9.929609e-01,5.807552e-03,1.231570e-03,EAP


In [74]:
# Ensemble inputs: the n-gram model's probability for each of the three
# authors. Selecting 'EAP_ngram' three times would feed the ensemble a single
# duplicated feature and drop the HPL and MWS columns.
# The CNN probability columns ('EAP_cnn', 'HPL_cnn', 'MWS_cnn') are not
# included here.
X_final = ensemble_df[['EAP_ngram', 'HPL_ngram', 'MWS_ngram']].copy()
authors_final = ensemble_df["actual_author"].copy()

# Label data: one-hot rows whose columns are ordered (EAP, HPL, MWS).
# pd.Categorical gives code -1 to any value outside the listed categories.
# Such rows are rejected explicitly, because silently skipping them would
# leave `y_final` misaligned with `X_final`.
author_codes_final = pd.Categorical(
    authors_final, categories=["EAP", "HPL", "MWS"]
).codes
if (author_codes_final < 0).any():
    unexpected_final = sorted(set(authors_final[author_codes_final < 0].astype(str)))
    raise ValueError(f"Unexpected author label(s): {unexpected_final}")

# Row i of the identity matrix is the one-hot vector for class i.
y_final = np.eye(3, dtype=int)[author_codes_final]

In [75]:
# 80/20 split of the ensemble features and labels, with a fixed seed.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [76]:
# Ensemble classifier over the base-model probabilities:
# Dense(25, relu) -> Dropout(0.2) -> 3-way softmax.
ensemble = tf.keras.Sequential([
    tf.keras.layers.Dense(25, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation="softmax"),
])

ensemble.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=["accuracy"],
)

In [83]:
# Train the ensemble for one epoch. `validation_steps` is intentionally
# omitted: with in-memory validation data a step limit would score only the
# first few batches rather than the whole hold-out split.
ensemble.fit(X_train_final, y_train_final, epochs=1, batch_size=64,
             validation_data=(X_test_final, y_test_final))



<keras.callbacks.History at 0x7f58fc901e50>