In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict,KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
import scipy

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time

GOOGLE_PATH_AFTER_MYDRIVE = 'Data'
GOOGLE_DRIVE_PATH = os.path.join('drive','My Drive', GOOGLE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

# Append the directory path of this notebook to what python easily "sees"
sys.path.append(GOOGLE_DRIVE_PATH)

# Make your current working direct
GOOGLE_DRIVE_PATH

['sample_submission.csv', 'test.csv', 'train.csv', 'submission.csv']


'drive/My Drive/Data'

In [4]:
df = pd.read_csv('drive/My Drive/Data/train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:

X = df["text"].copy()
#X = df["text"]

authors = df["author"].copy()

# Label data
y = []
for author in authors:
    if author == "EAP":
        y.append([1, 0, 0])
    if author == "HPL":
        y.append([0, 1, 0])
    if author == "MWS":
        y.append([0, 0, 1])

y = np.array(y)

y_one_vector = []
for author in authors:
    if author == "EAP":
        y_one_vector.append(0)
    if author == "HPL":
        y_one_vector.append(1)
    if author == "MWS":
        y_one_vector.append(2)

y_one_vector = np.array(y_one_vector)

In [6]:
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(X)

max_features = 1000000
Vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='tf_idf', ngrams=2)
with tf.device('/device:CPU:0'):
  Vectorizer.adapt(X)

vocab = encoder.get_vocabulary()

In [7]:
def convert_sparce(sparse_tensor):
  
  row  = np.array(sparse_tensor.indices[:,0])
  col  = np.array(sparse_tensor.indices[:,1])
  data = np.array(sparse_tensor.values)
  out = scipy.sparse.coo_matrix((data, (row, col)), shape=(sparse_tensor.shape.as_list()))

  return out

In [8]:
# TRANSFORMER
from tensorflow import keras
from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)



    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim, mask_zero=True)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [9]:
class CNN1d(tf.keras.Model):
    def __init__(self, conv1_filters, conv1_size, conv2_filters, conv2_size, dense1, encoder):
        super(CNN1d, self).__init__()

        self.encoder = encoder

        vocab = encoder.get_vocabulary()
        
        self.embedding = tf.keras.layers.Embedding(input_dim=len(vocab),output_dim=64,mask_zero=True)
        

        self.conv1 = tf.keras.layers.Conv1D(filters=conv1_filters,
                            kernel_size=conv1_size,
                            padding="same",
                            activation="relu",
                            data_format="channels_last",
                            )
        self.conv2 = tf.keras.layers.Conv1D(filters=conv2_filters,
                            kernel_size=conv2_size,
                            padding="same",
                            activation="relu",
                            data_format="channels_last",
                            )
        self.global_pool = tf.keras.layers.GlobalMaxPool1D(keepdims=False)
        self.dense1 = tf.keras.layers.Dense(dense1, activation='relu')
        self.dense2 = tf.keras.layers.Dense(3, activation="softmax")

    def call(self, x, training=False):
        emb = self.encoder(x)
        emb = self.embedding(emb)
        conv1 = self.conv1(emb)
        conv2 = self.conv2(emb)
        z = tf.concat([conv1, conv2], axis=2)
        z = self.global_pool(z)
        z = self.dense1(z)
        z = self.dense2(z)
        return z

In [10]:
def create_cnn(conv1_filters, conv1_size, conv2_filters, conv2_size, dense1):
    model = CNN1d(conv1_filters, conv1_size, conv2_filters, conv2_size, dense1, encoder)
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=['accuracy']
    )
    return model

def create_ngram():
    model_ngram = tf.keras.Sequential()
    model_ngram.add(Vectorizer)
      
    model_ngram.add(tf.keras.layers.Dense(25, activation='relu'))
    model_ngram.add(tf.keras.layers.Dropout(0.2))
      
    model_ngram.add(tf.keras.layers.Dense(3, activation='softmax'))
      
    model_ngram.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(1e-3),
                metrics=['accuracy'])
    return model_ngram

def create_lstm():
    LSTM = tf.keras.Sequential()
    LSTM.add(encoder)
    LSTM.add(tf.keras.layers.Embedding(input_dim=len(vocab),output_dim=64,mask_zero=True))
      
    LSTM.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,dropout=0.2,return_sequences=True)))
    LSTM.add(tf.keras.layers.GlobalMaxPool1D())

    LSTM.add(tf.keras.layers.Dropout(0.2))
      
    LSTM.add(tf.keras.layers.Dense(3, activation='softmax'))
      
    LSTM.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(1e-3),
                metrics=['accuracy'])
    
    return LSTM

def create_ensemble():
    ensemble = tf.keras.Sequential()
    # for 3 model
    ensemble.add(tf.keras.layers.Dense(36, activation='relu'))
    ensemble.add(tf.keras.layers.Dropout(0.2))

    ensemble.add(tf.keras.layers.Dense(3, activation='softmax'))
    #ensemble.add(tf.keras.layers.InputLayer())

    ensemble.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(1e-3),
                metrics=['accuracy'])

    return ensemble

def create_transformer():
  sequence_length = 100
  max_features = 1000000
  # Token locations
  Vectorizer_transformer = tf.keras.layers.TextVectorization(max_tokens=max_features,output_sequence_length=sequence_length) 
  Vectorizer_transformer.adapt(X)
  vocab = Vectorizer_transformer.get_vocabulary()
  vocab_size = len(vocab)


  embed_dim =32  # Embedding size for each token
  num_heads =2  # Number of attention heads
  ff_dim = 32  # Hidden layer size in feed forward network inside transformer
  maxlen = sequence_length
  dropout_rate = 0.3 # Dropout rate of feed forward network 

  ## Build embedding and transformer
  embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
  transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim,dropout_rate)

  ## Connect Keras Layers
  inputs = tf.keras.Input(shape=(1,), dtype=tf.string) 
  vec = Vectorizer_transformer(inputs)
  x = embedding_layer(vec)
  x = transformer_block(x)
  x = layers.GlobalAveragePooling1D()(x)
  outputs = layers.Dense(3, activation="softmax")(x)

  transformer = keras.Model(inputs=inputs, outputs=outputs) ##Final Model

  transformer.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(1e-3),
                metrics=['accuracy'])
  return transformer

def create_hybrid(conv_filters, conv_size, lstm_units, dense_units):
  model = tf.keras.Sequential([
      encoder,
      tf.keras.layers.Embedding(
          input_dim=len(vocab),
          output_dim=64,
          # Use masking to handle the variable sequence lengths
          mask_zero=True),
      tf.keras.layers.Conv1D(filters=conv_filters,
                              kernel_size=conv_size,
                              padding="same",
                              activation="relu",
                              data_format="channels_last",
                              ),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=False)),
      # tf.keras.layers.GlobalMaxPool1D(keepdims=False),
      tf.keras.layers.Dense(dense_units, activation='relu'),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(3, activation="softmax")
  ])
  model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(1e-3),
            metrics=['accuracy']
  )
  return model

def create_multinb():
  return MultinomialNB(alpha=1.5)


In [11]:
max_features = 1000000
tfidf_vec = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='tf_idf', sparse=True, ngrams=2)

with tf.device('/device:CPU:0'):
  tfidf_vec.adapt(X)

tdidf = tf.keras.Sequential([
    tfidf_vec])

In [12]:
df = pd.DataFrame(columns = ['model', 'average', 'logloss'])
df

Unnamed: 0,model,average,logloss


In [13]:
count_vec = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode='count', sparse=True, ngrams=1)
count_vec.adapt(X)
count = tf.keras.Sequential([count_vec])


kf = KFold(n_splits=3)

for train_index, test_index in kf.split(X[:10]):

  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y[train_index]
  y_test = y[test_index]


  x_train_sparce = tdidf.predict(X_train)
  x_test_sparce = tdidf.predict(X_test)
  x_train_sparce_count = count.predict(X_train)
  x_test_sparce_count = count.predict(X_test)

  train_tdift_data = convert_sparce(x_train_sparce)
  test_tdift_data = convert_sparce(x_test_sparce)
  train_count_data = convert_sparce(x_train_sparce_count)
  test_count_data = convert_sparce(x_test_sparce_count)



  cnn = create_cnn(128, 6, 128, 5, 128)
  ngram = create_ngram()
  LSTM = create_lstm()
  transformer = create_transformer()
  hybrid = create_hybrid(64, 5, 64, 64)
  multi_nb = create_multinb()
  ensemble = create_ensemble()
  ensemble_with_tdidf = create_ensemble()

  cnn.fit(X_train, y_train, epochs=1 )

  ngram.fit(X_train, y_train, epochs=1, batch_size=64)
  
  LSTM.fit(X_train, y_train, epochs=2, batch_size=64)

  transformer.fit(X_train, y_train, epochs=1)

  hybrid.fit(X_train, y_train, epochs=1)

  multi_nb.fit(train_count_data, np.argmax(y_train, axis =1))
  

  cnn_pred = cnn.predict(X_train)
  ngram_pred = ngram.predict(X_train)
  LSTM_pred = LSTM.predict(X_train)
  transformer_pred = transformer.predict(X_train)
  hybrid_pred = hybrid.predict(X_train)
  multi_nb_pred = multi_nb.predict_proba(train_count_data)


  cnn_pred_test = cnn.predict(X_test)
  ngram_pred_test = ngram.predict(X_test)
  LSTM_pred_test = LSTM.predict(X_test)
  transformer_pred_test = transformer.predict(X_test)
  hybrid_pred_test = hybrid.predict(X_test)
  multi_nb_pred_test = multi_nb.predict_proba(test_count_data)



  X_train_ens = np.hstack([ngram_pred,cnn_pred,LSTM_pred, transformer_pred, hybrid_pred,multi_nb_pred])
  X_test_ens = np.hstack([ngram_pred_test,cnn_pred_test,LSTM_pred_test, transformer_pred_test, hybrid_pred_test,multi_nb_pred_test])


  X_train_final_tensor = tf.sparse.from_dense(X_train_ens)
  X_test_final_tensor = tf.sparse.from_dense(X_test_ens)
  X_train_concat_tensor = tf.sparse.concat(1,[x_train_sparce, X_train_final_tensor])
  X_test_concat_tensor = tf.sparse.concat(1,[x_test_sparce, X_test_final_tensor])

  multi_nb_logloss = log_loss(y_test,multi_nb_pred)
  multi_nb_acc = np.sum(np.argmax(y_test, axis = 1) == multi_nb.predict(test_count_data))/len(np.argmax(y_test, axis = 1))


  ensemble.fit(X_train_ens, y_train, epochs=2, batch_size=128)
  ensemble_with_tdidf.fit(X_train_concat_tensor, y_train, epochs=1, batch_size=256)

  ngram_results = ngram.evaluate(X_test,y_test)
  LSTM_results =LSTM.evaluate(X_test,y_test)
  cnn_results =cnn.evaluate(X_test,y_test)
  transformer_results = transformer.evaluate(X_test, y_test)
  hybrid_results = hybrid.evaluate(X_test, y_test)
  ensemble_results =ensemble.evaluate(X_test_ens,y_test)
  ensemble_with_tdidf_results =ensemble_with_tdidf.evaluate(X_test_concat_tensor,y_test)



  lin_reg = LinearRegression(fit_intercept=False, positive= True)
  lin_reg.fit(X_train_ens, y_train)
  linreg_logloss = log_loss(y_test,lin_reg.predict(X_test_ens))
  linreg_acc = np.sum(np.argmax(y_test, axis = 1) == np.argmax(lin_reg.predict(X_test_ens), axis = 1))/len(np.argmax(y_test, axis = 1))


  lin_reg2 = LinearRegression(fit_intercept=False, positive= True)
  lin_reg2.fit(np.hstack((X_train_ens,ensemble_with_tdidf.predict(X_train_concat_tensor))), y_train)
  linreg2_logloss = log_loss(y_test,lin_reg2.predict(np.hstack((X_test_ens,ensemble_with_tdidf.predict(X_test_concat_tensor)
  ))))
  linreg2_acc = np.sum(np.argmax(y_test, axis = 1) == np.argmax(lin_reg2.predict(np.hstack((X_test_ens,ensemble_with_tdidf.predict(X_test_concat_tensor)
  ))), axis = 1))/len(np.argmax(y_test, axis = 1))


  df_results = pd.DataFrame({"model":['ngram', 'cnn', 'LSTM','transformer','hybrid','multi_nb','ensemble','ensemble_tdidf','lin_reg_1','lin_reg_2'],\
                  "average":[ngram_results[1],cnn_results[1],LSTM_results[1],transformer_results[1],hybrid_results[1],multi_nb_acc,ensemble_results[1],ensemble_with_tdidf_results[1],\
                             linreg_acc, linreg2_acc],\
                  "logloss":[ngram_results[0],cnn_results[0],LSTM_results[0],transformer_results[0],hybrid_results[0],multi_nb_logloss,ensemble_results[0],ensemble_with_tdidf_results[0],\
                             linreg_logloss, linreg2_logloss]})

  df = df.append(df_results)


Epoch 1/2
Epoch 2/2


InvalidArgumentError: ignored

In [None]:
df.groupby('model').describe()
df.iloc[27:,:].groupby('model').describe()

df = df.iloc[27:,:]

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

df.boxplot('average',by = 'model', ax=ax[0])
df.boxplot('logloss',by = 'model', ax=ax[1])


plt.show()

In [None]:
df.groupby('model').describe()


Unnamed: 0_level_0,average,average,average,average,average,average,average,average,logloss,logloss,logloss,logloss,logloss,logloss,logloss,logloss
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
LSTM,3.0,0.823025,0.003713,0.819645,0.821037,0.82243,0.824715,0.827,3.0,0.452325,0.012253,0.439696,0.446407,0.453117,0.45864,0.464163
cnn,3.0,0.807703,0.007148,0.799448,0.805639,0.81183,0.81183,0.81183,3.0,0.484335,0.009327,0.476032,0.479289,0.482545,0.488486,0.494427
ensemble,3.0,0.854385,0.004389,0.849395,0.852754,0.856114,0.85688,0.857646,3.0,0.392223,0.006302,0.385072,0.389853,0.394634,0.395799,0.396964
ensemble_tdidf,3.0,0.85985,0.001299,0.858872,0.859113,0.859353,0.860339,0.861324,3.0,0.365963,0.004685,0.360645,0.364203,0.367761,0.368622,0.369483
hybrid,3.0,0.794014,0.007914,0.78578,0.790239,0.794699,0.798131,0.801563,3.0,0.514053,0.016528,0.504109,0.504513,0.504918,0.519025,0.533131
lin_reg_1,3.0,0.845242,0.002867,0.842936,0.843637,0.844339,0.846396,0.848452,3.0,0.400576,0.002782,0.397578,0.399327,0.401075,0.402075,0.403075
lin_reg_2,3.0,0.860514,0.001319,0.859332,0.859802,0.860273,0.861105,0.861937,3.0,0.36338,0.004285,0.358788,0.361433,0.364078,0.365675,0.367273
ngram,3.0,0.844936,0.003125,0.842476,0.843178,0.843879,0.846166,0.848452,3.0,0.40095,0.002975,0.397576,0.399827,0.402078,0.402637,0.403195
transformer,3.0,0.81092,0.006725,0.803555,0.808013,0.812471,0.814602,0.816733,3.0,0.477939,0.015441,0.463871,0.469678,0.475486,0.484973,0.49446
