In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict,KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC,SVC
import scipy
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.naive_bayes import ComplementNB

from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords

In [None]:
# Colab-only Google Drive mount. It is wrapped in a string literal, so nothing
# here is executed; the cell merely evaluates to the string itself.
'''from google.colab import drive
drive.mount('/content/drive')'''


In [None]:
# Colab-only data-path setup, also disabled by being wrapped in a string
# literal. The local path configuration further down is what actually runs.
'''import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time

GOOGLE_PATH_AFTER_MYDRIVE = 'NLP_Textcat/spooky_data/train'
GOOGLE_DRIVE_PATH = os.path.join('drive','My Drive', GOOGLE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

# Append the directory path of this notebook to what python easily "sees"
sys.path.append(GOOGLE_DRIVE_PATH)

# Make your current working direct
GOOGLE_DRIVE_PATH'''

In [None]:
import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time

# Folder that holds train.csv and test.csv.
# The default is the original author's local checkout; set the
# MDST_NLP_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
path_to_datafolder = os.environ.get(
    'MDST_NLP_DATA_DIR',
    'C:/Users/mjdom/source/repos/mdst_nlp_2021/data',
)
print(os.listdir(path_to_datafolder))

In [None]:

# Read both CSV files from the configured data folder:
#   df        <- train.csv
#   df_kaggle <- test.csv
train_csv_path = f"{path_to_datafolder}/train.csv"
kaggle_csv_path = f"{path_to_datafolder}/test.csv"

df = pd.read_csv(train_csv_path)
df_kaggle = pd.read_csv(kaggle_csv_path)

# Preview of the test.csv frame (last expression -> rendered as cell output).
df_kaggle.head()



In [None]:

# Raw sentences and their author codes, copied so later edits cannot touch `df`.
X = df["text"].copy()
authors = df["author"].copy()

# Author code -> integer class id. The id doubles as the index of the hot
# column in the one-hot label matrix `y`.
AUTHOR_TO_CLASS = {"EAP": 0, "HPL": 1, "MWS": 2}

author_ids = authors.map(AUTHOR_TO_CLASS)

# An unmapped author would otherwise be dropped silently, leaving the label
# arrays shorter than (and misaligned with) X. Fail loudly instead.
unknown_author = author_ids.isna()
if unknown_author.any():
    raise ValueError(
        "Unexpected author label(s): %s" % list(authors[unknown_author].unique())
    )

# Integer class labels, shape (n_samples,).
y_one_vector = author_ids.to_numpy(dtype=int)

# One-hot labels, shape (n_samples, 3): row i of the identity matrix is the
# one-hot vector for class i.
y = np.eye(len(AUTHOR_TO_CLASS), dtype=int)[y_one_vector]

In [None]:
# Vocabulary cap shared by the n-gram vectorizers below.
max_features = 1000000

# Default-configured vectorizer, adapted on the train and test sentences
# together. It is the tokenizer reused by the sequence models.
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(np.hstack((X, df_kaggle['text'])))

# tf-idf over 1- to 3-grams (dense output).
Vectorizer = tf.keras.layers.TextVectorization(
    ngrams=3,
    output_mode='tf_idf',
    max_tokens=max_features,
)
# Sparse unigram counts.
count_vec = tf.keras.layers.TextVectorization(
    ngrams=1,
    sparse=True,
    output_mode='count',
    max_tokens=max_features,
)

# Adaptation of the two n-gram vectorizers is pinned to the CPU device.
with tf.device('/device:CPU:0'):
    for text_layer in (Vectorizer, count_vec):
        text_layer.adapt(np.hstack((X, df_kaggle['text'])))

vocab = encoder.get_vocabulary()
len(vocab)


In [None]:
class CNN1d(tf.keras.Model):
    """Text classifier with two parallel Conv1D branches over token embeddings.

    The input is passed through `encoder`, embedded into 128-dimensional
    vectors, convolved by two independent Conv1D layers whose outputs are
    concatenated on the channel axis, max-pooled over the sequence and mapped
    to a 3-way softmax.

    Args:
        conv1_filters: number of filters of the first convolution branch.
        conv1_size: kernel size of the first convolution branch.
        conv2_filters: number of filters of the second convolution branch.
        conv2_size: kernel size of the second convolution branch.
        encoder: layer applied to the raw input before the embedding; it must
            expose `get_vocabulary()`, whose length sizes the embedding table.
    """

    def __init__(self, conv1_filters, conv1_size, conv2_filters, conv2_size, encoder):
        super().__init__()

        self.encoder = encoder

        vocabulary_size = len(encoder.get_vocabulary())
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=128,
            mask_zero=True,
        )

        # Settings common to both convolution branches.
        shared_conv_args = dict(
            padding="same",
            activation="relu",
            data_format="channels_last",
        )
        self.conv1 = tf.keras.layers.Conv1D(
            filters=conv1_filters, kernel_size=conv1_size, **shared_conv_args
        )
        self.conv2 = tf.keras.layers.Conv1D(
            filters=conv2_filters, kernel_size=conv2_size, **shared_conv_args
        )

        self.global_pool = tf.keras.layers.GlobalMaxPool1D(keepdims=False)
        self.dense2 = tf.keras.layers.Dense(3, activation="softmax")

    def call(self, x, training=False):
        """Return the 3-class softmax output for the batch `x`.

        `training` is accepted for Keras compatibility but is not used here.
        """
        token_ids = self.encoder(x)
        embedded = self.embedding(token_ids)

        # Both branches read the same embeddings; "same" padding keeps their
        # sequence lengths equal so they can be joined on the channel axis.
        branch_outputs = [self.conv1(embedded), self.conv2(embedded)]
        merged = tf.concat(branch_outputs, axis=2)

        pooled = self.global_pool(merged)
        return self.dense2(pooled)

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-blocks is followed by dropout, a residual addition and
    layer normalisation.

    Args:
        embed_dim: width of the feed-forward output layer; also passed as
            `key_dim` to the attention layer.
        num_heads: number of attention heads.
        ff_dim: width of the hidden (ReLU) feed-forward layer.
        rate: dropout rate used after attention and after the feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to `inputs`; `training` toggles both dropouts."""
        # Self-attention: the inputs serve as both query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)

        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)



class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        maxlen: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: output width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
        )
        self.pos_emb = tf.keras.layers.Embedding(
            input_dim=maxlen, output_dim=embed_dim, mask_zero=True
        )

    def call(self, x):
        """Embed the token ids `x` and add one position vector per time step."""
        # Positions 0 .. L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)

        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def create_model(conv1_filters, conv1_size, conv2_filters, conv2_size):
    """Build and compile a `CNN1d` that uses the notebook-level `encoder`.

    Args:
        conv1_filters, conv1_size: filters / kernel size of the first branch.
        conv2_filters, conv2_size: filters / kernel size of the second branch.

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    cnn_model = CNN1d(conv1_filters, conv1_size, conv2_filters, conv2_size, encoder)
    cnn_model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return cnn_model

def create_ngram():
    """Build and compile a dense classifier on top of the global `Vectorizer`.

    Layers: `Vectorizer` -> Dense(128, sigmoid) -> Dropout(0.5) -> Dense(3, softmax).

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    model_ngram = tf.keras.Sequential([
        Vectorizer,
        tf.keras.layers.Dense(128, activation='sigmoid'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(3, activation='softmax'),
    ])

    model_ngram.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return model_ngram

def create_lstm():
    """Build and compile a bidirectional-LSTM classifier.

    Uses the notebook-level `encoder` and `vocab`. Layers:
    encoder -> Embedding(256) -> BiLSTM(64, dropout 0.5, full sequence)
    -> global max pool -> Dropout(0.1) -> Dense(3, softmax).

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    recurrent = tf.keras.layers.LSTM(64, dropout=0.5, return_sequences=True)
    lstm_model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=256, mask_zero=True),
        tf.keras.layers.Bidirectional(recurrent),
        tf.keras.layers.GlobalMaxPool1D(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(3, activation='softmax'),
    ])

    lstm_model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return lstm_model

def create_ensemble():
    """Build and compile the small dense stacking network.

    Layers: Dense(36, sigmoid) -> Dropout(0.2) -> Dense(3, softmax). No input
    layer is declared, so the input width is fixed when the model is first fed.

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    stacker = tf.keras.Sequential([
        tf.keras.layers.Dense(36, activation='sigmoid'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(3, activation='softmax'),
    ])

    stacker.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return stacker

def convert_sparce(sparse_tensor):
    """Convert a 2-D sparse tensor into a `scipy.sparse.coo_matrix`.

    Args:
        sparse_tensor: object exposing `indices` (one [row, col] pair per
            stored value), `values`, and `shape.as_list()`.

    Returns:
        A COO matrix with the same shape and stored entries.
    """
    coords = sparse_tensor.indices
    row_ids = np.array(coords[:, 0])
    col_ids = np.array(coords[:, 1])
    stored = np.array(sparse_tensor.values)
    dims = sparse_tensor.shape.as_list()
    return scipy.sparse.coo_matrix((stored, (row_ids, col_ids)), shape=dims)

def create_transformer():
    """Build and compile a single-block transformer classifier on raw strings.

    A fresh `TextVectorization` layer (fixed output length of 100 tokens) is
    created and adapted on the train + test sentences on every call, so each
    call pays the adaptation cost again.

    Returns:
        The compiled functional model (categorical cross-entropy, Adam at
        1e-3, accuracy).
    """
    # --- hyper-parameters ---------------------------------------------------
    sequence_length = 100     # output_sequence_length of the vectorizer
    max_features = 1000000    # vocabulary cap
    embed_dim = 32            # Embedding size for each token
    num_heads = 1             # Number of attention heads
    ff_dim = 128              # Hidden layer size in the feed-forward network
    dropout_rate = 0.2        # Dropout rate inside the transformer block

    # --- tokenizer ----------------------------------------------------------
    Vectorizer_transformer = tf.keras.layers.TextVectorization(
        max_tokens=max_features,
        output_sequence_length=sequence_length,
    )
    Vectorizer_transformer.adapt(np.hstack((X, df_kaggle['text'])))
    vocab_size = len(Vectorizer_transformer.get_vocabulary())

    # --- embedding and transformer layers ------------------------------------
    embedding_layer = TokenAndPositionEmbedding(sequence_length, vocab_size, embed_dim)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)

    # --- functional graph: string -> token ids -> transformer -> softmax -----
    inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
    hidden = Vectorizer_transformer(inputs)
    hidden = embedding_layer(hidden)
    hidden = transformer_block(hidden)
    hidden = layers.GlobalAveragePooling1D()(hidden)
    outputs = layers.Dense(3, activation="softmax")(hidden)

    transformer = keras.Model(inputs=inputs, outputs=outputs)
    transformer.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return transformer

def create_hybrid(conv_filters, conv_size, lstm_units):
    """Build and compile a Conv1D -> BiLSTM classifier.

    Uses the notebook-level `encoder` and `vocab`. Layers:
    encoder -> Embedding(128) -> Conv1D -> Dropout(0.3) -> BiLSTM (full
    sequence) -> global max pool -> Dropout(0.3) -> Dense(3, softmax).

    Args:
        conv_filters: number of Conv1D filters.
        conv_size: Conv1D kernel size.
        lstm_units: units per LSTM direction.

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    model = tf.keras.Sequential()
    model.add(encoder)
    # mask_zero=True: use masking to handle the variable sequence lengths.
    model.add(tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=128, mask_zero=True))
    model.add(tf.keras.layers.Conv1D(
        filters=conv_filters,
        kernel_size=conv_size,
        padding="same",
        activation="relu",
        data_format="channels_last",
    ))
    model.add(tf.keras.layers.Dropout(0.3))
    model.add(tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, return_sequences=True)
    ))
    model.add(tf.keras.layers.GlobalMaxPool1D(keepdims=False))
    model.add(tf.keras.layers.Dropout(0.3))
    model.add(tf.keras.layers.Dense(3, activation="softmax"))

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

def create_hybrid2(conv_filters, conv_size, lstm_units, dense_units):
    """Build and compile a BiLSTM -> Conv1D classifier.

    Uses the notebook-level `encoder` and `vocab`. Layers:
    encoder -> Embedding(64) -> BiLSTM (dropout 0.2, full sequence) -> Conv1D
    -> Dropout(0.2) -> global max pool -> Dropout(0.2) -> Dense(3, softmax).

    Args:
        conv_filters: number of Conv1D filters.
        conv_size: Conv1D kernel size.
        lstm_units: units per LSTM direction.
        dense_units: currently unused (no intermediate dense layer is built).

    Returns:
        The compiled model (categorical cross-entropy, Adam at 1e-3, accuracy).
    """
    model = tf.keras.Sequential()
    model.add(encoder)
    # mask_zero=True: use masking to handle the variable sequence lengths.
    model.add(tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=64, mask_zero=True))
    model.add(tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, dropout=0.2, return_sequences=True)
    ))
    model.add(tf.keras.layers.Conv1D(
        filters=conv_filters,
        kernel_size=conv_size,
        padding="same",
        activation="relu",
        data_format="channels_last",
    ))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.GlobalMaxPool1D(keepdims=False))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(3, activation="softmax"))

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [None]:
max_features = 1000000

# Sparse tf-idf over 1- to 3-grams, adapted on train + test sentences (on CPU).
tfidf_vec = tf.keras.layers.TextVectorization(
    ngrams=3,
    sparse=True,
    output_mode='tf_idf',
    max_tokens=max_features,
)
with tf.device('/device:CPU:0'):
    tfidf_vec.adapt(np.hstack((X, df_kaggle['text'])))

# One-layer models so the vectorizers can be driven through `.predict(...)`.
tdidf = tf.keras.Sequential()
tdidf.add(tfidf_vec)

count = tf.keras.Sequential()
count.add(count_vec)

# Empty results table. NOTE: this rebinds `df`, which held the train.csv frame.
df = pd.DataFrame(columns=['model', 'average', 'logloss'])

In [None]:
# Fetch the NLTK stop-word corpus and take its English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Same words with the first letter upper-cased, so matching also works for
# stop words that start a sentence.
s_upper = [word[0].upper() + word[1:] for word in stop]

all_stop = stop + s_upper

len(all_stop)

In [None]:
# Empty results table (rebinds `df`).
df = pd.DataFrame(columns=['model', 'average', 'logloss'])

# Shared early-stopping callback: stop after one epoch without a better
# validation loss and roll back to the best weights seen.
call = tf.keras.callbacks.EarlyStopping(
    patience=1,
    monitor='val_loss',
    restore_best_weights=True,
    mode='auto',
)

In [None]:
# Train every base model twice on one train/test split -- once on the raw
# sentences and once on stop-word-stripped sentences -- and collect their
# class probabilities as stacking features.
#
# The split comes from 5-fold KFold, but the `break` at the bottom of the outer
# loop stops after the first fold, so only that fold is ever processed.
kf = KFold(n_splits=5)
n  = 0

# Stacked base-model probabilities accumulated over the inner repetitions.
X_all_train = np.array([])
X_all_test = np.array([])

for train_index, test_index in kf.split(X):
    # Inner repetition loop; range(1) means a single repetition per fold.
    for _ in range(1):

        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

        # Copies of the sentences with stop words (whitespace tokens) removed.
        X_train_stop = X_train.apply(lambda x: ' '.join([word for word in x.split() if word not in (all_stop)]))
        X_test_stop = X_test.apply(lambda x: ' '.join([word for word in x.split() if word not in (all_stop)]))
        #X_train = X_train_stop
        #X_test = X_test_stop

        # Sparse tf-idf / count features for both text variants.
        x_train_sparce = tdidf.predict(X_train)
        x_test_sparce = tdidf.predict(X_test)
        x_train_sparce_count = count.predict(X_train)
        x_test_sparce_count = count.predict(X_test)

        x_train_sparce_stop = tdidf.predict(X_train_stop)
        x_test_sparce_stop = tdidf.predict(X_test_stop)
        x_train_sparce_count_stop = count.predict(X_train_stop)
        x_test_sparce_count_stop = count.predict(X_test_stop)

        # scipy COO versions of the count features for the Naive Bayes models.
        train_count_data = convert_sparce(x_train_sparce_count)
        test_count_data = convert_sparce(x_test_sparce_count)
        train_count_data_stop = convert_sparce(x_train_sparce_count_stop)
        test_count_data_stop = convert_sparce(x_test_sparce_count_stop)

        # ------------------------------------------------------------------
        # Base models on the raw sentences
        # ------------------------------------------------------------------
        cnn = create_model(64, 128, 32, 32) ##conv1_filters, conv1_size, conv2_filters, conv2_size
        ngram = create_ngram()
        LSTM = create_lstm()
        transformer = create_transformer()
        hybrid = create_hybrid(64, 32, 64) ## conv_filters, conv_size, lstm_units
        ensemble = create_ensemble()
        ensemble_with_tdidf = create_ensemble()
        multi_nb = MultinomialNB(alpha=1.5)

        # Naive Bayes needs integer class labels rather than one-hot rows.
        multi_nb.fit(train_count_data, np.argmax(y_train, axis =1))

        cnn.fit(X_train, y_train, epochs=100, batch_size=256,validation_data= (X_test, y_test), callbacks=[call])

        ngram.fit(X_train, y_train, epochs=100, batch_size=64,validation_data= (X_test, y_test), callbacks=[call])

        LSTM.fit(X_train, y_train, epochs=100, batch_size=64,validation_data= (X_test, y_test), callbacks=[call])
        transformer.fit(X_train, y_train, epochs=100, batch_size=128,validation_data= (X_test, y_test), callbacks=[call])

        hybrid.fit(X_train, y_train, epochs=100, batch_size=32,validation_data= (X_test, y_test), callbacks=[call])

        cnn_pred = cnn.predict(X_train)
        ngram_pred = ngram.predict(X_train)
        LSTM_pred = LSTM.predict(X_train)
        transformer_pred = transformer.predict(X_train)
        hybrid_pred = hybrid.predict(X_train)

        cnn_pred_test = cnn.predict(X_test)
        ngram_pred_test = ngram.predict(X_test)
        LSTM_pred_test = LSTM.predict(X_test)
        transformer_pred_test = transformer.predict(X_test)
        hybrid_pred_test = hybrid.predict(X_test)


        multi_nb_pred = multi_nb.predict_proba(train_count_data)
        multi_nb_pred_test = multi_nb.predict_proba(test_count_data)


        com_nb = ComplementNB(alpha = 0.9)
        com_nb.fit(train_count_data, np.argmax(y_train, axis =1))
        com_nb_pred = com_nb.predict_proba(train_count_data)
        com_nb_pred_test = com_nb.predict_proba(test_count_data)
        com_nb_logloss = log_loss(y_test,com_nb_pred_test)
        com_nb_acc = np.sum(np.argmax(y_test, axis = 1) == com_nb.predict(test_count_data))/len(np.argmax(y_test, axis = 1))

        # ------------------------------------------------------------------
        # Same base models on the stop-word-stripped sentences
        # ------------------------------------------------------------------
        cnn_stop = create_model(64, 128, 32, 32) ##conv1_filters, conv1_size, conv2_filters, conv2_size
        ngram_stop = create_ngram()
        LSTM_stop = create_lstm()
        transformer_stop = create_transformer()
        hybrid_stop = create_hybrid(64, 32, 64) ## conv_filters, conv_size, lstm_units
        ensemble_stop = create_ensemble()
        ensemble_with_tdidf_stop = create_ensemble()
        multi_nb_stop = MultinomialNB(alpha=1.5)

        multi_nb_stop.fit(train_count_data_stop, np.argmax(y_train, axis =1))

        cnn_stop.fit(X_train_stop, y_train, epochs=100, batch_size=256,validation_data= (X_test_stop, y_test), callbacks=[call])

        ngram_stop.fit(X_train_stop, y_train, epochs=100, batch_size=64,validation_data= (X_test_stop, y_test), callbacks=[call])

        LSTM_stop.fit(X_train_stop, y_train, epochs=100, batch_size=64,validation_data= (X_test_stop, y_test), callbacks=[call])
        transformer_stop.fit(X_train_stop, y_train, epochs=100, batch_size=128,validation_data= (X_test_stop, y_test), callbacks=[call])

        hybrid_stop.fit(X_train_stop, y_train, epochs=100, batch_size=32,validation_data= (X_test_stop, y_test), callbacks=[call])

        cnn_pred_stop = cnn_stop.predict(X_train_stop)
        ngram_pred_stop = ngram_stop.predict(X_train_stop)
        LSTM_pred_stop = LSTM_stop.predict(X_train_stop)
        transformer_pred_stop = transformer_stop.predict(X_train_stop)
        hybrid_pred_stop = hybrid_stop.predict(X_train_stop)

        cnn_pred_test_stop = cnn_stop.predict(X_test_stop)
        ngram_pred_test_stop = ngram_stop.predict(X_test_stop)
        LSTM_pred_test_stop = LSTM_stop.predict(X_test_stop)
        # Fixed: this used `transformer` (the raw-text model), so the train and
        # test stacking columns for this slot came from different models.
        transformer_pred_test_stop = transformer_stop.predict(X_test_stop)
        hybrid_pred_test_stop = hybrid_stop.predict(X_test_stop)


        multi_nb_pred_stop = multi_nb_stop.predict_proba(train_count_data_stop)
        multi_nb_pred_test_stop = multi_nb_stop.predict_proba(test_count_data_stop)


        com_nb_stop = ComplementNB(alpha = 0.9)
        com_nb_stop.fit(train_count_data_stop, np.argmax(y_train, axis =1))
        com_nb_pred_stop = com_nb_stop.predict_proba(train_count_data_stop)
        com_nb_pred_test_stop = com_nb_stop.predict_proba(test_count_data_stop)
        com_nb_logloss_stop = log_loss(y_test,com_nb_pred_test_stop)
        com_nb_acc_stop = np.sum(np.argmax(y_test, axis = 1) == com_nb_stop.predict(test_count_data_stop))/len(np.argmax(y_test, axis = 1))

        # ------------------------------------------------------------------
        # Stacking features: 14 models x 3 class probabilities, column-wise.
        # Train and test must list the models in the same order.
        # ------------------------------------------------------------------
        X_train_ens = np.hstack([
            ngram_pred, cnn_pred, LSTM_pred, transformer_pred, hybrid_pred, multi_nb_pred, com_nb_pred,
            ngram_pred_stop, cnn_pred_stop, LSTM_pred_stop, transformer_pred_stop, hybrid_pred_stop, multi_nb_pred_stop, com_nb_pred_stop,
        ])

        X_test_ens = np.hstack([
            ngram_pred_test, cnn_pred_test, LSTM_pred_test, transformer_pred_test, hybrid_pred_test, multi_nb_pred_test, com_nb_pred_test,
            ngram_pred_test_stop, cnn_pred_test_stop, LSTM_pred_test_stop, transformer_pred_test_stop, hybrid_pred_test_stop, multi_nb_pred_test_stop, com_nb_pred_test_stop,
        ])

        X_train_final_tensor = tf.sparse.from_dense(X_train_ens)
        X_test_final_tensor = tf.sparse.from_dense(X_test_ens)
        #X_train_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_train_sparce, tf.float64),tf.dtypes.cast(x_train_sparce_stop, tf.float64), X_train_final_tensor])
        #X_test_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_test_sparce, tf.float64),tf.dtypes.cast(x_test_sparce_stop, tf.float64), X_test_final_tensor])
        #X_train_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_train_sparce, tf.float64), X_train_final_tensor])
        #X_test_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_test_sparce, tf.float64), X_test_final_tensor])

        #ensemble.fit(X_train_ens, y_train, epochs=2, batch_size=128)
        #ensemble_with_tdidf.fit(X_train_concat_tensor, y_train, epochs=2, batch_size=128)

        # Held-out metrics of the raw-text models.
        ngram_results = ngram.evaluate(X_test,y_test)
        LSTM_results =LSTM.evaluate(X_test,y_test)
        cnn_results =cnn.evaluate(X_test,y_test)
        transformer_results = transformer.evaluate(X_test,y_test)
        hybrid_results = hybrid.evaluate(X_test,y_test)
        #ensemble_results =ensemble.evaluate(X_test_ens,y_test)
        #ensemble_with_tdidf_results =ensemble_with_tdidf.evaluate(X_test_concat_tensor,y_test)

        multi_nb_logloss = log_loss(y_test,multi_nb_pred_test)
        multi_nb_acc = np.sum(np.argmax(y_test, axis = 1) == multi_nb.predict(test_count_data))/len(np.argmax(y_test, axis = 1))

        # First repetition seeds the accumulators; later ones append columns.
        if _ == 0:
            X_all_train = X_train_ens
            X_all_test = X_test_ens
            continue
        X_all_train = np.hstack((X_all_train,X_train_ens))
        X_all_test = np.hstack((X_all_test,X_test_ens))

    # Stop after the first fold (n is still 0 on the first pass).
    if n == 0:
        break
    n+=1


In [None]:
# Held-out metrics of the raw-text base models, one per line: the five Keras
# `evaluate` results first, then the two Naive Bayes log-losses.
for fold_metric in (
    ngram_results,
    LSTM_results,
    cnn_results,
    transformer_results,
    hybrid_results,
    multi_nb_logloss,
    com_nb_logloss,
):
    print(fold_metric)

In [None]:
# Fixed: the original printed `X_train_all`, a name that is never defined
# (NameError on a fresh kernel). The accumulator is called `X_all_train`.
print(X_all_train.shape)

# tf-idf features concatenated column-wise with the stacked model
# probabilities. The tf-idf tensor is cast to float64 before concatenation
# so both operands share one dtype.
X_train_final_tensor = tf.sparse.from_dense(X_train_ens)
X_test_final_tensor = tf.sparse.from_dense(X_test_ens)
X_train_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_train_sparce, tf.float64), X_train_final_tensor])
X_test_concat_tensor = tf.sparse.concat(1,[tf.dtypes.cast(x_test_sparce, tf.float64), X_test_final_tensor])

# Same construction, using the accumulated probability matrices.
X_train_sp_all = tf.sparse.from_dense(X_all_train)
X_test_sp_all = tf.sparse.from_dense(X_all_test)
X_train_concat_all = tf.sparse.concat(1,[tf.dtypes.cast(x_train_sparce, tf.float64), X_train_sp_all])
X_test_concat_all = tf.sparse.concat(1,[tf.dtypes.cast(x_test_sparce, tf.float64), X_test_sp_all])

In [None]:
# Non-negative, intercept-free linear blend of the base-model probabilities,
# fitted once on the per-repetition features and once on the accumulated ones.
test_class_ids = np.argmax(y_test, axis=1)

lin_reg = LinearRegression(positive=True, fit_intercept=False)
lin_reg.fit(X_train_ens, y_train)
lin_reg_test_scores = lin_reg.predict(X_test_ens)
linreg_logloss = log_loss(y_test, lin_reg_test_scores)
linreg_acc = np.sum(test_class_ids == np.argmax(lin_reg_test_scores, axis=1)) / len(test_class_ids)

lin_reg_all = LinearRegression(positive=True, fit_intercept=False)
lin_reg_all.fit(X_all_train, y_train)
lin_reg_all_test_scores = lin_reg_all.predict(X_all_test)
linreg_logloss_all = log_loss(y_test, lin_reg_all_test_scores)
linreg_acc_all = np.sum(test_class_ids == np.argmax(lin_reg_all_test_scores, axis=1)) / len(test_class_ids)

print(linreg_logloss)
print(linreg_logloss_all)

In [None]:
from sklearn.decomposition import TruncatedSVD

# scipy versions of the sparse tf-idf features.
X_train_pre = convert_sparce(x_train_sparce)
X_test_pre = convert_sparce(x_test_sparce)

# 100-component truncated SVD fitted on the training tf-idf matrix only.
svd = TruncatedSVD(n_iter=10, n_components=100)
svd.fit(X_train_pre)

# SVD components followed by the stacked model probabilities, column-wise.
svd_train = np.concatenate([svd.transform(X_train_pre), X_all_train], axis=1)
svd_test = np.concatenate([svd.transform(X_test_pre), X_all_test], axis=1)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees meta-classifier on SVD components + stacked probabilities.
d_tree = ExtraTreesClassifier(n_estimators=5000,criterion='entropy', max_depth=4)

print(svd_train.shape)
# y_train is one-hot, so scikit-learn fits this as a 3-output problem
# (one binary target per author).
d_tree.fit(svd_train, y_train)

# Multi-output predict_proba returns one (n_samples, n_classes) array per
# output. Run the 5000-tree forest once and reuse the result; the original
# recomputed it for the log-loss and again for the accuracy.
d_tree_pred = d_tree.predict_proba(svd_test)

# Keep the last class column (the "is this author" probability) of every
# output and transpose to shape (n_samples, 3).
d_tree_proba = np.transpose(np.array(d_tree_pred)[:,:,-1])

d_tree_logloss = log_loss(y_test, d_tree_proba)
d_tree_acc = np.sum(np.argmax(y_test, axis = 1) == np.argmax(d_tree_proba, axis = 1))/len(np.argmax(y_test, axis = 1))

print(d_tree_logloss,d_tree_acc)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees meta-classifier on the stacked model probabilities only.
d_tree = ExtraTreesClassifier(n_estimators=5000,criterion='entropy', max_depth=4)

print(X_all_train.shape)
# y_train is one-hot, so scikit-learn fits this as a 3-output problem
# (one binary target per author).
d_tree.fit(X_all_train, y_train)

# Multi-output predict_proba returns one (n_samples, n_classes) array per
# output. Run the 5000-tree forest once and reuse the result; the original
# recomputed it for the log-loss and again for the accuracy.
d_tree_pred = d_tree.predict_proba(X_all_test)

# Keep the last class column (the "is this author" probability) of every
# output and transpose to shape (n_samples, 3).
d_tree_proba = np.transpose(np.array(d_tree_pred)[:,:,-1])

d_tree_logloss = log_loss(y_test, d_tree_proba)
d_tree_acc = np.sum(np.argmax(y_test, axis = 1) == np.argmax(d_tree_proba, axis = 1))/len(np.argmax(y_test, axis = 1))

print(d_tree_logloss,d_tree_acc)

In [None]:
# Four independent copies of the dense stacking network, one per feature set.
e2, e3, e4, e5 = [create_ensemble() for _copy in range(4)]

# e2: per-repetition stacked probabilities.
e2.fit(
    X_train_ens, y_train,
    epochs=2, batch_size=128,
    validation_data=(X_test_ens, y_test),
)
# e3: accumulated stacked probabilities.
e3.fit(
    X_all_train, y_train,
    epochs=2, batch_size=128,
    validation_data=(X_all_test, y_test),
)
# e4: tf-idf + per-repetition stacked probabilities (sparse).
e4.fit(
    X_train_concat_tensor, y_train,
    epochs=2, batch_size=128,
    validation_data=(X_test_concat_tensor, y_test),
)
# e5: tf-idf + accumulated stacked probabilities (sparse).
e5.fit(
    X_train_concat_all, y_train,
    epochs=3, batch_size=256,
    validation_data=(X_test_concat_all, y_test),
)

In [None]:
def _error_mask_and_indices(pred_proba, y_onehot):
    """Return (boolean error mask, 1-D array of misclassified row positions)."""
    error_mask = np.argmax(pred_proba, axis=1) != np.argmax(y_onehot, axis=1)
    return error_mask, np.argwhere(error_mask).reshape(-1)


# --- models trained on the raw sentences ------------------------------------
ngram_wrong, ind_ngram = _error_mask_and_indices(ngram_pred_test, y_test)
cnn_wrong, ind_cnn = _error_mask_and_indices(cnn_pred_test, y_test)
lstm_wrong, ind_lstm = _error_mask_and_indices(LSTM_pred_test, y_test)
transformer_wrong, ind_trans = _error_mask_and_indices(transformer_pred_test, y_test)
hybrid_wrong, ind_hybrid = _error_mask_and_indices(hybrid_pred_test, y_test)
multi_nb_wrong, ind_multi_nb = _error_mask_and_indices(multi_nb_pred_test, y_test)
com_nb_wrong, ind_com_nb = _error_mask_and_indices(com_nb_pred_test, y_test)

# Share of test rows that every raw-text model gets wrong.
all_wrong = set.intersection(*(
    set(indices)
    for indices in (ind_ngram, ind_cnn, ind_lstm, ind_hybrid, ind_trans, ind_multi_nb, ind_com_nb)
))
print(len(all_wrong)/len(y_test))

# --- models trained on the stop-word-stripped sentences ---------------------
ngram_wrong_stop, ind_ngram_stop = _error_mask_and_indices(ngram_pred_test_stop, y_test)
cnn_wrong_stop, ind_cnn_stop = _error_mask_and_indices(cnn_pred_test_stop, y_test)
lstm_wrong_stop, ind_lstm_stop = _error_mask_and_indices(LSTM_pred_test_stop, y_test)
transformer_wrong_stop, ind_trans_stop = _error_mask_and_indices(transformer_pred_test_stop, y_test)
hybrid_wrong_stop, ind_hybrid_stop = _error_mask_and_indices(hybrid_pred_test_stop, y_test)
multi_nb_wrong_stop, ind_multi_nb_stop = _error_mask_and_indices(multi_nb_pred_test_stop, y_test)
com_nb_wrong_stop, ind_com_nb_stop = _error_mask_and_indices(com_nb_pred_test_stop, y_test)

In [None]:
# Test rows misclassified by every raw-text model.
raw_error_sets = [
    set(indices)
    for indices in (ind_ngram, ind_cnn, ind_lstm, ind_hybrid, ind_trans, ind_multi_nb, ind_com_nb)
]
all_wrong = set.intersection(*raw_error_sets)
print(len(all_wrong)/len(y_test))

# Test rows misclassified by every stop-word model.
stop_error_sets = [
    set(indices)
    for indices in (ind_ngram_stop, ind_cnn_stop, ind_lstm_stop, ind_hybrid_stop,
                    ind_trans_stop, ind_multi_nb_stop, ind_com_nb_stop)
]
all_wrong_stop = set.intersection(*stop_error_sets)
print(len(all_wrong_stop)/len(y_test))

# Rows that all fourteen models get wrong.
print(len(all_wrong & all_wrong_stop)/len(y_test))

In [None]:
from xgboost.sklearn import XGBClassifier

# XGBoost works on integer class ids rather than one-hot rows.
train_class_ids = np.argmax(y_train, axis=1)
test_class_ids = np.argmax(y_test, axis=1)

# Early stopping watches the last entry of eval_set, i.e. the test split.
eval_set = [(svd_train, train_class_ids), (svd_test, test_class_ids)]

xgb_params = dict(
    colsample_bytree=.01,
    subsample=.8,
    learning_rate=0.1,
    max_depth=4,
    num_class=3,
    objective='multi:softprob',
    n_estimators=5000,
)
xgb = XGBClassifier(**xgb_params)

xgb.fit(
    svd_train,
    train_class_ids,
    early_stopping_rounds=50,
    eval_metric=["mlogloss"],
    eval_set=eval_set,
    verbose=2000,
)

xgb_pred = xgb.predict_proba(svd_test)
xgb_logloss = log_loss(y_test, xgb_pred)
xgb_acc = np.sum(test_class_ids == xgb.predict(svd_test)) / len(test_class_ids)

print(xgb_acc,xgb_logloss)