In [1]:
from zeugma.embeddings import EmbeddingTransformer

In [2]:
import tensorflow as tf
tf.__version__

'2.4.0'

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [4]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing import *
from function import *

In [6]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV

In [7]:
df_train = pd.read_csv("./Data/Hateval/hateval2019_en_train.csv").dropna()
df_val = pd.read_csv("./Data/Hateval/hateval2019_en_dev.csv").dropna()
df_test = pd.read_csv("./Data/Hateval/hateval2019_en_test.csv").dropna()

In [8]:
train, class_train = df_train['text'].apply(pre_processing), df_train['HS']
val, class_val = df_val['text'].apply(pre_processing), df_val['HS']
test, class_test = df_test['text'].apply(pre_processing), df_test['HS']

In [9]:
w2v = EmbeddingTransformer('word2vec')

In [10]:
glove = EmbeddingTransformer('glove')

In [11]:
fasttext = EmbeddingTransformer('fasttext')

In [12]:
cv = CountVectorizer(analyzer='word', lowercase=True, stop_words='english')
cv.fit_transform(train.values.astype('U'))
     
tfidf =  TfidfVectorizer(analyzer='word', lowercase=True, use_idf=True, stop_words='english')
tfidf.fit_transform(train.values.astype('U'))

<9000x16056 sparse matrix of type '<class 'numpy.float64'>'
	with 93433 stored elements in Compressed Sparse Row format>

## Prediction SVM base model

In [13]:
svm = {
    'CV': {
        'CLF': SVC(random_state=42, kernel='linear', gamma=0.1, probability=True),
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': SVC(random_state=42, kernel='linear', gamma=0.1, probability=True),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': SVC(random_state=42, kernel='rbf', gamma=1, probability=True),
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': SVC(random_state=42, kernel='rbf', gamma=0.5, probability=True),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': SVC(random_state=42, kernel='rbf', gamma=1, probability=True),
        'EXT': fasttext,
    }
}

In [14]:
svm_clfs = {

}
for ext, clf in svm.items():
    svm_clfs[ext] = get_classifier(clf['CLF'], train, class_train, clf['EXT'])

In [15]:
df_pred_train = pd.DataFrame(df_train['HS'])
df_pred_val = pd.DataFrame(df_val['HS'])
df_pred_test = pd.DataFrame(df_test['HS'])

df_prob_val = pd.DataFrame(df_val['HS'])
df_prob_train = pd.DataFrame(df_train['HS'])
df_prob_test = pd.DataFrame(df_test['HS'])

for ext, clf in svm_clfs.items():
  # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["SVM-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["SVM-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["SVM-{}".format(ext)])
  
    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)
  
  # # Probabilidades
    cols = [
      "SVM-{}-{}".format(ext,clf.classes_[0]), 
      "SVM-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


## Prediction LR model

In [16]:
lr = {
    'CV': {
        'CLF': LogisticRegression(random_state=42, multi_class='auto', solver='liblinear', penalty='l1'),
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': LogisticRegression(random_state=42, multi_class='auto', solver='liblinear', penalty='l1'),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': LogisticRegression(random_state=42, multi_class='auto', solver='liblinear', penalty='l1'),
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': LogisticRegression(random_state=42, multi_class='auto', solver='liblinear', penalty='l1'),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': LogisticRegression(random_state=42, multi_class='auto', solver='liblinear', penalty='l1'),
        'EXT': fasttext,
    }
}

In [17]:
lr_clfs = {

}
for ext, clf in lr.items():
    lr_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])

In [18]:
for ext, clf in lr_clfs.items():
  # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["LR-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["LR-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["LR-{}".format(ext)])
  
    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

    # # Probabilidades
    cols = [
      "LR-{}-{}".format(ext,clf.classes_[0]), 
      "LR-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)

## Prediction Random Forest

With calibration for good stacking at the end.

In [19]:
rf = {
    'CV': {
        'CLF': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
                                      
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT': fasttext,
    }
}

In [20]:
rf_clfs = {

}
for ext, clf in rf.items():
    rf_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])

In [21]:
for ext, clf in rf_clfs.items():
    # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["RF-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["RF-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["RF-{}".format(ext)])
  
    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

    # # Probabilidades
    cols = [
      "RF-{}-{}".format(ext,clf.classes_[0]), 
      "RF-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


### PRediction Naive Bayes

In [22]:
nb = {
    'CV': {
        'CLF': MultinomialNB(alpha=1, fit_prior=False),
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': MultinomialNB(alpha=0.5, fit_prior=False),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': BernoulliNB(alpha=0.5, fit_prior=True),
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': BernoulliNB(alpha=0.1, fit_prior=True),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': BernoulliNB(alpha=1, fit_prior=True),
        'EXT': fasttext,
    }
}

In [23]:
nb_clfs = {

}
for ext, clf in nb.items():
    nb_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])

In [24]:
for ext, clf in nb_clfs.items():
    # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["NB-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["NB-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["NB-{}".format(ext)])
  
    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

  # # Probabilidades
    cols = [
      "NB-{}-{}".format(ext,clf.classes_[0]), 
      "NB-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)

### PRediction MLP

In [25]:
mlp = {
    'CV': {
        'CLF': MLPClassifier(random_state=42, batch_size=64, max_iter=100, activation='relu', solver='lbfgs'),
        'EXT': cv,

    },
    'TFIDF': {
        'CLF': MLPClassifier(random_state=42, batch_size=64, max_iter=100, activation='logistic', solver='adam'),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': MLPClassifier(random_state=42, batch_size=64, max_iter=100, activation='relu', solver='adam'),
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': MLPClassifier(random_state=42, batch_size=64, max_iter=100, activation='relu', solver='adam'),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': MLPClassifier(random_state=42, batch_size=20, max_iter=20, activation='relu', solver='adam'),
        'EXT': fasttext,
    }
}

In [26]:
mlp_clfs = {

}
for ext, clf in mlp.items():
    mlp_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])



In [27]:
for ext, clf in mlp_clfs.items():
    # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["MLP-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["MLP-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["MLP-{}".format(ext)])

    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

    # # Probabilidades
    cols = [
        "MLP-{}-{}".format(ext,clf.classes_[0]), 
        "MLP-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


## Prediction Extra

In [32]:
extra = {
    'CV': {
        'CLF': ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1), 
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1), 
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1),
        'EXT': fasttext,
    }
}

In [33]:
extra_clfs = {

}
for ext, clf in extra.items():
    extra_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])


In [34]:
for ext, clf in extra_clfs.items():
    # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["EXTRA-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["EXTRA-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["EXTRA-{}".format(ext)])

    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

    # # Probabilidades
    cols = [
        "EXTRA-{}-{}".format(ext,clf.classes_[0]), 
        "EXTRA-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


In [35]:
knn = {
    'CV': {
        'CLF': KNeighborsClassifier(n_neighbors=3, algorithm='auto', n_jobs=-1), 
        'EXT': cv,
    },
    'TFIDF': {
        'CLF': KNeighborsClassifier(n_neighbors=3, algorithm='auto', n_jobs=-1),
        'EXT': tfidf,
    },
    'W2V': {
        'CLF': KNeighborsClassifier(n_neighbors=5, algorithm='auto', n_jobs=-1), 
        'EXT': w2v,
    },
    'GLOVE': {
        'CLF': KNeighborsClassifier(n_neighbors=5, algorithm='auto', n_jobs=-1),
        'EXT':  glove,
    },
    'FAST': {
        'CLF': KNeighborsClassifier(n_neighbors=5, algorithm='auto', n_jobs=-1), 
        'EXT': fasttext,
    }
}

In [36]:
knn_clfs = { }
for ext, clf in knn.items():
    knn_clfs[ext] = get_classifier(clf['CLF'], train, df_train['HS'], clf['EXT'])

In [37]:
for ext, clf in knn_clfs.items():
    # Predict
    df_train_ = pd.DataFrame(clf.predict(train), columns=["KNN-{}".format(ext)])
    df_val_ = pd.DataFrame(clf.predict(val), columns=["KNN-{}".format(ext)])
    df_test_ = pd.DataFrame(clf.predict(test), columns=["KNN-{}".format(ext)])

    df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
    df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
    df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

    # # Probabilidades
    cols = [
        "KNN-{}-{}".format(ext,clf.classes_[0]), 
        "KNN-{}-{}".format(ext,clf.classes_[1])
    ]
    df_train_ = pd.DataFrame(clf.predict_proba(train), columns=cols)
    df_val_ = pd.DataFrame(clf.predict_proba(val), columns=cols)
    df_test_ = pd.DataFrame(clf.predict_proba(test), columns=cols)
    df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
    df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
    df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


# Prediction CNN

In [38]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH=300

y_train = to_categorical(df_train['HS'])
y_val = to_categorical(df_val['HS'])

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train)

seq_train = tokenizer.texts_to_sequences(train)
seq_val = tokenizer.texts_to_sequences(val)
seq_test = tokenizer.texts_to_sequences(test)

data_train = pad_sequences(seq_train, maxlen=MAX_SEQUENCE_LENGTH)
data_val = pad_sequences(seq_val, maxlen=MAX_SEQUENCE_LENGTH)
data_test = pad_sequences(seq_test, maxlen=MAX_SEQUENCE_LENGTH)

In [39]:
cnn_cv = get_CNN(cv, tokenizer, MAX_NB_WORDS, EMBEDDING_DIM=300, activation='sigmoid', dense=2)
cnn_tfidf = get_CNN(tfidf, tokenizer, MAX_NB_WORDS, EMBEDDING_DIM=300, activation='softmax', dense=2)
cnn_w2v = get_CNN(w2v, tokenizer, MAX_NB_WORDS, EMBEDDING_DIM=300, activation='sigmoid', word_embedding=True, dense=2)
cnn_glove = get_CNN(glove, tokenizer, MAX_NB_WORDS, EMBEDDING_DIM=25, activation='sigmoid', word_embedding=True, dense=2)
cnn_fast = get_CNN(fasttext, tokenizer, MAX_NB_WORDS, EMBEDDING_DIM=300, activation='sigmoid', word_embedding=True, dense=2)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          5142600   
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 298, 64)           57664     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 256)               16640     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation (Activation)      (None, 256)               0

In [40]:
cnn_cv.fit(data_train, y_train, validation_data=(data_val, y_val), epochs=20, batch_size=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x251f508eb70>

In [41]:
cnn_tfidf.fit(data_train, y_train, validation_data=(data_val, y_val), epochs=20, batch_size=200)
cnn_w2v.fit(data_train, y_train, validation_data=(data_val, y_val), epochs=20, batch_size=200)
cnn_glove.fit(data_train, y_train, validation_data=(data_val, y_val), epochs=20, batch_size=200)
cnn_fast.fit(data_train, y_train, validation_data=(data_val, y_val), epochs=20, batch_size=200)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x252268025c0>

In [42]:
# df_pred_test = df_pred_test.drop(['CNN-CV'], axis=1)
# print(df_pred_test.columns)
# CV
cols = ["CNN-CV-0", "CNN-CV-1"]
df_train_ = pd.DataFrame(np.argmax(cnn_cv.predict(data_train), axis=1), columns=["CNN-CV"])
df_val_ = pd.DataFrame(np.argmax(cnn_cv.predict(data_val), axis=1), columns=["CNN-CV"])
df_test_ = pd.DataFrame(np.argmax(cnn_cv.predict(data_test), axis=1), columns=["CNN-CV"])
  
df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

# print(df_pred_test.columns)

# # Probabilidades
  
df_train_ = pd.DataFrame(cnn_cv.predict(data_train), columns=cols)
df_val_ = pd.DataFrame(cnn_cv.predict(data_val), columns=cols)
df_test_ = pd.DataFrame(cnn_cv.predict(data_test), columns=cols)

df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


cols = ["CNN-TFIDF-0", "CNN-TFIDF-1"]

df_train_ = pd.DataFrame(np.argmax(cnn_tfidf.predict(data_train), axis=1), columns=["CNN-TFIDF"])
df_val_ = pd.DataFrame(np.argmax(cnn_tfidf.predict(data_val), axis=1), columns=["CNN-TFIDF"])
df_test_ = pd.DataFrame(np.argmax(cnn_tfidf.predict(data_test), axis=1), columns=["CNN-TFIDF"])
  
df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

# Probabilidades
  
df_train_ = pd.DataFrame(cnn_tfidf.predict(data_train), columns=cols)
df_val_ = pd.DataFrame(cnn_tfidf.predict(data_val), columns=cols)
df_test_ = pd.DataFrame(cnn_tfidf.predict(data_test), columns=cols)

df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)

cols = ["CNN-W2V-0", "CNN-W2V-1"]

df_train_ = pd.DataFrame(np.argmax(cnn_w2v.predict(data_train), axis=1), columns=["CNN-W2V"])
df_val_ = pd.DataFrame(np.argmax(cnn_w2v.predict(data_val), axis=1), columns=["CNN-W2V"])
df_test_ = pd.DataFrame(np.argmax(cnn_w2v.predict(data_test), axis=1), columns=["CNN-W2V"])
  
df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

# Probabilidades
  
df_train_ = pd.DataFrame(cnn_w2v.predict(data_train), columns=cols)
df_val_ = pd.DataFrame(cnn_w2v.predict(data_val), columns=cols)
df_test_ = pd.DataFrame(cnn_w2v.predict(data_test), columns=cols)

df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


cols = ["CNN-GLOVE-0", "CNN-GLOVE-1"]

df_train_ = pd.DataFrame(np.argmax(cnn_glove.predict(data_train), axis=1), columns=["CNN-GLOVE"])
df_val_ = pd.DataFrame(np.argmax(cnn_glove.predict(data_val), axis=1), columns=["CNN-GLOVE"])
df_test_ = pd.DataFrame(np.argmax(cnn_glove.predict(data_test), axis=1), columns=["CNN-GLOVE"])

df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

# Probabilidades
  
df_train_ = pd.DataFrame(cnn_glove.predict(data_train), columns=cols)
df_val_ = pd.DataFrame(cnn_glove.predict(data_val), columns=cols)
df_test_ = pd.DataFrame(cnn_glove.predict(data_test), columns=cols)

df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)

cols = ["CNN-FAST-0", "CNN-FAST-1"]

df_train_ = pd.DataFrame(np.argmax(cnn_fast.predict(data_train), axis=1), columns=["CNN-FAST"])
df_val_ = pd.DataFrame(np.argmax(cnn_fast.predict(data_val), axis=1), columns=["CNN-FAST"])
df_test_ = pd.DataFrame(np.argmax(cnn_fast.predict(data_test), axis=1), columns=["CNN-FAST"])

df_pred_train = pd.concat([df_pred_train, df_train_], axis=1, sort=False)
df_pred_val = pd.concat([df_pred_val, df_val_], axis=1, sort=False)
df_pred_test = pd.concat([df_pred_test, df_test_], axis=1, sort=False)

# Probabilidades
  
df_train_ = pd.DataFrame(cnn_fast.predict(data_train), columns=cols)
df_val_ = pd.DataFrame(cnn_fast.predict(data_val), columns=cols)
df_test_ = pd.DataFrame(cnn_fast.predict(data_test), columns=cols)

df_prob_train = pd.concat([df_prob_train, df_train_], axis=1, sort=False)
df_prob_val = pd.concat([df_prob_val, df_val_], axis=1, sort=False)
df_prob_test = pd.concat([df_prob_test, df_test_], axis=1, sort=False)


In [43]:
df_pred_train.to_csv("./Saved_Predict_and_Proba/HATEVAL/pred_train_hateval.csv")
df_pred_val.to_csv("./Saved_Predict_and_Proba/HATEVAL/pred_val_hateval.csv")
df_pred_test.to_csv("./Saved_Predict_and_Proba/HATEVAL/pred_test_hateval.csv")
df_prob_train.to_csv("./Saved_Predict_and_Proba/HateVal/prob_train_hateval.csv")
df_prob_val.to_csv("./Saved_Predict_and_Proba/HateVal/prob_val_hateval.csv")
df_prob_test.to_csv("./Saved_Predict_and_Proba/HateVal/prob_test_hateval.csv")

# Analyzing Results

In [1]:
import numpy as np
from pprint import pprint
import pandas as pd
from sklearn.metrics import f1_score
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']

## Validation data

In [2]:
val_df = pd.read_csv("./Saved_Predict_and_Proba/HateVal/prob_val_hateval.csv").dropna()
probas_val, labels_val = val_df.drop(columns=["Unnamed: 0", "HS"]), val_df["HS"]
results_f1_val = np.zeros((len(algorithms_list), len(fe_list)))
for idx_alg, alg in enumerate(algorithms_list):
    for idx_fe, fe in enumerate(fe_list):
        probas = probas_val.filter(regex=alg+'-'+fe)
        y_pred = np.argmax(probas.to_numpy(), axis=1)
        results_f1_val[idx_alg, idx_fe] = f1_score(labels_val, y_pred)

In [3]:
probas

Unnamed: 0,NB-FAST-0,NB-FAST-1
0,4.610015e-01,5.389985e-01
1,1.607912e-01,8.392088e-01
2,3.348711e-01,6.651289e-01
3,9.994668e-01,5.331676e-04
4,9.999996e-01,3.969834e-07
...,...,...
995,3.316726e-05,9.999668e-01
996,7.580322e-08,9.999999e-01
997,6.568345e-04,9.993432e-01
998,2.883085e-06,9.999971e-01


## Test data

In [3]:
test_df = pd.read_csv("./Saved_Predict_and_Proba/HateVal/prob_test_hateval.csv").dropna()
probas_test, labels_test = test_df.drop(columns=["Unnamed: 0", "HS"]), test_df["HS"]
results_f1_test = np.zeros((len(algorithms_list), len(fe_list)))
for idx_alg, alg in enumerate(algorithms_list):
    for idx_fe, fe in enumerate(fe_list):
        probas = probas_test.filter(regex=alg+'-'+fe)
        y_pred = np.argmax(probas.to_numpy(), axis=1)
        results_f1_test[idx_alg, idx_fe] = f1_score(labels_test, y_pred)

In [4]:
results_df_test = pd.DataFrame(results_f1_test, columns=fe_list, index=algorithms_list)
results_df_val = pd.DataFrame(results_f1_val, columns=fe_list, index=algorithms_list)

In [5]:
pprint(results_df_test.to_latex())

('\\begin{tabular}{lrrrrr}\n'
 '\\toprule\n'
 '{} &        CV &     TFIDF &       W2V &     GLOVE &      FAST \\\\\n'
 '\\midrule\n'
 'SVM   &  0.604611 &  0.603842 &  0.627646 &  0.561172 &  0.617781 \\\\\n'
 'MLP   &  0.598499 &  0.587328 &  0.615523 &  0.575249 &  0.627669 \\\\\n'
 'KNN   &  0.485299 &  0.113573 &  0.585816 &  0.537245 &  0.566645 \\\\\n'
 'RF    &  0.586481 &  0.586640 &  0.598967 &  0.558888 &  0.597227 \\\\\n'
 'EXTRA &  0.591253 &  0.593182 &  0.594798 &  0.565487 &  0.596252 \\\\\n'
 'CNN   &  0.582179 &  0.559368 &  0.604710 &  0.569472 &  0.608629 \\\\\n'
 'LR    &  0.607941 &  0.596645 &  0.617062 &  0.503864 &  0.589726 \\\\\n'
 'NB    &  0.601139 &  0.603803 &  0.528418 &  0.499833 &  0.538870 \\\\\n'
 '\\bottomrule\n'
 '\\end{tabular}\n')


In [6]:
results_df_test.to_csv('Results HatEval.csv')