# Load Data

In [None]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')

# Training corpus: drop rows with missing values, then hold out 20% for validation.
df = pd.read_csv("/content/drive/MyDrive/HateSpeech/FINAL_DATASET/Final_dataset_balanced.csv").dropna()
df_train, df_val = train_test_split(df, random_state=42, test_size=0.2)

# Test corpus is read from a separate file and is not passed through dropna.
df_test = pd.read_csv("/content/drive/MyDrive/HateSpeech/hateXplain.csv")
print(df_train.shape, df_val.shape, df_test.shape)

# Keep only rows whose text is shorter than 1000 characters, in every split.
df_train, df_val, df_test = [
    split[split['text'].map(len) < 1000] for split in (df_train, df_val, df_test)
]
print(df_train.shape, df_val.shape, df_test.shape)

(123424, 3) (30856, 3) (15351, 3)
(119868, 3) (29931, 3) (15351, 3)


# 1 GloVe + RF,LGBM

### GloVe

In [None]:
import numpy as np
# Parse the GloVe text file into {word: vector}. Every line is split on whitespace:
# the first field is the token, the remaining fields are its coefficients.
embeddings_index = {}
# `with` closes the handle even if a line fails to parse; the encoding is pinned so the
# result does not depend on the platform's default locale.
with open("/content/drive/MyDrive/HateSpeech/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
from keras.initializers import Constant
from keras.layers import *

# Raw text and labels for each split.
X_train = df_train['text']
y_train = df_train['class']
X_val = df_val['text']
y_val = df_val['class']
X_test = df_test['text']
y_test = df_test['class']

# Initialization
max_features = 100000                        # vocabulary cap handed to the tokenizer
sequence_length = 235                        # every text is padded/truncated to this many tokens
embedding_dim = 100                          # width of the glove.6B.100d vectors loaded earlier
num_words = max_features + 1                 # embedding rows: indices 0..max_features
flat_dim = sequence_length * embedding_dim   # width of one flattened sample (was a literal 23500)

# Tokenizing
data_start = time.time()  # also read by the evaluation loop's "DONE!!!" timer
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')
tokenizer.fit_on_texts(X_train)  # vocabulary is learned from the training split only
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=sequence_length)

# Embedding matrix: row i holds the vector for the word with tokenizer index i.
word_index = tokenizer.word_index
embedding_matrix_train = np.zeros((num_words, embedding_dim))
# Seeded generator so words missing from GloVe get the same random vector on every run.
oov_rng = np.random.RandomState(42)

for word, i in word_index.items():
    if i > max_features:
        # index would fall outside the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_train[i] = embedding_vector
    else:
        embedding_matrix_train[i] = oov_rng.randn(embedding_dim)

# The layer is only called directly below to look vectors up; nothing in this cell trains it.
embed_keras = Embedding(num_words,
                      embedding_dim,
                      embeddings_initializer=Constant(embedding_matrix_train),
                      input_length=sequence_length,
                      trainable=True)

X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=sequence_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=sequence_length)

# Look up every token's vector and flatten each sample to a single row of flat_dim values.
X_train = embed_keras(X_train).numpy().reshape(X_train.shape[0], flat_dim)
X_val = embed_keras(X_val).numpy().reshape(X_val.shape[0], flat_dim)
X_test = embed_keras(X_test).numpy().reshape(X_test.shape[0], flat_dim)

### RF,LGBM

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# Candidate classifiers as [estimator, display name] pairs; the evaluation loop
# iterates over them in this order.
model_list = [
    [
        RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=4,
            oob_score=False,
            n_jobs=-1,
            verbose=10,
        ),
        'RandomForestClassifier',
    ],
    [LGBMClassifier(), 'LGBMClassifier'],
]
model_list

[[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=4,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=None,
                         verbose=10, warm_start=False),
  'RandomForestClassifier'],
 [LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                 importance_type='split', learning_rate=0.1, max_depth=-1,
                 min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                 n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                 random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
             

## Evaluation

In [None]:
# Results table: one row per fitted model, one column per metric.
# NOTE: `eval` shadows the Python builtin; the name is kept because later cells read it.
metric_columns = ['Model',
                  'Train_Score(ACC)','Train_Score(ROC_AUC)','Train_Score(F1)',
                  'Val_Score(ACC)','Val_Score(ROC_AUC)','Val_Score(F1)',
                  'Test_Score(ACC)','Test_Score(ROC_AUC)','Test_Score(F1)','Inference_Time']
eval = pd.DataFrame(columns=metric_columns)
result_frames = []  # one single-row frame per model, combined with pd.concat

for LR, model_name in model_list:

  # Fit
  print(f"{model_name}","-"*100)
  LR.fit(X_train,y_train)

  # Inference
  print("TRAIN SET")
  fitted = LR.predict(X_train)
  fitted_proba = LR.predict_proba(X_train)

  print("VAL SET")
  val_pred = LR.predict(X_val)
  val_pred_proba = LR.predict_proba(X_val)

  print("TEST SET")
  start = time.time()
  test_pred = LR.predict(X_test)
  inference_time = time.time()-start  # times predict() on the test split only
  test_pred_proba = LR.predict_proba(X_test)
  print(f"Inferenced : {inference_time}s",end='\t')

  # Evaluate (ROC AUC is computed from column 1 of predict_proba)
  train_acc = accuracy_score(y_train,fitted)
  train_auc = roc_auc_score(y_train,fitted_proba[:,1])
  train_f1 = f1_score(y_train,fitted)

  val_acc = accuracy_score(y_val,val_pred)
  val_auc = roc_auc_score(y_val,val_pred_proba[:,1])
  val_f1 = f1_score(y_val,val_pred)

  test_acc = accuracy_score(y_test,test_pred)
  test_auc = roc_auc_score(y_test,test_pred_proba[:,1])
  test_f1 = f1_score(y_test,test_pred)
  print(f"TRAIN ROC_AUC : {train_auc} VAL ROC_AUC : {val_auc} TEST ROC_AUC : {test_auc}")

  # Row values in the same order as metric_columns.
  LR_list = [f"{model_name}",
             train_acc, train_auc, train_f1,
             val_acc, val_auc, val_f1,
             test_acc, test_auc, test_f1,
             inference_time]
  print(f"DONE!!! {time.time()-data_start}")

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  # `eval` is rebuilt on every pass so partial results survive a later failure.
  result_frames.append(pd.DataFrame([LR_list],columns=metric_columns))
  eval = pd.concat(result_frames)

RandomForestClassifier ----------------------------------------------------------------------------------------------------


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100

building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100building tree 34 of 100
building tree 35 of 100

building tree 36 of 100building tree 37 of 100
building tree 38 of 100

building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.1s


building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100building tree 54 of 100

building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   16.6s


building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100


[Parallel(n_jobs=-1)]: Done  32 out of 100 | elapsed:   17.2s remaining:   36.5s


building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100


[Parallel(n_jobs=-1)]: Done  43 out of 100 | elapsed:   33.1s remaining:   43.8s


building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100


[Parallel(n_jobs=-1)]: Done  54 out of 100 | elapsed:   33.8s remaining:   28.8s


building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=-1)]: Done  65 out of 100 | elapsed:   34.4s remaining:   18.5s
[Parallel(n_jobs=-1)]: Done  76 out of 100 | elapsed:   34.9s remaining:   11.0s
[Parallel(n_jobs=-1)]: Done  87 out of 100 | elapsed:   45.0s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  98 out of 100 | elapsed:   45.7s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   45.8s finished


TRAIN SET


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=40)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=40)]: Done  32 out of 100 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=40)]: Done  43 out of 100 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=40)]: Done  54 out of 100 | elapsed:    0.4s remaining:    0.3s
[Parallel(n_jobs=40)]: Done  65 out of 100 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=40)]: Done  76 out of 100 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  87 out of 100 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  98 out of 100 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_

VAL SET


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=40)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=40)]: Done  32 out of 100 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=40)]: Done  43 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  54 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  65 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  76 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done  87 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done  98 out of 100 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_

TEST SET


[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=40)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=40)]: Done  32 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  43 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  54 out of 100 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=40)]: Done  65 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done  76 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done  87 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done  98 out of 100 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_

Inferenced : 0.4736795425415039s	TRAIN ROC_AUC : 0.9998535635659462 VAL ROC_AUC : 0.8158714488156463 TEST ROC_AUC : 0.4780614791794996
DONE!!! 344.44000720977783
LGBMClassifier ----------------------------------------------------------------------------------------------------
TRAIN SET
VAL SET
TEST SET
Inferenced : 0.10757231712341309s	TRAIN ROC_AUC : 0.9042332544962901 VAL ROC_AUC : 0.8789744439192967 TEST ROC_AUC : 0.4840430921144571
DONE!!! 502.0091800689697


In [None]:
# Tag each model name with the embedding it was trained on. Guarded so that re-running
# this cell does not append the suffix a second time.
suffix = '_GloVe_EMBED'
already_tagged = eval['Model'].astype(str).str.endswith(suffix)
# np.where works positionally, so the duplicated row index of `eval` is not a problem.
eval['Model'] = np.where(already_tagged, eval['Model'], eval['Model']+suffix)

In [None]:
# Display the results table for the GloVe-based models.
eval

Unnamed: 0,Model,Train_Score(ACC),Train_Score(ROC_AUC),Train_Score(F1),Val_Score(ACC),Val_Score(ROC_AUC),Val_Score(F1),Test_Score(ACC),Test_Score(ROC_AUC),Test_Score(F1),Inference_Time
0,RandomForestClassifier_GloVe_EMBED,0.998665,0.999854,0.998685,0.732919,0.815871,0.724687,0.444271,0.478061,0.334607,0.47368
0,LGBMClassifier_GloVe_EMBED,0.822096,0.904233,0.816286,0.794494,0.878974,0.781779,0.437235,0.484043,0.34913,0.107572


In [None]:
# Persist the GloVe results table (the row index is written as the first CSV column).
glove_report_path = "/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_GloVe.csv"
eval.to_csv(glove_report_path)

# 2 Build Bert Embedded Dataset (DistilBERT)

## BERT TOKENIZER

In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.7MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 64.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     

In [None]:
from transformers import ElectraTokenizer, TFElectraForSequenceClassification

# Tokenizer and sequence-classification model from the same pretrained checkpoint,
# then the fine-tuned weights stored on Drive are loaded on top.
electra_checkpoint = 'google/electra-small-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(electra_checkpoint)
electra_classifier = TFElectraForSequenceClassification.from_pretrained(electra_checkpoint)
electra_classifier.load_weights("/content/drive/MyDrive/HateSpeech/Weight/ELECTRA")

# Keep every layer except the last one, so `model` returns encoder outputs.
# NOTE(review): the dropped layer is presumably the classification head — confirm.
model = tf.keras.Sequential(electra_classifier.layers[:-1])

Some layers from the model checkpoint at google/electra-small-discriminator were not used when initializing TFElectraForSequenceClassification: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### BUILD DATASET

In [None]:
import time

# Texts and labels as plain Python lists, one pair per split.
X_train, y_train = df_train['text'].to_list(), df_train['class'].to_list()
X_val, y_val = df_val['text'].to_list(), df_val['class'].to_list()
X_test, y_test = df_test['text'].to_list(), df_test['class'].to_list()

# Tokenizing: each split is encoded in one call with the same options, and the
# three calls are timed together.
encoding_time = time.time()
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train, **encode_options)
val_encodings = tokenizer(X_val, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)
print("encoding time : ", time.time()-encoding_time)

encoding time :  109.29729127883911


In [None]:
# building dataset :  279.47532629966736
# building dataset :  337.6722948551178

In [None]:
# Wrap each split as a tf.data.Dataset of (encoding dict, label) pairs.
# Elapsed time is printed once after the train split and once after all three.
dataset_time = time.time()

train_pairs = (dict(train_encodings), y_train)
train_dataset = tf.data.Dataset.from_tensor_slices(train_pairs)
print("building dataset : ", time.time()-dataset_time)

val_pairs = (dict(val_encodings), y_val)
val_dataset = tf.data.Dataset.from_tensor_slices(val_pairs)

test_pairs = (dict(test_encodings), y_test)
test_dataset = tf.data.Dataset.from_tensor_slices(test_pairs)
print("building dataset : ", time.time()-dataset_time)

building dataset :  353.823668718338
building dataset :  428.4043426513672


### SAVE DATASET

In [None]:
# `pickle` is first imported in a later cell of this notebook, so on a fresh kernel this
# cell used to fail with NameError; import it where it is first needed.
import pickle

# Save each dataset (GZIP-compressed) and, next to it, its element_spec, which the
# loading cell passes back to tf.data.experimental.load.
for dataset, save_dir in (
    (train_dataset, "/content/drive/MyDrive/HateSpeech/traindataset_tf"),
    (val_dataset, "/content/drive/MyDrive/HateSpeech/valdataset_tf"),
    (test_dataset, "/content/drive/MyDrive/HateSpeech/testdataset_tf"),
):
    tf.data.experimental.save(dataset, save_dir, compression='GZIP')
    with open(save_dir + '/element_spec', 'wb') as out_:
        pickle.dump(dataset.element_spec, out_)

### Load DATASET

In [None]:
import pickle
# Restore the three saved datasets. For each directory the pickled element_spec is
# read first and then passed to tf.data.experimental.load with the same GZIP setting.
train_dir = "/content/drive/MyDrive/HateSpeech/traindataset_tf"
with open(train_dir + '/element_spec', 'rb') as in_:
    es = pickle.load(in_)
train_dataset = tf.data.experimental.load(train_dir, es, compression='GZIP')

val_dir = "/content/drive/MyDrive/HateSpeech/valdataset_tf"
with open(val_dir + '/element_spec', 'rb') as in_:
    es = pickle.load(in_)
val_dataset = tf.data.experimental.load(val_dir, es, compression='GZIP')

test_dir = "/content/drive/MyDrive/HateSpeech/testdataset_tf"
with open(test_dir + '/element_spec', 'rb') as in_:
    es = pickle.load(in_)
test_dataset = tf.data.experimental.load(test_dir, es, compression='GZIP')

## EMBEDDING

### SAVE EMBEDDED DATASET

In [None]:
# Embed the training split with `model`, 64 rows per batch. Results end up in `temp`
# (one feature row per text) and `y_temp` (labels), which the next cell pickles.
embedded_batches = []  # one 2-D array per batch; stacked once after the loop
label_batches = []
rows_done = 0
start = time.time()
for i, pred in enumerate(train_dataset.batch(64)):
  if i%10==0 :
    # Progress line: batch index / batch count, seconds for the last 10 batches, rough ETA.
    lef = len(df_train)/64
    print(i,'/',int(lef),end='\t')
    ela = time.time()-start
    print(round(ela,0),'s',end='\t')
    print(round(ela*(lef-i)/10,0),'s',end='\t')
    if i>0:print("SHAPE :",(rows_done,embedded_batches[0].shape[1]),(rows_done,))
    start = time.time()
  # pred is (encoding dict, labels). Element 0 of the model output, indexed [:,0,:],
  # keeps position 0 of every sequence (presumably the [CLS] token — confirm).
  batch_vectors = np.asarray(model.predict(pred[0])[0][:,0,:])
  embedded_batches.append(batch_vectors)
  label_batches.append(np.asarray(pred[1]))
  rows_done += batch_vectors.shape[0]

# One concatenation at the end instead of np.append per batch, which re-copied the whole
# accumulated array every iteration. Both results are NumPy arrays even for a single batch.
temp = np.concatenate(embedded_batches,axis=0)
y_temp = np.concatenate(label_batches)

0 / 1872	0.0 s	29.0 s	10 / 1872	3.0 s	633.0 s	SHAPE : (640, 256) (640,)
20 / 1872	3.0 s	628.0 s	SHAPE : (1280, 256) (1280,)
30 / 1872	3.0 s	629.0 s	SHAPE : (1920, 256) (1920,)
40 / 1872	3.0 s	624.0 s	SHAPE : (2560, 256) (2560,)
50 / 1872	3.0 s	622.0 s	SHAPE : (3200, 256) (3200,)
60 / 1872	3.0 s	620.0 s	SHAPE : (3840, 256) (3840,)
70 / 1872	3.0 s	617.0 s	SHAPE : (4480, 256) (4480,)
80 / 1872	3.0 s	614.0 s	SHAPE : (5120, 256) (5120,)
90 / 1872	3.0 s	610.0 s	SHAPE : (5760, 256) (5760,)
100 / 1872	3.0 s	606.0 s	SHAPE : (6400, 256) (6400,)
110 / 1872	3.0 s	605.0 s	SHAPE : (7040, 256) (7040,)
120 / 1872	3.0 s	601.0 s	SHAPE : (7680, 256) (7680,)
130 / 1872	3.0 s	599.0 s	SHAPE : (8320, 256) (8320,)
140 / 1872	3.0 s	596.0 s	SHAPE : (8960, 256) (8960,)
150 / 1872	3.0 s	593.0 s	SHAPE : (9600, 256) (9600,)
160 / 1872	3.0 s	589.0 s	SHAPE : (10240, 256) (10240,)
170 / 1872	3.0 s	586.0 s	SHAPE : (10880, 256) (10880,)
180 / 1872	3.0 s	585.0 s	SHAPE : (11520, 256) (11520,)
190 / 1872	3.0 s	580.0 s	SHAP

In [None]:
import pickle
# Store the training embeddings and labels together as a two-element list.
electra_train_path = "/content/drive/MyDrive/HateSpeech/ELECTRA_TRAIN2.pickle"
with open(electra_train_path, "wb") as f:
  pickle.dump([temp, y_temp], f)

In [None]:
# Embed the validation split with `model`, 64 rows per batch, then pickle
# [features, labels] to ELECTRA_VAL2.pickle.
embedded_batches = []  # one 2-D array per batch; stacked once after the loop
label_batches = []
rows_done = 0
start = time.time()
for i, pred in enumerate(val_dataset.batch(64)):
  if i%10==0 :
    # Progress line: batch index / batch count, seconds for the last 10 batches, rough ETA.
    lef = len(df_val)/64
    print(i,'/',int(lef),end='\t')
    ela = time.time()-start
    print(round(ela,0),'s',end='\t')
    print(round(ela*(lef-i)/10,0),'s',end='\t')
    if i>0:print("SHAPE :",(rows_done,embedded_batches[0].shape[1]),(rows_done,))
    start = time.time()
  # pred is (encoding dict, labels). Element 0 of the model output, indexed [:,0,:],
  # keeps position 0 of every sequence (presumably the [CLS] token — confirm).
  batch_vectors = np.asarray(model.predict(pred[0])[0][:,0,:])
  embedded_batches.append(batch_vectors)
  label_batches.append(np.asarray(pred[1]))
  rows_done += batch_vectors.shape[0]

# One concatenation at the end instead of np.append per batch, which re-copied the whole
# accumulated array every iteration. Both results are NumPy arrays even for a single batch.
temp = np.concatenate(embedded_batches,axis=0)
y_temp = np.concatenate(label_batches)
with open("/content/drive/MyDrive/HateSpeech/ELECTRA_VAL2.pickle","wb") as f :
  pickle.dump([temp,y_temp],f)

0 / 467	0.0 s	2.0 s	10 / 467	3.0 s	123.0 s	SHAPE : (640, 256) (640,)
20 / 467	3.0 s	121.0 s	SHAPE : (1280, 256) (1280,)
30 / 467	3.0 s	118.0 s	SHAPE : (1920, 256) (1920,)
40 / 467	3.0 s	120.0 s	SHAPE : (2560, 256) (2560,)
50 / 467	3.0 s	114.0 s	SHAPE : (3200, 256) (3200,)
60 / 467	3.0 s	111.0 s	SHAPE : (3840, 256) (3840,)
70 / 467	3.0 s	107.0 s	SHAPE : (4480, 256) (4480,)
80 / 467	3.0 s	104.0 s	SHAPE : (5120, 256) (5120,)
90 / 467	3.0 s	102.0 s	SHAPE : (5760, 256) (5760,)
100 / 467	3.0 s	99.0 s	SHAPE : (6400, 256) (6400,)
110 / 467	3.0 s	98.0 s	SHAPE : (7040, 256) (7040,)
120 / 467	3.0 s	95.0 s	SHAPE : (7680, 256) (7680,)
130 / 467	3.0 s	92.0 s	SHAPE : (8320, 256) (8320,)
140 / 467	3.0 s	89.0 s	SHAPE : (8960, 256) (8960,)
150 / 467	3.0 s	86.0 s	SHAPE : (9600, 256) (9600,)
160 / 467	3.0 s	84.0 s	SHAPE : (10240, 256) (10240,)
170 / 467	3.0 s	81.0 s	SHAPE : (10880, 256) (10880,)
180 / 467	3.0 s	79.0 s	SHAPE : (11520, 256) (11520,)
190 / 467	3.0 s	77.0 s	SHAPE : (12160, 256) (12160,)
200 /

In [None]:
# Embed the test split with `model`, 64 rows per batch, then pickle
# [features, labels] to ELECTRA_TEST2.pickle.
# `model` must be the ELECTRA encoder here, not a scikit-learn estimator.
embedded_batches = []  # one 2-D array per batch; stacked once after the loop
label_batches = []
rows_done = 0
start = time.time()
for i, pred in enumerate(test_dataset.batch(64)):
  if i%10==0 :
    # Progress line: batch index / batch count, seconds for the last 10 batches, rough ETA.
    lef = len(df_test)/64
    print(i,'/',int(lef),end='\t')
    ela = time.time()-start
    print(round(ela,0),'s',end='\t')
    print(round(ela*(lef-i)/10,0),'s',end='\t')
    if i>0:print("SHAPE :",(rows_done,embedded_batches[0].shape[1]),(rows_done,))
    start = time.time()
  # pred is (encoding dict, labels). Element 0 of the model output, indexed [:,0,:],
  # keeps position 0 of every sequence (presumably the [CLS] token — confirm).
  batch_vectors = np.asarray(model.predict(pred[0])[0][:,0,:])
  embedded_batches.append(batch_vectors)
  label_batches.append(np.asarray(pred[1]))
  rows_done += batch_vectors.shape[0]

# One concatenation at the end instead of np.append per batch, which re-copied the whole
# accumulated array every iteration. Both results are NumPy arrays even for a single batch.
temp = np.concatenate(embedded_batches,axis=0)
y_temp = np.concatenate(label_batches)
with open("/content/drive/MyDrive/HateSpeech/ELECTRA_TEST2.pickle","wb") as f :
  pickle.dump([temp,y_temp],f)

0 / 239	0.0 s	0.0 s	

NotFittedError: ignored

### LOAD EMBEDDED DATASET

In [None]:
# load
import pickle
# Read the pickled splits back in: test, then val, then train.
# pickle.load executes whatever the file encodes, so only load files you wrote yourself.
test_pickle_path = "/content/drive/MyDrive/HateSpeech/test.pickle"
with open(test_pickle_path, "rb") as f:
  test_hs = pickle.load(f)

val_pickle_path = "/content/drive/MyDrive/HateSpeech/val.pickle"
with open(val_pickle_path, "rb") as f:
  val_hs = pickle.load(f)

train_pickle_path = "/content/drive/MyDrive/HateSpeech/train.pickle"
with open(train_pickle_path, "rb") as f:
  train_hs = pickle.load(f)

In [None]:
# Replace train_hs / val_hs with the ELECTRA embeddings saved above.
# test_hs is not reassigned in this cell.
electra_train_pickle = "/content/drive/MyDrive/HateSpeech/ELECTRA_TRAIN2.pickle"
with open(electra_train_pickle, "rb") as f:
  train_hs = pickle.load(f)

electra_val_pickle = "/content/drive/MyDrive/HateSpeech/ELECTRA_VAL2.pickle"
with open(electra_val_pickle, "rb") as f:
  val_hs = pickle.load(f)

In [None]:
# Sanity check: shapes of the first two entries stored in each loaded split.
for hs in (train_hs, val_hs):
  first_entry, second_entry = hs[0], hs[1]
  print(first_entry.shape, second_entry.shape)

(119868, 256) (119868,)
(29931, 256) (29931,)


In [None]:
# Scratch check: sum of the train and val row counts printed by the shape cell above.
119868+29931

149799

## Machine Learning Fit & Evaluate

### Logistic,RF,LGBM

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# Candidate classifiers as [estimator, display name] pairs; the evaluation loop
# iterates over them in this order.
model_list = [
    [LogisticRegression(), 'LogisticRegression'],
    [
        RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=4,
            oob_score=False,
            n_jobs=-1,
            verbose=10,
        ),
        'RandomForestClassifier',
    ],
    [LGBMClassifier(), 'LGBMClassifier'],
]
model_list

[[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False), 'LogisticRegression'],
 [RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=4,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=None,
                         verbose=10, warm_start=False),
  'RandomForestClassifier'],
 [LGBMClassifier(boosting_type='gbdt', class_wei

### Evaluation

In [None]:
# Results table: one row per fitted model, one column per metric.
# NOTE: `eval` shadows the Python builtin; the name is kept because later cells read it.
metric_columns = ['Model',
                  'Train_Score(ACC)','Train_Score(ROC_AUC)','Train_Score(F1)',
                  'Val_Score(ACC)','Val_Score(ROC_AUC)','Val_Score(F1)',
                  'Test_Score(ACC)','Test_Score(ROC_AUC)','Test_Score(F1)','Inference_Time']
eval = pd.DataFrame(columns=metric_columns)
result_frames = []  # one single-row frame per model, combined with pd.concat

# Each *_hs is [features, labels]; unpack once, the data is the same for every model.
X_train = train_hs[0]
y_train = train_hs[1]
X_val = val_hs[0]
y_val = val_hs[1]
X_test = test_hs[0]
y_test = test_hs[1]

# The loop variable is deliberately NOT called `model`: that name holds the transformer
# encoder used by the embedding cells, and overwriting it with a scikit-learn estimator
# makes those cells fail when they are re-run.
for LR, model_name in model_list:
  data_start = time.time()
  # Fit
  LR.fit(X_train,y_train)

  # Inference
  print("TRAIN SET")
  fitted = LR.predict(X_train)
  fitted_proba = LR.predict_proba(X_train)

  print("VAL SET")
  val_pred = LR.predict(X_val)
  val_pred_proba = LR.predict_proba(X_val)

  print("TEST SET")
  start = time.time()
  test_pred = LR.predict(X_test)
  inference_time = time.time()-start  # times predict() on the test split only
  test_pred_proba = LR.predict_proba(X_test)
  print(f"Inferenced : {inference_time}s",end='\t')

  # Evaluate (ROC AUC is computed from column 1 of predict_proba)
  train_acc = accuracy_score(y_train,fitted)
  train_auc = roc_auc_score(y_train,fitted_proba[:,1])
  train_f1 = f1_score(y_train,fitted)

  val_acc = accuracy_score(y_val,val_pred)
  val_auc = roc_auc_score(y_val,val_pred_proba[:,1])
  val_f1 = f1_score(y_val,val_pred)

  test_acc = accuracy_score(y_test,test_pred)
  test_auc = roc_auc_score(y_test,test_pred_proba[:,1])
  test_f1 = f1_score(y_test,test_pred)
  print(f"train ROCAUC : {train_auc} val ROCAUC : {val_auc} test ROCAUC : {test_auc} ")

  # Row values in the same order as metric_columns.
  LR_list = [model_name,
             train_acc, train_auc, train_f1,
             val_acc, val_auc, val_f1,
             test_acc, test_auc, test_f1,
             inference_time]

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  # `eval` is rebuilt on every pass so partial results survive a later failure.
  result_frames.append(pd.DataFrame([LR_list],columns=metric_columns))
  eval = pd.concat(result_frames)
  print(f"{time.time()-data_start}")

NameError: ignored

In [None]:
# Display the results table for the models trained on the transformer embeddings.
eval

Unnamed: 0,Model,Train_Score(ACC),Train_Score(ROC_AUC),Train_Score(F1),Val_Score(ACC),Val_Score(ROC_AUC),Val_Score(F1),Test_Score(ACC),Test_Score(ROC_AUC),Test_Score(F1),Inference_Time
0,LogisticRegression_BERT_EMBED,0.842627,0.921017,0.8451,0.842371,0.920272,0.844208,0.360042,0.502175,0.383998,0.048613
0,RandomForestClassifier_BERT_EMBED,0.998974,0.999996,0.99899,0.823527,0.9031,0.824437,0.405641,0.499277,0.376691,0.123162
0,LGBMClassifier_BERT_EMBED,0.855266,0.933154,0.857789,0.834887,0.91523,0.836876,0.37991,0.499551,0.382324,0.016189


In [None]:
# Persist the results table (the row index is written as the first CSV column).
bert_report_path = "/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_BERT.csv"
eval.to_csv(bert_report_path)

# 3 Build Bert Embedded Dataset (SqueezeBERT)

## BERT TOKENIZER

In [None]:
from transformers import AutoTokenizer, AutoModel
# Tokenizer and base model are loaded from the same SqueezeBERT checkpoint.
# This rebinds `tokenizer` and `model`, which previously held the ELECTRA objects.
squeezebert_checkpoint = "squeezebert/squeezebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(squeezebert_checkpoint)
model = AutoModel.from_pretrained(squeezebert_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=500.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231580.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466182.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=103473649.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### BUILD DATASET

In [None]:
import time

# Texts and labels as plain Python lists, one pair per split.
X_train, y_train = df_train['text'].to_list(), df_train['class'].to_list()
X_val, y_val = df_val['text'].to_list(), df_val['class'].to_list()
X_test, y_test = df_test['text'].to_list(), df_test['class'].to_list()

# Tokenizing: each split is encoded in one call with the same options, and the
# three calls are timed together.
encoding_time = time.time()
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train, **encode_options)
val_encodings = tokenizer(X_val, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)
print("encoding time : ", time.time()-encoding_time)

encoding time :  42.50465726852417


In [None]:
# Wrap each split as a tf.data.Dataset of (encoding dict, label) pairs.
# Elapsed time is printed once after the train split and once after all three.
dataset_time = time.time()

train_pairs = (dict(train_encodings), y_train)
train_dataset = tf.data.Dataset.from_tensor_slices(train_pairs)
print("building dataset : ", time.time()-dataset_time)

val_pairs = (dict(val_encodings), y_val)
val_dataset = tf.data.Dataset.from_tensor_slices(val_pairs)

test_pairs = (dict(test_encodings), y_test)
test_dataset = tf.data.Dataset.from_tensor_slices(test_pairs)
print("building dataset : ", time.time()-dataset_time)

building dataset :  418.4919927120209
building dataset :  506.6187915802002


## EMBEDDING

### SAVE EMBEDDED DATASET

In [None]:
import torch

# Embed the training split with the PyTorch `model`, 64 rows per batch. Results end up in
# `temp` (one feature row per text) and `y_temp` (labels), which the next cell pickles.
embedded_batches = []  # one 2-D array per batch; stacked once after the loop
label_batches = []
rows_done = 0
start = time.time()
for i, pred in enumerate(train_dataset.batch(64)):
  if i%10==0 :
    # Progress line: batch index / batch count, seconds for the last 10 batches, rough ETA.
    lef = len(df_train)/64
    print(i,'/',int(lef),end='\t')
    ela = time.time()-start
    print(round(ela,0),'s',end='\t')
    print(round(ela*(lef-i)/10,0),'s',end='\t')
    if i>0: print("SHAPE :",(rows_done,embedded_batches[0].shape[1]),(rows_done,))
    start = time.time()
  # pred is (encoding dict, labels). The attention mask is passed along with the ids so
  # padding positions are masked out; previously only input_ids reached the model.
  with torch.no_grad():
      outputs = model(
          input_ids=torch.tensor(np.array(pred[0]['input_ids'])),
          attention_mask=torch.tensor(np.array(pred[0]['attention_mask'])),
      )
  # Element 0 of the output, indexed [:,0,:], keeps position 0 of every sequence
  # (presumably the [CLS] token — confirm).
  batch_vectors = outputs[0][:,0,:].cpu().numpy()
  embedded_batches.append(batch_vectors)
  label_batches.append(np.asarray(pred[1]))
  rows_done += batch_vectors.shape[0]

# One concatenation at the end instead of np.append per batch, which re-copied the whole
# accumulated array every iteration. Both results are NumPy arrays even for a single batch.
temp = np.concatenate(embedded_batches,axis=0)
y_temp = np.concatenate(label_batches)

In [None]:
import pickle
# Store the SqueezeBERT training embeddings and labels together as a two-element list.
squeeze_train_path = "/content/drive/MyDrive/HateSpeech/train_squeeze.pickle"
with open(squeeze_train_path, "wb") as f:
  pickle.dump([temp, y_temp], f)

In [None]:
import pickle

import torch


def _embed_first_token(dataset, n_rows, batch_size=64):
  """Run `model` over `dataset` and collect one vector per row.

  For every batch the first-token slice (`[:, 0, :]`) of the model's first
  output is kept. Results are gathered in lists and concatenated once, instead
  of growing an array with np.append on every batch.

  Args:
    dataset: tf.data.Dataset yielding (features dict, labels) pairs.
    n_rows: number of rows in `dataset`; only used for the progress total.
    batch_size: rows per forward pass.

  Returns:
    (embeddings, labels) as NumPy arrays.
  """
  n_batches = -(-n_rows // batch_size)  # ceil division
  cls_chunks, label_chunks = [], []
  start = time.time()
  for i, (features, labels) in enumerate(dataset.batch(batch_size)):
    if i % 10 == 0:
      # Progress: batch index, seconds spent on the last 10 batches, rough ETA.
      ela = time.time() - start
      print(i, '/', n_batches, end='\t')
      print(round(ela, 0), 's', end='\t')
      print(round(ela * (n_batches - i) / 10, 0), 's')
      start = time.time()
    with torch.no_grad():
      # The encodings are padded, so the attention mask goes in with the ids;
      # without it the encoder attends to the pad tokens.
      # NOTE(review): assumes `model` accepts `input_ids` / `attention_mask`
      # keyword arguments (Hugging Face style) - confirm for the loaded model.
      hidden = model(
          input_ids=torch.tensor(features['input_ids'].numpy()).long(),
          attention_mask=torch.tensor(features['attention_mask'].numpy()).long(),
      )[0]
    cls_chunks.append(hidden[:, 0, :].numpy())
    label_chunks.append(labels.numpy())
  return np.concatenate(cls_chunks, axis=0), np.concatenate(label_chunks)


# save
# Each split carries its own row count so the progress total is right for it
# (the train length was previously used for val and test as well).
for ds, split_name, n_rows in [(val_dataset, 'val', len(df_val)),
                               (test_dataset, 'test', len(df_test))]:
  temp, y_temp = _embed_first_token(ds, n_rows)
  print("SHAPE :", temp.shape, y_temp.shape)
  with open(f"/content/drive/MyDrive/HateSpeech/{split_name}_squeeze.pickle","wb") as f :
    pickle.dump([temp,y_temp],f)

### LOAD EMBEDDED DATASET

In [None]:
# load
import pickle

# Each pickle holds a two-element list: [features, labels].
# NOTE(review): the save cells of this section write `*_squeeze.pickle`, while
# the un-suffixed files are read here - confirm these are the intended files.
# pickle.load can execute arbitrary code; only load files you produced yourself.
embedded_splits = []
for pickle_path in ("/content/drive/MyDrive/HateSpeech/test.pickle",
                    "/content/drive/MyDrive/HateSpeech/val.pickle",
                    "/content/drive/MyDrive/HateSpeech/train.pickle"):
  with open(pickle_path,"rb") as f :
    embedded_splits.append(pickle.load(f))
test_hs, val_hs, train_hs = embedded_splits

## Machine Learning Fit & Evaluate

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# Fixed seed so the tree ensembles give the same scores on every run; 42 is the
# value already used for the train/validation split at the top of the notebook.
ml_seed = 42

# [estimator, display name] pairs, consumed in this order by the fit loop.
model_list = [
    [LogisticRegression(), 'LogisticRegression'],
    [RandomForestClassifier(max_depth=None, min_samples_split=4, n_estimators=100,
                            oob_score=False, n_jobs=-1, verbose=10,
                            random_state=ml_seed),
     'RandomForestClassifier'],
    [LGBMClassifier(random_state=ml_seed), 'LGBMClassifier'],
]
model_list

[[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False), 'LogisticRegression'],
 [RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=4,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=None,
                         verbose=10, warm_start=False),
  'RandomForestClassifier'],
 [LGBMClassifier(boosting_type='gbdt', class_wei

In [None]:
# Column layout of the score table: one row per classifier fitted below.
score_columns = ['Model',
                 'Train_Score(ACC)','Train_Score(ROC_AUC)','Train_Score(F1)',
                 'Val_Score(ACC)','Val_Score(ROC_AUC)','Val_Score(F1)',
                 'Test_Score(ACC)','Test_Score(ROC_AUC)','Test_Score(F1)','Inference_Time']

# Test-set scoring is off by default; set to True to score `test_hs` as well.
# While it is off, the test columns and Inference_Time hold 0 placeholders (not
# NaN): the comparison table later in the notebook calls .dropna() on
# Inference_Time and would otherwise drop these rows.
evaluate_test = False

# The embedded splits are [features, labels] pairs and do not change between
# models, so they are unpacked once instead of on every iteration.
X_train, y_train = train_hs[0], train_hs[1]
X_val, y_val = val_hs[0], val_hs[1]
if evaluate_test:
  X_test, y_test = test_hs[0], test_hs[1]

score_rows = []
for model, model_name in model_list:
  data_start = time.time()

  # Fit
  model.fit(X_train,y_train)

  # Inference
  print("TRAIN SET")
  fitted = model.predict(X_train)
  fitted_proba = model.predict_proba(X_train)

  print("VAL SET")
  val_pred = model.predict(X_val)
  val_pred_proba = model.predict_proba(X_val)

  # Evaluate (ROC AUC uses the probability of the positive class, column 1)
  train_acc = accuracy_score(y_train,fitted)
  train_auc = roc_auc_score(y_train,fitted_proba[:,1])
  train_f1 = f1_score(y_train,fitted)

  val_acc = accuracy_score(y_val,val_pred)
  val_auc = roc_auc_score(y_val,val_pred_proba[:,1])
  val_f1 = f1_score(y_val,val_pred)

  if evaluate_test:
    print("TEST SET")
    start = time.time()
    test_pred = model.predict(X_test)
    inference_time = time.time()-start  # times predict() only
    test_pred_proba = model.predict_proba(X_test)
    print(f"Inferenced : {inference_time}s",end='\t')
    test_acc = accuracy_score(y_test,test_pred)
    test_auc = roc_auc_score(y_test,test_pred_proba[:,1])
    test_f1 = f1_score(y_test,test_pred)
  else:
    test_acc = test_auc = test_f1 = inference_time = 0

  print(f"train ROCAUC : {train_auc} val ROCAUC : {val_auc} test ROCAUC : {test_auc if evaluate_test else ''} ")

  score_rows.append([model_name+"_ELECTRA",
                     train_acc, train_auc, train_f1,
                     val_acc, val_auc, val_f1,
                     test_acc, test_auc, test_f1, inference_time])

  # Save the fitted estimator; the context manager closes the file handle.
  filename = f'/content/drive/MyDrive/HateSpeech/Weight/ML/{model_name}_electra.sav'
  with open(filename, 'wb') as weight_file:
    pickle.dump(model, weight_file)
  print(f"{time.time()-data_start}")

# Build the table once from the collected rows (DataFrame.append was removed in
# pandas 2.0). The name `eval` shadows the builtin but is kept because later
# cells display and save it under this name. Rows get a 0..n-1 index.
eval = pd.DataFrame(score_rows, columns=score_columns)

TRAIN SET
VAL SET
train ROCAUC : 0.9711075531942035 val ROCAUC : 0.9624789457679603 test ROCAUC :  
4.3477537631988525
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.1s


building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.2s


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.8s


building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   28.9s


building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.5s


building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   51.7s


building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.1min


building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.6min


building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.8min


building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s


TRAIN SET


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0

VAL SET


[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0

train ROCAUC : 0.9999838168286036 val ROCAUC : 0.9604219604164584 test ROCAUC :  
123.03383088111877
TRAIN SET
VAL SET
train ROCAUC : 0.9788533738482479 val ROCAUC : 0.9625013023869374 test ROCAUC :  
14.636789083480835


In [None]:
# Display the score table built by the fit/evaluate loop (one row per model).
eval

Unnamed: 0,Model,Train_Score(ACC),Train_Score(ROC_AUC),Train_Score(F1),Val_Score(ACC),Val_Score(ROC_AUC),Val_Score(F1),Test_Score(ACC),Test_Score(ROC_AUC),Test_Score(F1),Inference_Time
0,LogisticRegression_ELECTRA,0.910819,0.971108,0.912469,0.894725,0.962479,0.896427,0.0,0.0,0.0,0.0
0,RandomForestClassifier_ELECTRA,0.998231,0.999984,0.998259,0.895326,0.960422,0.896863,0.0,0.0,0.0,0.0
0,LGBMClassifier_ELECTRA,0.917626,0.978853,0.919145,0.896328,0.962501,0.897978,0.0,0.0,0.0,0.0


In [None]:
# Write the score table next to the other per-model CSVs.
# NOTE(review): "ML_ELELECTRA" looks like a typo for "ML_ELECTRA"; the name is
# kept so an already-saved file is overwritten rather than duplicated.
ml_scores_csv = "/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_ELELECTRA.csv"
eval.to_csv(ml_scores_csv)

## Distil

# 4 Evaluation

In [None]:
import glob
# Collect every CSV in the PERFORMANCE2 folder. glob returns paths in arbitrary
# order. NOTE(review): Drive must already be mounted when this runs, but the
# mount cell appears after this one.
llist = glob.glob("/content/drive/MyDrive/HateSpeech/PERFORMANCE2/*.csv")

In [None]:
# Mount Google Drive at /content/drive (Colab only); every path in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the CSV paths that were found.
llist

['/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ELECTRA.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/DistilBERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/RoBERTa.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_BERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/GloVe_BiLSTM.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_GloVe.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_SQUEEZE.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/MobileBERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/Benchmarks.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/Benchmarks_std.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_ELELECTRA.csv']

In [None]:
import os

# Leave out the aggregate tables (Benchmarks.csv, Benchmarks_std.csv) so only
# per-model score files remain. Filtering by file name does not depend on the
# arbitrary order glob returns, and re-running the cell removes nothing more.
llist = [
    csv_path for csv_path in llist
    if not os.path.basename(csv_path).startswith("Benchmarks")
]
llist

['/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ELECTRA.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/DistilBERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/RoBERTa.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_BERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/GloVe_BiLSTM.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_GloVe.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_SQUEEZE.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/MobileBERT.csv',
 '/content/drive/MyDrive/HateSpeech/PERFORMANCE2/ML_ELELECTRA.csv']

In [None]:
import pandas as pd

# Read every score CSV and stack them with a single concat call; concatenating
# inside a loop re-copies the accumulated frame on every iteration.
temp = pd.concat([pd.read_csv(csv_path) for csv_path in llist])
# Drop the first column: the unnamed index column the CSVs were saved with.
temp = temp.iloc[:,1:]

In [None]:
# List the distinct model names present in the combined score table.
temp['Model'].unique()

array(['ELECTRA', 'DistilBERT', 'RoBERTa',
       'LogisticRegression_BERT_EMBED',
       'RandomForestClassifier_BERT_EMBED', 'LGBMClassifier_BERT_EMBED',
       'BiLSTM+GloVe(10)', 'RandomForestClassifier_GloVe_EMBED',
       'LGBMClassifier_GloVe_EMBED', 'LogisticRegression_SQUEEZE',
       'RandomForestClassifier_SQUEEZE', 'LGBMClassifier_SQUEEZE',
       'MobileBERT', 'LogisticRegression_ELECTRA',
       'RandomForestClassifier_ELECTRA', 'LGBMClassifier_ELECTRA'],
      dtype=object)

In [None]:
# Rank the models by validation ROC AUC (best first) and show the headline
# columns; rows with a missing value in any of them are dropped.
summary_columns = ['Model','Val_Score(ROC_AUC)','Val_Score(F1)','Inference_Time']
ranked_scores = temp.sort_values('Val_Score(ROC_AUC)',ascending=False)
ranked_scores[summary_columns].dropna()

Unnamed: 0,Model,Val_Score(ROC_AUC),Val_Score(F1),Inference_Time
2,LGBMClassifier_ELECTRA,0.962501,0.897978,0.0
0,LogisticRegression_ELECTRA,0.962479,0.896427,0.0
0,ELECTRA,0.960681,0.896048,30.033314
1,RandomForestClassifier_ELECTRA,0.960232,0.895416,0.0
0,DistilBERT,0.952547,0.879108,28.073447
0,BiLSTM+GloVe(10),0.936699,0.864514,8.134799
0,LogisticRegression_BERT_EMBED,0.920272,0.844208,0.048613
0,LogisticRegression_SQUEEZE,0.920272,0.844208,0.043455
2,LGBMClassifier_BERT_EMBED,0.91523,0.836876,0.016189
2,LGBMClassifier_SQUEEZE,0.91523,0.836876,0.014264
