# ACCURACY TESTS

Once the data has been split and cleaned, the accuracy tests can be run. This notebook contains the code for running each kind of test, organised by model type (Naive Bayes, Support Vector Machines, Logistic Regression). The notebook itself shows no results per se; after running the code, .txt files containing the accuracy values can be found in an organised folder structure in Drive.

*Note: When saving or loading data from Drive, the paths are specific to my personal Drive and need to be adapted before running the notebook elsewhere.*

In [1]:
#Imports
!pip install emoji
import emoji
from sklearn.model_selection import train_test_split 
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, plot_confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 5.0 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=85f9f257c9412152cd464a97a41aef3406821c3326a775596b1cca780c338cc3
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.0.0


In [2]:
#Mount Google Drive (Colab only) so that the /content/drive/MyDrive paths used for data and results are reachable
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# NAIVE BAYES


In [3]:
def naive_bayes(i, balance, lang, emo_hash, test, nbtype, vectorizer):
  """Train and score a Naive Bayes classifier over repeated random splits.

  Every iteration reshuffles the data, optionally undersamples the larger
  class, draws a new train/test split, fits the vectorizer and the model, and
  appends the test accuracy (in percent) as one line to a .txt file in Drive.

  Args:
    i: number of iterations; any value below 1 is treated as 1.
    balance: if truthy, undersample the larger class down to the smaller one.
    lang: language tag used in the data, stopword and result file names.
    emo_hash: dataset variant tag used in the data and result file names.
    test: fraction of rows held out for testing (passed as test_size).
    nbtype: "Bernoulli" selects BernoulliNB, anything else MultinomialNB.
    vectorizer: "CountVectorizer" selects CountVectorizer, anything else
      TfidfVectorizer.

  Returns:
    (model, vect): the fitted model and vectorizer of the last iteration.
  """
  #always run at least once so that model and vect exist when we return
  if i < 1:
    i = 1
  #we save our txt files in a general path, I later save them into their specific folders
  if balance:
    balance_tag = "_BALANCED.txt"
  else:
    balance_tag = "_NOT_BALANCED.txt"
  path = "/content/drive/MyDrive/TFG/data/accuracy_data/NB_" + lang + "_" + emo_hash + "_" + str(100-(test*100)) + str(test*100) +"TrainTest_" + nbtype + "_" + vectorizer + balance_tag
  #Data loading (the files do not change between iterations, so they are read only once)
  data = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/' + emo_hash + '_' + lang + '_data.csv', encoding='utf8', engine='python')
  #Final row cleansing
  data = data[(data['hate speech'] == 0) | (data['hate speech'] == 1)]
  data = data.dropna()
  with open("/content/drive/MyDrive/TFG/data/stopwords/" + lang + "_stopwords.json", "r") as f:
    stopwords = list(json.load(f))
  for _ in range(i):
    df = shuffle(data)
    #Balancing data
    if balance:
      pos_rows = len(df[df["hate speech"] == True].index)
      neg_rows = df.shape[0] - pos_rows
      #undersample whichever class is larger so both end up the same size;
      #already balanced data is left untouched (this also avoids dividing by zero)
      if pos_rows != neg_rows:
        majority_label = 0 if neg_rows > pos_rows else 1
        fraction_to_delete = 1 - (min(pos_rows, neg_rows) / max(pos_rows, neg_rows))
        df = df.drop(df[df['hate speech'] == majority_label].sample(frac=fraction_to_delete).index)
    #Train and test split
    X, y = df.text.fillna(' '), df["hate speech"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test)
    #Vectorizing
    if vectorizer == "CountVectorizer":
      vect = CountVectorizer(stop_words = stopwords, binary = True)
    else:
      vect = TfidfVectorizer(stop_words = stopwords, binary = True) # tfidf here
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    #print(len(vect.get_feature_names_out())) #code to calculate average vocabulary of train data
    #Model building
    if nbtype == "Bernoulli":
      model = BernoulliNB()
    else:
      model = MultinomialNB()
    model.fit(X_train_vect, y_train)
    #Result printing
    y_pred = model.predict(X_test_vect)
    accuracy = accuracy_score(y_test, y_pred) * 100
    #append mode keeps the results of earlier runs; "with" closes the file even if writing fails
    with open(path, "a") as acc_file:
      acc_file.write(str(accuracy) + "\n")
    print("Accuracy score for Naive Bayes is: ", accuracy, '%')
  return model, vect

# SUPPORT VECTOR MACHINES
SVM classifiers are often a good choice for text classification tasks, especially when little training data is available (around a couple of thousand tagged samples).

In [4]:
def support_vector_machine(i, balance, lang, emo_hash, test, kernel, c, vectorizer):
  """Train and score an SVC classifier over repeated random splits.

  Every iteration reshuffles the data, optionally undersamples the larger
  class, draws a new train/test split, fits the vectorizer and the model, and
  appends the test accuracy (in percent) as one line to a .txt file in Drive.

  Args:
    i: number of iterations; any value below 1 is treated as 1.
    balance: if truthy, undersample the larger class down to the smaller one.
    lang: language tag used in the data, stopword and result file names.
    emo_hash: dataset variant tag used in the data and result file names.
    test: fraction of rows held out for testing (passed as test_size).
    kernel: kernel name handed to SVC.
    c: regularisation value handed to SVC as C.
    vectorizer: "CountVectorizer" selects CountVectorizer, anything else
      TfidfVectorizer.

  Returns:
    (model, vect): the fitted model and vectorizer of the last iteration.
  """
  #always run at least once so that model and vect exist when we return
  if i < 1:
    i = 1
  #we save our txt files in a general path, I later save them into their specific folders
  if balance:
    balance_tag = "_BALANCED.txt"
  else:
    balance_tag = "_NOT_BALANCED.txt"
  path = "/content/drive/MyDrive/TFG/data/accuracy_data/SVM_" + lang + "_" + emo_hash + "_" + str(100-(test*100)) + str(test*100) +"TrainTest_" + kernel + "Kernel_" + str(c) + "C_" + vectorizer + balance_tag
  #Data loading (the files do not change between iterations, so they are read only once)
  data = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/' + emo_hash + '_' + lang + '_data.csv', encoding='utf8', engine='python')
  #Final row cleansing
  data = data[(data['hate speech'] == 0) | (data['hate speech'] == 1)]
  data = data.dropna()
  with open("/content/drive/MyDrive/TFG/data/stopwords/" + lang + "_stopwords.json", "r") as f:
    stopwords = list(json.load(f))
  for _ in range(i):
    df = shuffle(data)
    #Balancing data
    if balance:
      pos_rows = len(df[df["hate speech"] == True].index)
      neg_rows = df.shape[0] - pos_rows
      #undersample whichever class is larger so both end up the same size;
      #already balanced data is left untouched (this also avoids dividing by zero)
      if pos_rows != neg_rows:
        majority_label = 0 if neg_rows > pos_rows else 1
        fraction_to_delete = 1 - (min(pos_rows, neg_rows) / max(pos_rows, neg_rows))
        df = df.drop(df[df['hate speech'] == majority_label].sample(frac=fraction_to_delete).index)
    #Train and test split
    X, y = df.text.fillna(' '), df["hate speech"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test)
    #Vectorizing
    if vectorizer == "CountVectorizer":
      vect = CountVectorizer(stop_words = stopwords, binary = True)
    else:
      vect = TfidfVectorizer(stop_words = stopwords, binary = True) # tfidf here
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    #Model building
    model = SVC(kernel=kernel, C=c)
    model.fit(X_train_vect, y_train)
    #Result printing
    y_pred = model.predict(X_test_vect)
    accuracy = accuracy_score(y_test, y_pred) * 100
    #append mode keeps the results of earlier runs; "with" closes the file even if writing fails
    with open(path, "a") as acc_file:
      acc_file.write(str(accuracy) + "\n")
    print("Accuracy score for SVC is: ", accuracy, '%')
  return model, vect

# LOGISTIC REGRESSION


In [5]:
def logistic_regression(i, balance, lang, emo_hash, test, solver, c, vectorizer):
  """Train and score a Logistic Regression classifier over repeated random splits.

  Every iteration reshuffles the data, optionally undersamples the larger
  class, draws a new train/test split, fits the vectorizer and the model, and
  appends the test accuracy (in percent) as one line to a .txt file in Drive.

  Args:
    i: number of iterations; any value below 1 is treated as 1.
    balance: if truthy, undersample the larger class down to the smaller one.
    lang: language tag used in the data, stopword and result file names.
    emo_hash: dataset variant tag used in the data and result file names.
    test: fraction of rows held out for testing (passed as test_size).
    solver: solver name handed to LogisticRegression.
    c: regularisation value handed to LogisticRegression as C.
    vectorizer: "CountVectorizer" selects CountVectorizer, anything else
      TfidfVectorizer.

  Returns:
    (model, vect): the fitted model and vectorizer of the last iteration.
  """
  #always run at least once so that model and vect exist when we return
  if i < 1:
    i = 1
  #we save our txt files in a general path, I later save them into their specific folders
  if balance:
    balance_tag = "_BALANCED.txt"
  else:
    balance_tag = "_NOT_BALANCED.txt"
  path = "/content/drive/MyDrive/TFG/data/accuracy_data/LR_" + lang + "_" + emo_hash + "_" + str(100-(test*100)) + str(test*100) +"TrainTest_" + solver + "Solver_" + str(c) + "C_" + vectorizer + balance_tag
  #Data loading (the files do not change between iterations, so they are read only once)
  data = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/' + emo_hash + '_' + lang + '_data.csv', encoding='utf8', engine='python')
  #Final row cleansing
  data = data[(data['hate speech'] == 0) | (data['hate speech'] == 1)]
  data = data.dropna()
  with open("/content/drive/MyDrive/TFG/data/stopwords/" + lang + "_stopwords.json", "r") as f:
    stopwords = list(json.load(f))
  for _ in range(i):
    df = shuffle(data)
    #Balancing data
    if balance:
      pos_rows = len(df[df["hate speech"] == True].index)
      neg_rows = df.shape[0] - pos_rows
      #undersample whichever class is larger so both end up the same size;
      #already balanced data is left untouched (this also avoids dividing by zero)
      if pos_rows != neg_rows:
        majority_label = 0 if neg_rows > pos_rows else 1
        fraction_to_delete = 1 - (min(pos_rows, neg_rows) / max(pos_rows, neg_rows))
        df = df.drop(df[df['hate speech'] == majority_label].sample(frac=fraction_to_delete).index)
    #Train and test split
    X, y = df.text.fillna(' '), df["hate speech"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test)
    #Vectorizing
    if vectorizer == "CountVectorizer":
      vect = CountVectorizer(stop_words = stopwords, binary = True)
    else:
      vect = TfidfVectorizer(stop_words = stopwords, binary = True) # tfidf here
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    #Model building
    model = LogisticRegression(solver=solver, C=c)
    model.fit(X_train_vect, y_train)
    #Result printing
    y_pred = model.predict(X_test_vect)
    accuracy = accuracy_score(y_test, y_pred) * 100
    #"w" to write, "a" to append; "with" closes the file even if writing fails
    with open(path, "a") as acc_file:
      acc_file.write(str(accuracy) + "\n")
    print("Accuracy score for Logistic Regression is: ", accuracy, '%')
  return model, vect

------------------------------------------------------------------------
# Accuracy tests

In [6]:
#@title 1. Choose a model and language

#Form inputs (the #@param annotations drive the Colab form widgets)
Language = 'English'  #@param ["Spanish", "Italian", "Portuguese", "English"]
Emojis_Hashtags = 'All emojis and hashtags'  #@param ["All emojis and hashtags", "No emojis or hashtags"]
Train_Test_Split = '70/30'  #@param ['60/40', '70/30', '80/20']
Model = 'Multinomial NB'  #@param ["Bernoulli NB", "Multinomial NB", "SVM linear kernel small C", "SVM linear kernel standard C", "SVM linear kernel large C", "SVM RBF kernel small C", "SVM RBF kernel standard C", "SVM RBF kernel large C", "LR liblinear solver small C", "LR liblinear solver standard C","LR liblinear solver large C", "LR lbfgs solver small C", "LR lbfgs solver standard C", "LR lbfgs solver large C"]
TF_IDF = True #@param {type:"boolean"}
Test_Iterations = 1 #@param {type:"slider", min:1, max:50, step:1}
Balance_data = True #@param {type:"boolean"}

#Lookup tables from the form labels to the values the test functions expect
map_lang_data = {'Spanish': 'spanish', 'Italian': 'italian', 'Portuguese': 'portuguese', 'English': 'english'}
map_emojhash_data = {'All emojis and hashtags': 'mantained', 'No emojis or hashtags': 'removed'}
map_test_split = {'60/40': 0.4, '70/30': 0.3, '80/20': 0.2}
map_vectorizer = {True: "TfidfVectorizer", False: "CountVectorizer"}

lang = map_lang_data[Language]
emo_hash = map_emojhash_data[Emojis_Hashtags]
test = map_test_split[Train_Test_Split]
vectorizer = map_vectorizer[TF_IDF]
vectorizer_word = "with" if TF_IDF else "without"

#Banner describing the chosen configuration
banner = "".join([Language, " data with ", Emojis_Hashtags, " using model ", Model, ", ", vectorizer_word, " TF-IDF and train/test split of ", Train_Test_Split])
print(banner.upper())
print("-------------------------------------------------------------------------------------------------------------------------------")

#Each form label maps to the test function to call and the model-specific
#arguments that go between the test fraction and the vectorizer name
model_runs = {
    "SVM linear kernel small C": (support_vector_machine, ("linear", 0.1)),
    "SVM linear kernel standard C": (support_vector_machine, ("linear", 1)),
    "SVM linear kernel large C": (support_vector_machine, ("linear", 10)),
    "SVM RBF kernel small C": (support_vector_machine, ("rbf", 0.1)),
    "SVM RBF kernel standard C": (support_vector_machine, ("rbf", 1)),
    "SVM RBF kernel large C": (support_vector_machine, ("rbf", 10)),
    "Bernoulli NB": (naive_bayes, ("Bernoulli",)),
    "Multinomial NB": (naive_bayes, ("Multinomial",)),
    "LR liblinear solver small C": (logistic_regression, ("liblinear", 0.1)),
    "LR liblinear solver standard C": (logistic_regression, ("liblinear", 1)),
    "LR liblinear solver large C": (logistic_regression, ("liblinear", 10)),
    "LR lbfgs solver small C": (logistic_regression, ("lbfgs", 0.1)),
    "LR lbfgs solver standard C": (logistic_regression, ("lbfgs", 1)),
    "LR lbfgs solver large C": (logistic_regression, ("lbfgs", 10)),
}

#An unrecognised label runs nothing, exactly like falling through an if/elif chain
if Model in model_runs:
  runner, hyperparams = model_runs[Model]
  chosen_model, vect = runner(Test_Iterations, Balance_data, lang, emo_hash, test, *hyperparams, vectorizer)

ENGLISH DATA WITH ALL EMOJIS AND HASHTAGS USING MODEL MULTINOMIAL NB, WITH TF-IDF AND TRAIN/TEST SPLIT OF 70/30
-------------------------------------------------------------------------------------------------------------------------------
