# Baseline for HASOC
Code Reference: https://github.com/mdabashar/QutNocturnal-Hasoc2019/blob/master/CNN%20-%20Hate%20Speech%20and%20Offensive%20Content%20Identification%20In%20Hindi.ipynb

# Training the CNN model on collated data
- Checking accuracy on collated test data
- Checking accuracy on original HASOC test dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install simplemma

Collecting simplemma
  Downloading simplemma-0.9.1-py3-none-any.whl (75.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simplemma
Successfully installed simplemma-0.9.1


# Import common libraries

In [None]:

import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
import spacy
import re
import html
import simplemma

In [None]:
df=pd.DataFrame(pd.read_csv('hindi/hate-classifier-hindi.csv'))
train = df[df["uid"].str.contains("train")]
val = df[df["uid"].str.contains("val")]
test = df[df["uid"].str.contains("test")]

In [None]:
train.shape

(29317, 3)

In [None]:
val.shape

(7348, 3)

In [None]:
test.shape

(7992, 3)

In [None]:
train = pd.concat([train, val], axis= 1)
train.shape

(36665, 6)

In [None]:
train.to_csv('hindi_train.csv')
test.to_csv('hindi_test.csv')

# Initialise Random variables and Tensor Board

In [None]:
#SEED = 100
SEED = 123

#reference: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(SEED)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(SEED)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.random.set_seed(SEED)
sess = tf.compat.v1.Session(config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# Rest of code follows ...

# Preprocessing

In [None]:
re1 = re.compile(r' +')

def textFixup(aText):
    aText = aText.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ').replace('â€™', "'")
    return re1.sub(' ', html.unescape(aText))

In [None]:
!python -m spacy download xx_ent_wiki_sm

2023-11-16 09:01:40.613817: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 09:01:40.613877: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 09:01:40.613915: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting xx-ent-wiki-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.6.0/xx_ent_wiki_sm-3.6.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xx-ent-wiki-sm
Successfully instal

In [None]:
#! /usr/bin/env python3.1
''' Lightweight Hindi stemmer
Copyright © 2010 Luís Gomes <luismsgomes@gmail.com>.

Implementation of algorithm described in

    A Lightweight Stemmer for Hindi
    Ananthakrishnan Ramanathan and Durgesh D Rao
    http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf

    @conference{ramanathan2003lightweight,
      title={{A lightweight stemmer for Hindi}},
      author={Ramanathan, A. and Rao, D.},
      booktitle={Workshop on Computational Linguistics for South-Asian Languages, EACL},
      year={2003}
    }

Ported from HindiStemmer.java, part of of Lucene.
'''

suffixes = {
    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}

def hi_stem(word):
    for L in 5, 4, 3, 2, 1:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                if word.endswith(suf):
                    return word[:-L]
    return word



In [None]:
hi_nlp = spacy.load("xx_ent_wiki_sm")

In [None]:
def preprocess_aTweet(tweet):
    tweet = tweet.lower()
    tweet = textFixup(tweet)


    tokens = [simplemma.lemmatize(str(token), lang='hi') for token in hi_nlp(tweet)]

    tokens = [hi_stem(t) for t in tokens]

    return ' '.join(tokens)

# Loading Data

In [None]:
def load_data_and_labels_csv(fileLoc):
    examples = []
    labels = []
    df = pd.read_csv(fileLoc)
    for i in df.index:
        examples.append(preprocess_aTweet(df['text'].astype(str)[i]))
        if int(df['label_yn'].fillna(0)[i])==1:
            labels.append(1)
        elif int(df['label_yn'].fillna(0)[i])==0:
            labels.append(0)
    return examples, labels

X_train, y_train = load_data_and_labels_csv('/content/drive/MyDrive/IRE/hindi_train.csv')

X_test, y_test = load_data_and_labels_csv('/content/drive/MyDrive/IRE/hindi_test.csv')


ytrain = np.array(y_train)
ytest = np.array(y_test)

# Transforming data suitable for model format

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(map(lambda x: len(x),xtrain))
xtrain = pad_sequences(xtrain, maxlen=maxlen)

xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

# Loading word embedding and mapping data to that word embedding

In [None]:
from gensim.models import KeyedVectors
model_ug_cbow = KeyedVectors.load('/content/drive/MyDrive/IRE/hi/vectors.txt')

embeddings_index = {}
for w in model_ug_cbow.wv.index_to_key:
    embeddings_index[w] = model_ug_cbow.wv[w]

embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Creating CNN model and training it for 10 epoc

In [None]:
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Input, concatenate, Activation
from keras.models import Model

def create_cnn_model():
    tweet_input = Input(shape=(maxlen,), dtype='int32')

    tweet_encoder = Embedding(num_words, 200, weights=[embedding_matrix], input_length=maxlen, trainable=True)(tweet_input)

    tweet_encoder = Dropout(0.5)(tweet_encoder)
    bigram_branch = Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
    bigram_branch = GlobalMaxPooling1D(data_format='channels_first')(bigram_branch)
    bigram_branch = Dropout(0.5)(bigram_branch)

    trigram_branch = Conv1D(filters=256, kernel_size=4, padding='valid', activation='relu', strides=1,)(tweet_encoder)
    trigram_branch = GlobalMaxPooling1D(data_format='channels_first')(trigram_branch)
    trigram_branch = Dropout(0.2)(trigram_branch)

    fourgram_branch = Conv1D(filters=512, kernel_size=5, padding='valid', activation='relu', strides=1,)(tweet_encoder)
    fourgram_branch = GlobalMaxPooling1D(data_format='channels_first')(fourgram_branch)
    fourgram_branch = Dropout(0.2)(fourgram_branch)

    merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.5)(merged)

    merged = Dense(1)(merged)
    output = Activation('sigmoid')(merged)

    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

cnn_model = create_cnn_model()
cnn_model.fit(xtrain, ytrain, epochs=10, batch_size=32, verbose=3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78b3ac1a64d0>

In [None]:
from keras.models import load_model
cnn_model.save("CNN_HASOC.h5")


# Evaluating the model with test dataset

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report

p = cnn_model.predict(xtest,verbose=1)


predicted = [int(round(x[0])) for x in p]
predicted = np.array(predicted)
actual = ytest

tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)



accuracy = (tp + tn) / (tp + fp + fn + tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
fmeasure = (2 * precision * recall) / (precision + recall)
cohen_kappa_score = cohen_kappa_score(predicted, actual)
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

Accuracy	0.8483483483483484
Precision	0.8374968458238709
Recall	0.8538718806277334
f-measure	0.8456050955414013
cohen_kappa_score	0.6966283583511558
auc	0.8484950146135013
roc_auc	0.8484950146135013


## Testing on HASOC test dataset

In [None]:

X_test, y_test = load_data_and_labels_csv('/content/drive/MyDrive/IRE/hindi/hasoc_hi_t1_test.csv')
ytest = np.array(y_test)
xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report

p = cnn_model.predict(xtest,verbose=1)


predicted = [int(round(x[0])) for x in p]
predicted = np.array(predicted)
actual = ytest

tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)



accuracy = (tp + tn) / (tp + fp + fn + tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
fmeasure = (2 * precision * recall) / (precision + recall)
cohen_kappa_score = cohen_kappa_score(predicted, actual)
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

Accuracy	0.7185128983308042
Precision	0.6351039260969977
Recall	0.9090909090909091
f-measure	0.7477906186267844
cohen_kappa_score	0.4511526313308295
auc	0.7329465765650899
roc_auc	0.7329465765650899


## Testing on MACD test dataset

In [None]:

X_test, y_test = load_data_and_labels_csv('/content/drive/MyDrive/IRE/hindi/macd_hi_test.csv')
ytest = np.array(y_test)
xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report

p = cnn_model.predict(xtest,verbose=1)


predicted = [int(round(x[0])) for x in p]
predicted = np.array(predicted)
actual = ytest

tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)



accuracy = (tp + tn) / (tp + fp + fn + tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
fmeasure = (2 * precision * recall) / (precision + recall)
cohen_kappa_score = cohen_kappa_score(predicted, actual)
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

Accuracy	0.8307074910820452
Precision	0.8263638881196345
Recall	0.8535469107551488
f-measure	0.8397354720697904
cohen_kappa_score	0.6604366649379487
auc	0.8297746930013368
roc_auc	0.8297746930013368


In [None]:
loaded_model = load_model("network.h5")
loss, accuracy = loaded_model.evaluate(test_data, test_targets)

In [None]:
model_name = 'CNN'

In [None]:
import datetime
now = datetime.datetime.now()

out_string = '=========='+str(now)+'==============\n'
#out_string += 'Language:\t'+ lang+'\n'
out_string += 'Dataset:\t' + dataset_name + '\n'
out_string += 'Task:\t' + task + '\n'
out_string += str('Model Name:\t' + model_name+'\n')
out_string += '-------------------------------------------------' + '\n'

out_string += 'Total Samples:\t' + str(len(actual)) + '\n'
out_string += 'Positive Samples:\t' + str(sum(actual)) + '\n'
out_string += 'Negative Samples:\t' + str(len(actual)-sum(actual)) + '\n'

out_string += 'True Positive:\t' + str(tp) + '\n'
out_string += 'True Negative:\t' + str(tn) + '\n'
out_string += 'False Positive:\t' + str(fp) + '\n'
out_string += 'False Negative:\t' + str(fn) + '\n'

out_string += 'Accuracy:\t' + str(accuracy) + '\n'
out_string += 'Precision:\t' + str(precision) + '\n'
out_string += 'Recall:\t' + str(recall) + '\n'
out_string += 'F-measure:\t' + str(fmeasure) + '\n'
out_string += 'Cohen_Kappa_Score:\t' + str(cohen_kappa_score) + '\n'
out_string += 'AUC:\t' + str(auc_val) + '\n'
out_string += 'ROC_AUC:\t' + str(roc_auc_val) + '\n'
out_string += '\n'
out_string += classification_report(actual, predicted)
out_string += '\n'
print(out_string)
with open(model_name+'.txt', 'a+') as FO:
    FO.write(out_string)