In [None]:
! pip install fasttext



In [None]:
import tensorflow as tf
import csv
import random
import numpy as np
import pandas as pd
import io
import gzip

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

import fasttext
import fasttext.util

import sys

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import time

In [None]:
# Raise NumPy's summarisation threshold to the largest int so printed arrays
# are never abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz

--2021-06-10 19:16:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 481477801 (459M) [binary/octet-stream]
Saving to: ‘cc.si.300.vec.gz.1’


2021-06-10 19:17:22 (19.0 MB/s) - ‘cc.si.300.vec.gz.1’ saved [481477801/481477801]

--2021-06-10 19:17:22--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3181346570 (3.0G) [application/octet-stream]
Saving to: ‘cc.si.300.bin.gz’


2021-06-10 19:19:58 (19.5 MB/s) - ‘cc.si.300.bin.gz’ saved [

In [None]:
!gzip -d cc.si.300.bin.gz

gzip: cc.si.300.bin already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [None]:
# Load the unpacked fastText binary model.
ft = fasttext.load_model('/content/cc.si.300.bin')
# Show the dimensionality of the model's word vectors.
ft.get_dimension()



300

In [None]:
# Read the labelled posts into `dataset` as [post_text, label] pairs.
# Column 1 of each CSV row is the post text, column 2 its label ('0' or '1').
dataset = []
# The encoding is stated explicitly so decoding does not depend on the platform
# default (assumes the file is UTF-8 -- confirm); newline="" is what the csv
# module asks for so that quoted fields containing line breaks parse correctly.
with open("/content/drive/MyDrive/sinhala-hate-speech-dataset.csv",
          encoding="utf-8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        if not row:
            # Completely blank line: nothing to report, nothing to load.
            continue
        if len(row) < 3:
            # Too short to carry a label; report it instead of raising IndexError.
            print('ERROR')
            continue
        post, label = row[1], row[2]
        if label not in ('0', '1'):
            # Anything that is not a 0/1 label (presumably the header row) is skipped.
            print('ERROR')
            continue
        dataset.append([post, int(label)])

ERROR


In [None]:
# Number of [post, label] pairs that were loaded.
len(dataset)

6345

In [None]:
# Split the [post, label] pairs into two parallel lists.
posts = [entry[0] for entry in dataset]
labels = [entry[1] for entry in dataset]

In [None]:
# Tokenisation / padding settings.
vocab_size = 10000   # passed to the Tokenizer as num_words
embedding_dim = 300  # width of one word vector
max_length = 120     # every post is padded / truncated to this many tokens
trunc_type = 'post'
oov_tok = "<OOV>"

# Build the word index from the full corpus.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(posts)
word_index = tokenizer.word_index

# From here on vocab_size is the number of distinct tokens in the word index,
# no longer the num_words cap set above.
vocab_size = len(word_index)

# Integer-encode each post, then bring all sequences to the same length.
sequences = tokenizer.texts_to_sequences(posts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
)

In [None]:
# Row k holds the fastText vector of the word whose tokenizer index is k.
# Rows that the loop never assigns stay all-zero.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for token, row_index in word_index.items():
    vector = ft.get_word_vector(token)
    if vector is None:
        continue
    embeddings_matrix[row_index] = vector

In [None]:
# Look up all token indices in a single vectorised step, giving an array of
# shape (n_posts, tokens_per_post, embedding_width), then flatten each post's
# vectors into one feature row. This yields the same matrix as flattening post
# by post, without first collecting thousands of per-post arrays in a Python
# list and copying them a second time.
x_vecs = embeddings_matrix[padded_sequences].reshape(len(padded_sequences), -1)

In [None]:
# Sanity check: (number of posts, flattened feature width).
print(x_vecs.shape)

(6345, 36000)


In [None]:
# Reduce the flattened embeddings to 1800 principal components and report the
# share of the total variance those components retain.
# NOTE(review): the PCA is fitted on every row, i.e. before the train/test
# split further down, so held-out rows influence the projection -- confirm
# this is intended.
pca_model = PCA(n_components=1800).fit(x_vecs)  # fit() returns the fitted model
print("Sum of variance ratios: ", sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9327295275202638


In [None]:
# Project every post's feature row onto the fitted principal components.
x_comps = pca_model.transform(x_vecs)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_comps,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a support vector classifier with default settings and time the fit.
start = time.time()
svm_classifier = SVC().fit(x_train, y_train)  # fit() returns the fitted estimator
end = time.time()

process = round(end - start, 2)
print(f"Support Vector Machine Classifier has fitted, this process took {process} seconds")

Support Vector Machine Classifier has fitted, this process took 58.16 seconds


In [None]:
# Mean accuracy of the SVM on the held-out test split.
svm_classifier.score(x_test,y_test)

0.7919621749408984

In [None]:
# Fit a Gaussian naive Bayes classifier and time the fit.
start = time.time()
naive_bayes_classifier = GaussianNB().fit(x_train, y_train)  # fit() returns the fitted estimator
end = time.time()

process = round(end - start, 2)
print(f"Naive Bayes Classifier has fitted, this process took {process} seconds")

Naive Bayes Classifier has fitted, this process took 0.15 seconds


In [None]:
# Mean accuracy of the naive Bayes classifier on the held-out test split.
naive_bayes_classifier.score(x_test,y_test)

0.5697399527186762

In [None]:
# Fit a 5-nearest-neighbours classifier and time the fit.
start = time.time()
k_neighbors_classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
end = time.time()

process = round(end - start, 2)
print(f"K Neighbors Classifier has fitted, this process took {process} seconds")

K Neighbors Classifier has fitted, this process took 1.17 seconds


In [None]:
# Mean accuracy of the k-nearest-neighbours classifier on the held-out test split.
k_neighbors_classifier.score(x_test,y_test)

0.7139479905437353

In [None]:
# Fit a logistic regression classifier with default settings and time the fit.
start = time.time()
logistic_regression_classifier = LogisticRegression().fit(x_train, y_train)
end = time.time()

process = round(end - start, 2)
print(f"Logistic Regression Classifier has fitted, this process took {process} seconds")

Logistic Regression Classifier has fitted, this process took 1.01 seconds


In [None]:
# Mean accuracy of the logistic regression classifier on the held-out test split.
logistic_regression_classifier.score(x_test,y_test)

0.817966903073286

In [None]:
# Fit a decision tree and time the fit.
start = time.time()

# random_state pins the tree's internal randomness (features are permuted at
# each split), so the fitted tree and its score are the same on every run;
# 42 matches the seed already used for the train/test split.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(x_train, y_train)

end = time.time()
process = round(end - start, 2)
print("Decision Tree Classifier has fitted, this process took {} seconds".format(process))

Decision Tree Classifier has fitted, this process took 11.81 seconds


In [None]:
# Mean accuracy of the decision tree on the held-out test split.
decision_tree_classifier.score(x_test,y_test)

0.6635145784081954

In [None]:
# Try a fitted classifier on sample text.
def hate_speech(post, classifier, embeddings_matrix, pca_model):
  """Classify one or more posts and print one verdict line per post.

  Uses the notebook-level ``tokenizer``, ``max_length`` and ``trunc_type``
  for tokenising and padding.

  Args:
    post: A single string, or a sequence of strings (one entry per post).
    classifier: Fitted estimator; ``predict`` is expected to yield 0 or 1
      for each feature row.
    embeddings_matrix: 2-D array whose row k is the vector of the word with
      tokenizer index k.
    pca_model: Fitted transformer applied to the flattened embeddings before
      classification.

  Returns:
    None. The verdicts are printed. A lone post is echoed exactly as it was
    passed in, so a one-element list still prints as a list.
  """
  # A bare string would otherwise be tokenised character by character.
  texts = [post] if isinstance(post, str) else list(post)
  if not texts:
    return

  post_sequences = tokenizer.texts_to_sequences(texts)
  padded_post_sequences = pad_sequences(post_sequences,
                                        maxlen=max_length,
                                        truncating=trunc_type)
  # One flattened feature row per post; flattening everything into a single
  # row only has the right width when exactly one post is given.
  features = embeddings_matrix[padded_post_sequences].reshape(len(texts), -1)
  components = pca_model.transform(features)
  predictions = np.asarray(classifier.predict(components)).reshape(-1)

  for text, prediction in zip(texts, predictions):
    shown = post if len(texts) == 1 else text
    label = int(round(float(prediction)))
    # (shown,) keeps %-formatting safe when the caller passed a tuple.
    if label == 0:
      print("%s : This is NOT Hate speech" % (shown,))
    elif label == 1:
      print("%s : This is Hate speech" % (shown,))
    else:
      # Make an out-of-range prediction visible instead of printing nothing.
      print("%s : Unexpected prediction %r" % (shown, prediction))

In [None]:
# Classify one sample post with the fitted logistic regression model.
hate_speech(['පලයන් තම්බියා'], logistic_regression_classifier, embeddings_matrix, pca_model)

['පලයන් තම්බියා'] : This is Hate speech
