In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.util import pr
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_1 = pd.read_csv("/content/drive/MyDrive/cyberbullying_tweets.csv") # https://www.kaggle.com/datasets/shauryapanpalia/cyberbullying-classification
data_2 = pd.read_csv("/content/drive/MyDrive/Hate Speech and Offensive Language Detection on Twitter.csv") # https://www.kaggle.com/datasets/thedevastator/hate-speech-and-offensive-language-detection
data_3 = pd.read_csv("/content/drive/MyDrive/Suspicious Communication on Social Platforms.csv") # https://www.kaggle.com/datasets/syedabbasraza/suspicious-communication-on-social-platforms

Label description:
0 -> safe
1 -> offensive
2 -> hate


In [None]:
data_1 = data_1.rename(columns = {"headline": "message"})
data_1["label"] = data_1["label"].map({-1: 1, 0: 0})
data_1.head()

Unnamed: 0,message,label
0,cock suck before you piss around on my work,1
1,you are gay or antisemmitian archangel white ...,1
2,fuck your filthy mother in the ass dry,1
3,get fuck ed up get fuck ed up got a drink t...,1
4,stupid peace of shit stop deleting my stuff ...,1


In [None]:
data_2.loc[(data_2["hate_speech_count"] > 0) | (data_2["offensive_language_count"] > 0), "label"] = 1
data_2.loc[(data_2["hate_speech_count"] == 0) & (data_2["offensive_language_count"] == 0), "label"] = 0
data_2 = data_2.rename(columns = {"tweet": "message"})
data_2.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,message,label
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0.0
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1.0
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1.0
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0


In [None]:
data_3 = data_3.rename(columns = {"comments": "message", "tagging": "label"})
data_3.head()

Unnamed: 0,message,label
0,Get fucking real dude.,1
1,She is as dirty as they come and that crook ...,1
2,why did you fuck it up. I could do it all day...,1
3,Dude they dont finish enclosing the fucking s...,1
4,WTF are you talking about Men? No men thats n...,1


In [None]:
data_4 = pd.concat([data_1, data_2, data_3])

In [None]:
data_1.describe()

Unnamed: 0,label
count,18148.0
mean,0.64255
std,0.479262
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [None]:
data_2.describe()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,label
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,3.243473,0.280515,2.413711,0.549247,1.110277,0.884114
std,0.88306,0.631851,1.399459,1.113299,0.462089,0.320094
min,3.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,2.0,0.0,1.0,1.0
50%,3.0,0.0,3.0,0.0,1.0,1.0
75%,3.0,0.0,3.0,0.0,1.0,1.0
max,9.0,7.0,9.0,9.0,2.0,1.0


In [None]:
data_3.describe()

Unnamed: 0,label
count,20001.0
mean,0.39108
std,0.488005
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
data_4.describe()

Unnamed: 0,label,count,hate_speech_count,offensive_language_count,neither_count,class
count,62932.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,0.657758,3.243473,0.280515,2.413711,0.549247,1.110277
std,0.474464,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,0.0,3.0,0.0,2.0,0.0,1.0
50%,1.0,3.0,0.0,3.0,0.0,1.0
75%,1.0,3.0,0.0,3.0,0.0,1.0
max,1.0,9.0,7.0,9.0,9.0,2.0


In [None]:
def clean(text):
  text = str(text).lower()
  text = re.sub("\[.*?\]", "", text)
  text = re.sub("https?://\S+|www\.\S+", "", text)
  text = re.sub(r"rt ", "", text)
  text = re.sub("<.*?>+", "", text)
  text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
  text = re.sub("\n", "", text)
  text = re.sub("\w*\d\w*", "", text)
  text = [word for word in text.split(" ") if word not in stopword]
  text = " ".join(text)
  return text

In [None]:
cleanDatasets = [data_1, data_2, data_3, data_4]

for data in cleanDatasets:
  data["message"] = data["message"].apply(clean)

In [None]:
XtrainArr = []
XtestArr = []
YtrainArr = []
YtestArr = []

In [None]:
from imblearn.over_sampling import RandomOverSampler

for data in cleanDatasets:
  X = np.array(data["message"])
  Y = np.array(data["label"])

  cv = CountVectorizer()
  rus = RandomOverSampler(random_state=42, sampling_strategy=1)

  X = cv.fit_transform(X)
  X, Y = rus.fit_resample(X, Y)

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

  XtrainArr.append(X_train)
  XtestArr.append(X_test)
  YtrainArr.append(Y_train)
  YtestArr.append(Y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report

def test_accuracy(model, idx, convertArray="no"):
  print("\tRESULT:")
  if convertArray == "yes":
    pred_train = model.predict(XtrainArr[idx].toarray())
    pred_test = model.predict(XtestArr[idx].toarray())
  else:
    pred_train = model.predict(XtrainArr[idx])
    pred_test = model.predict(XtestArr[idx])
  print("\t\ttrain:", accuracy_score(YtrainArr[idx], pred_train))
  print("\t\ttest:", accuracy_score(YtestArr[idx], pred_test))
  print("\t\tconfussion matrix:")
  print(confusion_matrix(YtestArr[idx], pred_test))
  print("\t\tclassification report:")
  print(classification_report(YtestArr[idx], pred_test))

for i in range(len(XtrainArr)):
  print("TRAIN DATASET", i+1)
  print("\t================================")
  print("\tRANDOM FOREST")
  rfc = RandomForestClassifier()
  rfc.fit(XtrainArr[i], YtrainArr[i])
  test_accuracy(rfc, i)
  print("\t================================")
  print("\tSVM")
  svm = SVC()
  svm.fit(XtrainArr[i], YtrainArr[i])
  test_accuracy(svm, i)
  print("\t================================")
  print("\tNAIVE BAYES")
  naive = GaussianNB()
  naive.fit(XtrainArr[i].toarray(), YtrainArr[i])
  test_accuracy(naive, i, "yes")
  print("\t================================")
  print("\tLOGISTIC REGRESSION")
  lr = LogisticRegression(max_iter=10000)
  lr.fit(XtrainArr[i], YtrainArr[i])
  test_accuracy(lr, i)
  print("\t================================")

TRAIN DATASET 1
	RANDOM FOREST
	RESULT:
		train: 0.999552
		test: 0.9445238404573211
		confussion matrix:
[[3766   75]
 [ 352 3504]]
		classification report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      3841
           1       0.98      0.91      0.94      3856

    accuracy                           0.94      7697
   macro avg       0.95      0.94      0.94      7697
weighted avg       0.95      0.94      0.94      7697

	SVM
	RESULT:
		train: 0.859776
		test: 0.851110822398337
		confussion matrix:
[[3771   70]
 [1076 2780]]
		classification report:
              precision    recall  f1-score   support

           0       0.78      0.98      0.87      3841
           1       0.98      0.72      0.83      3856

    accuracy                           0.85      7697
   macro avg       0.88      0.85      0.85      7697
weighted avg       0.88      0.85      0.85      7697

	NAIVE BAYES
	RESULT:
		train: 0.88896
		test: 0.76887098

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# rfc = RandomForestClassifier()
# rfc.fit(X_train, Y_train)

# pred_test = rfc.predict(X_test)
# pred_train = rfc.predict(X_train)

# print("test:", accuracy_score(Y_test, pred_test))
# print("train:", accuracy_score(Y_train, pred_train))
# print("f1 score:\n", f1_score(Y_test, pred_test, average=None))
# print("confussion matrix:\n", confusion_matrix(Y_test, pred_test))

In [None]:
# from sklearn.svm import SVC
# from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
# from sklearn.feature_extraction.text import CountVectorizer

# svm = SVC()
# svm.fit(X_train, Y_train)

# pred_test = svm.predict(X_test)
# pred_train = svm.predict(X_train)

# print("test:", accuracy_score(Y_test, pred_test))
# print("train:", accuracy_score(Y_train, pred_train))
# print("f1 score:\n", f1_score(Y_test, pred_test, average=None))
# print("confussion matrix:\n", confusion_matrix(Y_test, pred_test))

In [None]:
# msg = "Fuck you"
# df = cv.transform([msg]).toarray()
# svm.predict(df)

In [None]:
# from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# naive = GaussianNB()
# naive.fit(XtrainArr[4].toarray(), YtrainArr[4])

# pred_train = naive.predict(XtrainArr[4])
# pred_test = naive.predict(XtestArr[4])
# print("\t\ttrain:", accuracy_score(YtrainArr[4], pred_train))
# print("\t\ttest:", accuracy_score(YtestArr[4], pred_test))
# print("\t\tconfussion matrix:")
# print(confusion_matrix(YtestArr[4], pred_test))
# print("\t\tclassification report:")
# print(classification_report(YtestArr[4], pred_test))

# pred_test = naive.predict(X_test.toarray())
# pred_train = naive.predict(X_train.toarray())

# print("test:", accuracy_score(Y_test, pred_test))
# print("train:", accuracy_score(Y_train, pred_train))
# print("f1 score:\n", f1_score(Y_test, pred_test, average=None))
# print("confussion matrix:\n", confusion_matrix(Y_test, pred_test))

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# lr = LogisticRegression(max_iter=10000)
# lr.fit(XtrainArr[4], YtrainArr[4])

# pred_train = lr.predict(XtrainArr[4])
# pred_test = lr.predict(XtestArr[4])
# print("\t\ttrain:", accuracy_score(YtrainArr[4], pred_train))
# print("\t\ttest:", accuracy_score(YtestArr[4], pred_test))
# print("\t\tconfussion matrix:")
# print(confusion_matrix(YtestArr[4], pred_test))
# print("\t\tclassification report:")
# print(classification_report(YtestArr[4], pred_test))

# pred_test = lr.predict(X_test)
# pred_train = lr.predict(X_train)

# print("test:", accuracy_score(Y_test, pred_test))
# print("train:", accuracy_score(Y_train, pred_train))
# print("f1 score:\n", f1_score(Y_test, pred_test, average=None))
# print("confussion matrix:\n", confusion_matrix(Y_test, pred_test))