In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

total_of = [] # all offensive
total_io = [] # all inoffensive

In [13]:
df = pd.read_csv("data/convabuse.csv")

of = df[(df["is_abuse.-1"]+df["is_abuse.-2"]+df["is_abuse.-3"])==1]
io = df[df["is_abuse.1"]==1]

print(f"offensive: {len(of)}, inoffensive: {len(io)}")
print(f"Unique > offensive: {len(of.user.unique())}, inoffensive: {len(io.user.unique())}")

# to get cumulative votes (not required)
# len(of.groupby(["user"])["is_abuse.-1", "is_abuse.-2", "is_abuse.-3"].apply(lambda x : x.astype(int).sum()))

total_of += list(of.user.unique())
total_io += list(io.user.unique())

print(f"Totals > offensive: {len(total_of)}, inoffensive: {len(total_io)}")

offensive: 2029, inoffensive: 10068
Unique > offensive: 758, inoffensive: 2515
Totals > offensive: 758, inoffensive: 2515


In [14]:
filenames = os.listdir("data/cyberbullying dataset/")
sum_of = sum_io = 0
for filename in filenames:
    print(f"Parsing {filename}...")
    df = pd.read_csv(f"data/cyberbullying dataset/{filename}")
    of = df[df.oh_label==1]
    io = df[df.oh_label==0]

    print(f"offensive: {len(of)}, inoffensive: {len(io)}")
    print(f"Unique > offensive: {len(of.Text.unique())}, inoffensive: {len(io.Text.unique())}")
    sum_of += len(of.Text.unique())
    sum_io += len(io.Text.unique())

    total_of += list(of.Text.unique())
    total_io += list(io.Text.unique())
    print("____")

print(sum_of, sum_io)
print(f"Totals > offensive: {len(total_of)}, inoffensive: {len(total_io)}")

Parsing toxicity_parsed_dataset.csv...
offensive: 15362, inoffensive: 144324
Unique > offensive: 15348, inoffensive: 144041
____
Parsing attack_parsed_dataset.csv...
offensive: 13590, inoffensive: 102274
Unique > offensive: 13576, inoffensive: 102092
____
Parsing aggression_parsed_dataset.csv...
offensive: 14782, inoffensive: 101082
Unique > offensive: 14768, inoffensive: 100901
____
Parsing twitter_parsed_dataset.csv...
offensive: 5347, inoffensive: 11501
Unique > offensive: 5347, inoffensive: 11501
____
Parsing twitter_racism_parsed_dataset.csv...
offensive: 1970, inoffensive: 11501
Unique > offensive: 1970, inoffensive: 11501
____
Parsing twitter_sexism_parsed_dataset.csv...
offensive: 3377, inoffensive: 11501
Unique > offensive: 3377, inoffensive: 11501
____
Parsing youtube_parsed_dataset.csv...
offensive: 417, inoffensive: 3047
Unique > offensive: 417, inoffensive: 3045
____
Parsing kaggle_parsed_dataset.csv...
offensive: 2806, inoffensive: 5993
Unique > offensive: 2778, inoffensi

In [15]:
df = pd.read_csv("data/labelled_hate_speech_dataset.csv")

of = df[df.Hateful==1]
io = df[df.Hateful==0]

print(f"offensive: {len(of)}, inoffensive: {len(io)}")
print(f"Unique > offensive: {len(of.Comment.unique())}, inoffensive: {len(io.Comment.unique())}")

total_of += list(of.Comment.unique())
total_io += list(io.Comment.unique())

print(f"Totals > offensive: {len(total_of)}, inoffensive: {len(total_io)}")

offensive: 600, inoffensive: 2400
Unique > offensive: 599, inoffensive: 2399
Totals > offensive: 58938, inoffensive: 395466


In [16]:
df = pd.read_csv("data/davidson_hate_speech_offensive_lang.csv")

of = df[df["class"] < 2]
io = df[df["class"] == 2]

print(f"offensive: {len(of)}, inoffensive: {len(io)}")
print(f"Unique > offensive: {len(of.tweet.unique())}, inoffensive: {len(io.tweet.unique())}")

total_of += list(of.tweet.unique())
total_io += list(io.tweet.unique())

print(f"Totals > offensive: {len(total_of)}, inoffensive: {len(total_io)}")

offensive: 20620, inoffensive: 4163
Unique > offensive: 20620, inoffensive: 4163
Totals > offensive: 79558, inoffensive: 399629


In [6]:
len(total_of)+len(total_io)

479187

In [7]:
# removing duplicates from both sets of data
total_of = list(set(total_of))
total_io = list(set(total_io))

print(f"Totals > offensive: {len(total_of)}, inoffensive: {len(total_io)}")

Totals > offensive: 49683, inoffensive: 213165


In [9]:
data = pd.DataFrame({
    "text": total_io+total_of,
    "objectionable": [0]*len(total_io)+[1]*len(total_of)
})

data

Unnamed: 0,text,objectionable
0,== Sean Kennedy (Author) is back!!!... == ....,0
1,I'm gonna try and totally revamp the page as...,0
2,"` , and others. Listen to me. Right now it ...",0
3,I will tell you to stop.You think you a famo...,0
4,":Look at this, everybody is Celtic probably h...",0
...,...,...
262843,", after I let him suck my boobs,",1
262844,Horny cougar gets two cocks shoved in her&#160...,1
262845,` == is a scientifically proven epitome of a...,1
262846,Idgaf what anyone say. GAMBLING IS STUPID. lik...,1


In [10]:
# making stratified
# train-validation-test splits
# in the ration 70:15:15

X_train, X_val, y_train, y_val = train_test_split(data.text, data.objectionable, test_size=0.30, random_state=42, stratify=data.objectionable)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50, random_state=42, stratify=y_val)

print(len(X_train), len(X_val), len(X_test))

183993 39427 39428


In [11]:
sum(y_train)/len(y_train), sum(y_val)/len(y_val), sum(y_test)/len(y_test)

(0.18901806046969177, 0.18900753290892028, 0.18902810185654864)

In [None]:
pd.DataFrame(list(zip(X_train, y_train)), columns=["text", "objectionable"]).to_csv("splits/train.csv", index=False)
pd.DataFrame(list(zip(X_val, y_val)), columns=["text", "objectionable"]).to_csv("splits/val.csv", index=False)
pd.DataFrame(list(zip(X_test, y_test)), columns=["text", "objectionable"]).to_csv("splits/test.csv", index=False)