In [1]:
# Notebook-wide imports plus the NLTK resources that clean() relies on.
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
# Fetch the stop-word corpus before it is read via stopwords.words() below.
nltk.download('stopwords')
# NOTE(review): `pr` is not referenced in any cell visible here - confirm it is still needed.
from nltk.util import pr
# English Snowball stemmer shared by clean().
stemmer = nltk.SnowballStemmer('english')
from nltk.corpus import stopwords
import string
# Held as a set so the per-word membership test in clean() is a hash lookup.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load the data
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(r"../data/twitter_data.csv")
# Preview the first rows as plain text.
print(df.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [3]:
# Derive a human-readable 'labels' column from the numeric 'class' codes.
df['labels'] = df['class'].map(
    {
        0: "Hate Speech Detected",
        1: "Offensive language detected",
        2: "No hate and offensive speech",
    }
)
print(df.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No hate and offensive speech  
1   Offensive language detected  
2   Offensive language detected  
3   Offensive language detected  
4   Offensive language detected  


In [4]:
# Fail with a clear message if the numeric target column is missing.
if 'class' not in df.columns:
    raise KeyError("The 'class' column does not exist in the DataFrame.")

# Retain a copy of the 'class' column for later filtering
class_column = df['class'].copy()

# Select relevant columns. The explicit .copy() makes the result an independent
# frame, so the later assignment to df["tweet"] modifies this frame directly
# and does not trigger pandas' SettingWithCopyWarning.
df = df[['tweet', 'labels', 'class']].copy()  # Keep 'class' for filtering later
print(df.head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  class  
0  No hate and offensive speech      2  
1   Offensive language detected      1  
2   Offensive language detected      1  
3   Offensive language detected      1  
4   Offensive language detected      1  


In [5]:
# Clean the text data
def clean(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; remove [bracketed] spans, URLs and <tags>;
    delete punctuation; turn newlines into spaces; remove every token that
    contains a digit; drop English stop words; stem what is left.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as empty input so that missing
            values do not turn into the literal tokens "none" / "nan".

    Returns:
        str: The cleaned text, or an empty string when nothing survives.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Punctuation is deleted rather than replaced by a space, so the characters
    # on either side of it end up joined into one token.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # NOTE(review): stop words are matched after punctuation has been removed,
    # so stop-word entries that contain an apostrophe presumably never match -
    # confirm whether that is intended.
    # Filter and stem in a single pass; tokens contain no whitespace, so this is
    # equivalent to filtering, re-joining, re-splitting and then stemming.
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stopword
    )

# Overwrite the raw tweets with their cleaned form; from this point on
# df["tweet"] holds cleaned text, not the original tweet.
df["tweet"] = df["tweet"].apply(clean)
print(df.head())

                                               tweet  \
0  rt mayasolov woman shouldnt complain clean hou...   
1   rt boy dat coldtyga dwn bad cuffin dat hoe place   
2  rt urkindofbrand dawg rt ever fuck bitch start...   
3             rt cganderson vivabas look like tranni   
4  rt shenikarobert shit hear might true might fa...   

                         labels  class  
0  No hate and offensive speech      2  
1   Offensive language detected      1  
2   Offensive language detected      1  
3   Offensive language detected      1  
4   Offensive language detected      1  


In [6]:
# Drop rows with missing values
# NOTE(review): this runs after clean() has been applied, and clean() always
# returns a string, so a tweet that was missing in the CSV is no longer NaN
# here and is not removed by this call. Consider dropping missing tweets
# before the cleaning step.
df = df.dropna(subset=['tweet', 'labels'])

In [7]:
# Convert the text data to arrays
x = np.array(df["tweet"])
y = np.array(df["class"])  # Use 'class' column for training

# Vectorize the text data (bag-of-words token counts)
cv = CountVectorizer()
x_cv = cv.fit_transform(x)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_cv, y, test_size=0.33, random_state=42)

# Train the decision tree classifier.
# random_state is fixed to the same seed as the split: scikit-learn permutes
# the candidate features at each split, so an unseeded tree can differ from one
# run to the next and the notebook would not be reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [8]:
# Split the cleaned DataFrame into one subset per class.
# The masks are built from df's own 'class' column instead of the class_column
# snapshot taken before cleaning/dropna: a mask derived from df always has
# exactly df's current index, whereas the snapshot only works as long as no
# row has been removed or reordered since it was taken.
hate = df[df['class'] == 0]
offensive = df[df['class'] == 1]
neither = df[df['class'] == 2]

print(hate.head())
print(offensive.head())
print(neither.head())

                                                 tweet                labels  \
85                          whalelookyher queer gaywad  Hate Speech Detected   
89   whitethunduh alsarabsss hes beaner smh tell he...  Hate Speech Detected   
110  devilgrimz vigxrart your fuck gay blacklist ho...  Hate Speech Detected   
184  markroundtreejr lmfaoooo hate black peopl ther...  Hate Speech Detected   
202                   nochillpaz least im nigger lmfao  Hate Speech Detected   

     class  
85       0  
89       0  
110      0  
184      0  
202      0  
                                               tweet  \
1   rt boy dat coldtyga dwn bad cuffin dat hoe place   
2  rt urkindofbrand dawg rt ever fuck bitch start...   
3             rt cganderson vivabas look like tranni   
4  rt shenikarobert shit hear might true might fa...   
5  tmadisonx shit blow meclaim faith somebodi sti...   

                        labels  class  
1  Offensive language detected      1  
2  Offensive language de

In [9]:
# Report (rows, columns) of each per-class subset, one per line.
print(hate.shape, offensive.shape, neither.shape, sep="\n")

(1430, 3)
(19190, 3)
(4163, 3)


In [10]:
# Under-sampling to build a sample dataset containing a similar distribution of
# hate, offensive, and neither-hate-nor-offensive speech.
#
# The sample size is taken from the hate subset instead of being hard-coded, so
# the classes stay balanced if the data (or an earlier filtering step) changes.
# DataFrame.sample raises when asked for more rows than a subset holds, so this
# assumes 'hate' is the smallest class, as the shapes printed above show.
offensive_sample = offensive.sample(n=len(hate), random_state=42)
neither_sample = neither.sample(n=len(hate), random_state=42)

new_dataset = pd.concat([offensive_sample, neither_sample, hate], axis=0)

new_dataset.head()

Unnamed: 0,tweet,labels,class
22889,everytim go cracker barrel girl great tit seat...,Offensive language detected,1
20565,run nigga dont want nigga need bad bitch that ...,Offensive language detected,1
10780,need girl jamaica cant fuck basic black bitch,Offensive language detected,1
17261,rt shadowbeatzinc know block next week bitch,Offensive language detected,1
13954,put ya hand grade bitch,Offensive language detected,1


In [11]:
# The hate-speech rows were concatenated last, so they appear at the tail.
new_dataset.tail()

Unnamed: 0,tweet,labels,class
24576,guy biggest faggot omfg,Hate Speech Detected,0
24685,one name offens kike wop kraut wetback jigaboo...,Hate Speech Detected,0
24751,pussi ass nigga know nigga,Hate Speech Detected,0
24776,your nigger,Hate Speech Detected,0
24777,your retard hope get type diabet die sugar rus...,Hate Speech Detected,0


# Enter text below and click 'Detect' to check for toxicity

In [12]:
# Numeric class code -> message shown to the user
labels_mapping = {
    0: "Hate Speech Detected",
    1: "Offensive language detected",
    2: "No hate and offensive speech",
}

# Multi-line input area taking 80% of the available width
text_box = widgets.Textarea(
    description='Text:',
    placeholder='Type your text here...',
    value='',
    disabled=False,
    layout=widgets.Layout(width='80%'),
)

# Button that starts the detection; the margin positions it under the text area
button = widgets.Button(
    description='Detect',
    tooltip='Click to detect',
    icon='check',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
    layout=widgets.Layout(margin='20px 30px 30px 60px'),
)

# Output area that the click handler prints its result into
output = widgets.Output()

def on_button_click(b):
    """Classify the text currently in ``text_box`` and print the verdict into ``output``.

    The text is cleaned with ``clean``, vectorised with the fitted ``cv`` and
    classified with ``clf``; the predicted class code is translated through
    ``labels_mapping``.

    Blank input, or input from which cleaning leaves no tokens, is reported
    with a message instead of being classified: such input becomes an all-zero
    feature vector, and a verdict derived from it would not reflect the text.

    Args:
        b: Argument supplied by the ``on_click`` callback; not used here.

    Any exception raised along the way is printed into the output area rather
    than propagated, so a failure is visible to the user.
    """
    with output:
        try:
            # Replace the previous result rather than appending to it.
            output.clear_output()
            input_text = text_box.value
            if not input_text.strip():
                print("Please enter some text to analyze.")
                return
            cleaned_text = clean(input_text)
            if not cleaned_text:
                # Only stop words, punctuation, URLs or digit tokens were given.
                print("No analyzable words found in the input.")
                return
            input_vec = cv.transform([cleaned_text])
            prediction = clf.predict(input_vec)[0]
            result = labels_mapping[prediction]
            print("Detection:", result)
        except Exception as e:
            print(f"Error: {e}")

# Register the handler so it runs on each click of the button.
button.on_click(on_button_click)

# Align the button and text box vertically
vbox = widgets.VBox([text_box, button])

# Render the input controls followed by the output area.
display(vbox, output)

VBox(children=(Textarea(value='', description='Text:', layout=Layout(width='80%'), placeholder='Type your text…

Output()