## Dedicated Jupyter Notebook for Exploring Cyberbullying Dataset

In [30]:
import pandas as pd
from collections import Counter
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Installing NLTK Corpora
#nltk.download('punkt')
#nltk.download('stopwords')

### Cyberbullying Dataset

In [3]:
# Load Dataset
tweets = pd.read_csv("data/cyberbullying_tweets.csv")
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@stockputout everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


### Label Values

In [4]:
tweets["cyberbullying_type"].value_counts()

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

### Basic Dataset Statistics

In [5]:
# Useful Kaggle Kernel for Pre-Processing Tweets:
# https://www.kaggle.com/code/nourberkdar/text-preprocessing-for-tweets

In [6]:
# Examining word count statistics
tweets['word_count']=tweets['tweet_text'].apply(lambda x:len(x.split(" ")))
tweets['word_count'].describe()

count    47692.000000
mean        23.734505
std         15.263513
min          1.000000
25%         13.000000
50%         21.000000
75%         32.000000
max        737.000000
Name: word_count, dtype: float64

In [7]:
# Tweet Length Statistics
tweets['tweet_length']=tweets['tweet_text'].apply(len)
tweets['tweet_length'].describe()

count    47692.000000
mean       136.247085
std         85.042338
min          1.000000
25%         78.000000
50%        124.000000
75%        180.000000
max       4962.000000
Name: tweet_length, dtype: float64

In [8]:
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,word_count,tweet_length
0,"In other words #katandandre, your food was cra...",not_cyberbullying,9,61
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,14,115
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,9,60
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,19,103
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,19,103
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,23,131
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,10,54
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,14,63
8,@stockputout everything but mostly my priest,not_cyberbullying,6,44
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,9,50


### Dataset Vocabulary

In [9]:
# Most common @username
# Most Common Words
vocab = []
for tweet in tweets["tweet_text"]:
    for word in tweet.split(" "):
        vocab.append(word.lower())

# List of all words (Not Cleaned)
vocab[0:10]

['in',
 'other',
 'words',
 '#katandandre,',
 'your',
 'food',
 'was',
 'crapilicious!',
 '#mkr',
 'why']

In [10]:
vocab_counts = Counter(vocab)
# Least Common Words
vocab_counts.most_common()[-10::]
# Cleaning Needed

[('“abominable', 1),
 ('conclave', 1),
 ('demons”', 1),
 ('exception)', 1),
 ('“issued', 1),
 ('decree', 1),
 ('community.”', 1),
 ('reeeeeal', 1),
 ('d:&lt;', 1),
 ('@chillshrammy:', 1)]

#### Vocab Via Word Tokenization

In [11]:
vocab_tokens = []
for tweet in tweets["tweet_text"]:
    for word in word_tokenize(tweet, language="english"):
        vocab_tokens.append(word.lower())

# List of all words (Not Cleaned)
vocab_tokens[0:10]

['in',
 'other',
 'words',
 '#',
 'katandandre',
 ',',
 'your',
 'food',
 'was',
 'crapilicious']

In [12]:
vocab_token_counts = Counter(vocab_tokens)
# Least Common Words
vocab_token_counts.most_common()[-10::]
# Cleaning Needed

[('keithbishop64', 1),
 ('yourfavwhiteguy', 1),
 ('upabout', 1),
 ('nagging', 1),
 ('depended', 1),
 ('abominable', 1),
 ('conclave', 1),
 ('decree', 1),
 ('reeeeeal', 1),
 ('chillshrammy', 1)]

### Cleaning

In [13]:
# @user subbing function to apply to whole text column
def sub_usernames(text):
    return re.sub(r'@\w+', '@user', text)

#  Currently subsititutes all @usernames with "username"
def replace_usernames(text):
    return re.sub(r'@\w+', 'username', text)

def remove_hashtag(text):
    return re.sub(r'#', '', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r"\w+")
    words = tokenizer.tokenize(text)
    return(" ".join(words))

stop_words = stopwords.words("english")
def remove_stop_words(tweet, stopwords=stop_words):
    cleaned = []
    for word in word_tokenize(tweet):
        if word.lower() not in stopwords:
            cleaned.append(word.lower()) # Lowercase output
    return " ".join(cleaned)

def isolate_mentions(tweet):
    return " ".join(re.findall(r'@\w+', tweet))

In [14]:
# Cleaning Text Column
tweets["clean_text"] = tweets["tweet_text"].apply(replace_usernames)
tweets["clean_text"] = tweets["clean_text"].apply(remove_hashtag)
tweets["clean_text"] = tweets["clean_text"].apply(remove_punctuation)
tweets["clean_text"] = tweets["clean_text"].apply(remove_stop_words)
tweets["mentions"] = tweets["tweet_text"].apply(isolate_mentions)
# Python Pipe Function?????????
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,word_count,tweet_length,clean_text,mentions
0,"In other words #katandandre, your food was cra...",not_cyberbullying,9,61,words katandandre food crapilicious mkr,
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,14,115,aussietv white mkr theblock imacelebrityau tod...,
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,9,60,username classy whore red velvet cupcakes,@XochitlSuckkks
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,19,103,username meh p thanks heads concerned another ...,@Jason_Gio
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,19,103,username isis account pretending kurdish accou...,@RudhoeEnglish
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,23,131,username username yes test god good bad indiff...,@Raja5aab @Quickieleaks
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,10,54,itu sekolah ya bukan tempat bully ga jauh kaya...,
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,14,63,karma hope bites kat butt nasty mkr,
8,@stockputout everything but mostly my priest,not_cyberbullying,6,44,username everything mostly priest,@stockputout
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,9,50,rebecca black drops school due bullying,


#### Cleaned Vocabulary

In [15]:
# Tokenizing vocabulary
vocab_clean_tokens = []
for tweet in tweets["clean_text"]:
    for word in word_tokenize(tweet, language="english"):
        vocab_clean_tokens.append(word.lower())

# List of all words
vocab_clean_tokens[0:10]

['words',
 'katandandre',
 'food',
 'crapilicious',
 'mkr',
 'aussietv',
 'white',
 'mkr',
 'theblock',
 'imacelebrityau']

In [16]:
vocab_clean_token_counts = Counter(vocab_clean_tokens)
# Least Common Words
vocab_clean_token_counts.most_common()[-10::]
# Cleaning Needed

[('heartbroken', 1),
 ('wwi', 1),
 ('acquiesce', 1),
 ('upabout', 1),
 ('nagging', 1),
 ('depended', 1),
 ('abominable', 1),
 ('conclave', 1),
 ('decree', 1),
 ('reeeeeal', 1)]

In [17]:
# Least Common Words
vocab_clean_token_counts.most_common()[0:8]

[('username', 26956),
 ('school', 8873),
 ('like', 5918),
 ('fuck', 5885),
 ('dumb', 5393),
 ('high', 5260),
 ('people', 4906),
 ('bullied', 4703)]

In [18]:
text = tweets.iloc[5]["tweet_text"]
print(text)
isolate_mentions(text)

@Raja5aab @Quickieleaks Yes, the test of god is that good or bad or indifferent or weird or whatever, it all proves gods existence.


'@Raja5aab @Quickieleaks'

In [19]:
mentions = []
for line in tweets["mentions"]:
    for mention in line.split(" "):
        if mention != "":
            mentions.append(mention)

# List of all words
mentions[0:10]

['@XochitlSuckkks',
 '@Jason_Gio',
 '@RudhoeEnglish',
 '@Raja5aab',
 '@Quickieleaks',
 '@stockputout',
 '@Jord_Is_Dead',
 '@Kurdsnews',
 '@yasmimcaci',
 '@Bferrarii']

In [20]:
mention_counts = Counter(mentions)
# Least Common Words
mention_counts.most_common()[-10::]

[('@FreakyBillon', 1),
 ('@niggalogic', 1),
 ('@SuicideRascal', 1),
 ('@Psyche_Mac', 1),
 ('@ki__2x', 1),
 ('@skeppyextra', 1),
 ('@ranran_42', 1),
 ('@KeithBishop64', 1),
 ('@YourFavWhiteGuy', 1),
 ('@CHILLShrammy', 1)]

In [21]:
# Most Common Words
mention_counts.most_common()[0:10]

[('@tayyoung_', 958),
 ('@freebsdgirl', 233),
 ('@MaxBlumenthal', 161),
 ('@mykitchenrules', 115),
 ('@ChrisWarcraft', 104),
 ('@MT8_9', 101),
 ('@TheQuinnspiracy', 101),
 ('@IsraeliRegime', 99),
 ('@Spacekatgal', 98),
 ('@realDonaldTrump', 94)]

### Examining only the Cyberbullying Examples

In [22]:
tweets_bully = tweets[tweets["cyberbullying_type"] != "not_cyberbullying"]
tweets_bully.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,word_count,tweet_length,clean_text,mentions
7945,rape is real..zvasiyana nema jokes about being...,gender,26,159,rape real zvasiyana nema jokes drunk gay lesbi...,
7946,You never saw any celebrity say anything like ...,gender,24,136,never saw celebrity say anything like obama b ...,
7947,"@ManhattaKnight I mean he's gay, but he uses g...",gender,14,80,username mean gay uses gendered slurs makes ra...,@ManhattaKnight
7948,RT @Raul_Novoa16: @AliciaBernardez @Alex_Aim @...,gender,7,66,rt username username username username feminazi,@Raul_Novoa16 @AliciaBernardez @Alex_Aim @_mec...
7949,Rape is rape. And the fact that I read one pos...,gender,58,273,rape rape fact read one post guy getting raped...,
7950,"@coiny Also, it's hard to take a company serio...",gender,27,144,username also hard take company seriously hara...,@coiny
7951,"Idgaf if you are gay, lesbian, bisexual, or wh...",gender,25,127,idgaf gay lesbian bisexual whatever fuck fuck ...,
7952,#GermanProfessor gives meaning to term FemiNaz...,gender,14,104,germanprofessor gives meaning term feminazi hi...,
7953,RT @mcclure111: #DontDateSJWs #ThatWouldBeAVio...,gender,4,73,rt username dontdatesjws thatwouldbeaviolation...,@mcclure111
7954,So I call you female I’m basically calling you...,gender,12,56,call female basically calling bitch,


In [23]:
mentions_bullying = []
for line in tweets_bully["mentions"]:
    for mention in line.split(" "):
        if mention != "":
            mentions_bullying.append(mention)

# List of all words
mentions_bullying[0:10]

['@ManhattaKnight',
 '@Raul_Novoa16',
 '@AliciaBernardez',
 '@Alex_Aim',
 '@_mecaesmal',
 '@coiny',
 '@mcclure111',
 '@pumpkinking39',
 '@JesseElJefe',
 '@beavergate']

In [24]:
mentions_bullying_counts = Counter(mentions_bullying)
# Least Common Mentions
mentions_bullying_counts.most_common()[-10::]

[('@FreakyBillon', 1),
 ('@niggalogic', 1),
 ('@SuicideRascal', 1),
 ('@Psyche_Mac', 1),
 ('@ki__2x', 1),
 ('@skeppyextra', 1),
 ('@ranran_42', 1),
 ('@KeithBishop64', 1),
 ('@YourFavWhiteGuy', 1),
 ('@CHILLShrammy', 1)]

In [25]:
# Most Common Mentions
mentions_bullying_counts.most_common()[0:10]

[('@tayyoung_', 958),
 ('@freebsdgirl', 158),
 ('@MaxBlumenthal', 120),
 ('@MT8_9', 94),
 ('@realDonaldTrump', 94),
 ('@IsraeliRegime', 82),
 ('@ChrisWarcraft', 81),
 ('@TheQuinnspiracy', 76),
 ('@sajid_fairooz', 65),
 ('@Spacekatgal', 64)]

In [26]:
vocab_bullying = []
for tweet in tweets_bully["clean_text"]:
    for word in word_tokenize(tweet):
        vocab_bullying.append(word)

# List of all vocab_bullying[0:10]
vocab_bullying[0:10]

['rape',
 'real',
 'zvasiyana',
 'nema',
 'jokes',
 'drunk',
 'gay',
 'lesbian',
 'rape',
 'ones']

In [27]:
vocab_bullying_counts = Counter(vocab_bullying)
# Least Common Words
vocab_bullying_counts.most_common()[-10::]

[('wwi', 1),
 ('acquiesce', 1),
 ('upabout', 1),
 ('nagging', 1),
 ('depended', 1),
 ('abominable', 1),
 ('conclave', 1),
 ('decree', 1),
 ('whim', 1),
 ('reeeeeal', 1)]

In [28]:
# Most Common Words
vocab_bullying_counts.most_common()[0:8]

[('username', 20798),
 ('school', 8531),
 ('fuck', 5784),
 ('like', 5518),
 ('dumb', 5378),
 ('high', 5203),
 ('people', 4634),
 ('bullied', 4621)]

In [39]:
fig = px.histogram(tweets[tweets["tweet_length"] < 500], x="tweet_length", nbins=20, marginal="box", title="Histogram of Tweet Length")
fig.update_layout(xaxis_title="Tweet Length", yaxis_title="Frequency")
fig.show()

In [44]:
fig = px.histogram(tweets[tweets["word_count"] < 75], x="word_count", nbins=20, marginal="box", title="Histogram of Word Count")
fig.update_layout(xaxis_title="Word Count", yaxis_title="Frequency")
fig.show()


In [50]:
fig = px.histogram(tweets_bully, x="cyberbullying_type", title="Count of Cyberbullying Types")
fig.update_layout(xaxis_title="Cyberbullying Type", yaxis_title="Count")
fig.show()
