## Dedicated Jupyter Notebook for Exploring Cyberbullying Dataset

In [1]:
import pandas as pd
from collections import Counter
import re
import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

### Cyberbullying Dataset

In [2]:
# Load Dataset
tweets = pd.read_csv("data/cyberbullying_tweets.csv")
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@stockputout everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


### Basic Dataset Statistics

In [3]:
# Useful Kaggle Kernel for Pre-Processing Tweets:
# https://www.kaggle.com/code/nourberkdar/text-preprocessing-for-tweets

In [4]:
# Examining word count statistics
tweets['word_count']=tweets['tweet_text'].apply(lambda x:len(x.split(" ")))
tweets['word_count'].describe()

count    47692.000000
mean        23.734505
std         15.263513
min          1.000000
25%         13.000000
50%         21.000000
75%         32.000000
max        737.000000
Name: word_count, dtype: float64

In [5]:
# Tweet Length Statistics
tweets['tweet_length']=tweets['tweet_text'].apply(len)
tweets['tweet_length'].describe()

count    47692.000000
mean       136.253229
std         85.226899
min          1.000000
25%         78.000000
50%        124.000000
75%        180.000000
max       5018.000000
Name: tweet_length, dtype: float64

In [6]:
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,word_count,tweet_length
0,"In other words #katandandre, your food was cra...",not_cyberbullying,9,61
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,14,115
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,9,60
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,19,103
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,19,103
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,23,131
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,10,54
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,14,63
8,@stockputout everything but mostly my priest,not_cyberbullying,6,44
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,9,50


### Dataset Vocabulary

In [7]:
# Most common @username
# Most Common Words
vocab = []
for tweet in tweets["tweet_text"]:
    for word in tweet.split(" "):
        vocab.append(word.lower())

# List of all words (Not Cleaned)
vocab[0:10]

['in',
 'other',
 'words',
 '#katandandre,',
 'your',
 'food',
 'was',
 'crapilicious!',
 '#mkr',
 'why']

In [8]:
vocab_counts = Counter(vocab)
# Least Common Words
vocab_counts.most_common()[-10::]
# Cleaning Needed

[('“abominable', 1),
 ('conclave', 1),
 ('demons”', 1),
 ('exception)', 1),
 ('“issued', 1),
 ('decree', 1),
 ('community.”', 1),
 ('reeeeeal', 1),
 ('d:&lt;', 1),
 ('@chillshrammy:', 1)]

#### Word Tokenization

In [9]:
vocab_tokens = []
for tweet in tweets["tweet_text"]:
    for word in word_tokenize(tweet, language="english"):
        vocab_tokens.append(word.lower())

# List of all words (Not Cleaned)
vocab_tokens[0:10]

['in',
 'other',
 'words',
 '#',
 'katandandre',
 ',',
 'your',
 'food',
 'was',
 'crapilicious']

In [10]:
vocab_token_counts = Counter(vocab_tokens)
# Least Common Words
vocab_token_counts.most_common()[-10::]
# Cleaning Needed

[('keithbishop64', 1),
 ('yourfavwhiteguy', 1),
 ('upabout', 1),
 ('nagging', 1),
 ('depended', 1),
 ('abominable', 1),
 ('conclave', 1),
 ('decree', 1),
 ('reeeeeal', 1),
 ('chillshrammy', 1)]

#### Stinky ^^^

In [11]:
# @user subbing function to apply to whole text column
def sub_usernames(text):
    return re.sub(r'@\w+', '@user', text)

#  Currently subsititutes all @usernames with "username"
def remove_usernames(text):
    return re.sub(r'@\w+', 'username', text)

def remove_hashtag(text):
    return re.sub(r'#', '', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r"\w+")
    words = tokenizer.tokenize(text)
    return(" ".join(words))

stop_words = stopwords.words("english")
def remove_stop_words(tweet, stopwords=stop_words):
    cleaned = []
    for word in word_tokenize(tweet):
        if word.lower() not in stopwords:
            cleaned.append(word.lower()) # Lowercase output
    return " ".join(cleaned)

In [12]:
# Cleaning a bit
# Mutating text column
tweets["clean_text"] = tweets["tweet_text"].apply(remove_usernames)
tweets["clean_text"] = tweets["clean_text"].apply(remove_hashtag)
tweets["clean_text"] = tweets["clean_text"].apply(remove_punctuation)
tweets["clean_text"] = tweets["clean_text"].apply(remove_stop_words)
# I bet python has a pipe function I dont know about
tweets.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,word_count,tweet_length,clean_text
0,"In other words #katandandre, your food was cra...",not_cyberbullying,9,61,words katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,14,115,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,9,60,username classy whore red velvet cupcakes
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,19,103,username meh p thanks heads concerned another ...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,19,103,username isis account pretending kurdish accou...
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,23,131,username username yes test god good bad indiff...
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,10,54,itu sekolah ya bukan tempat bully ga jauh kaya...
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,14,63,karma hope bites kat butt nasty mkr
8,@stockputout everything but mostly my priest,not_cyberbullying,6,44,username everything mostly priest
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,9,50,rebecca black drops school due bullying
