This IPython notebook is dedicated to maintaining the cleaning pipeline for the text data. It contains tools for processing each tweet as well as a function to generate the cleaned dataset.

In [1]:
# You may have to install these dependencies first (uncomment the lines below to run them):
#!python -m spacy download en_core_web_md
#!pip3 install langdetect

In [11]:
import pandas as pd
import re
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer


# NLTK Resources
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

import spacy

from langdetect import detect

# Shared NLP tools used by the helper functions in this notebook.
# spaCy model loaded with the 'ner', 'parser' and 'textcat' components disabled.
nlp = spacy.load("en_core_web_md", disable=["ner", "parser", "textcat"])
# Regex tokenizer: @words and #words are kept as single tokens, and any other
# run of word characters becomes its own token (punctuation is dropped).
tokenizer = RegexpTokenizer(r"(@\w+|#\w+|\w+)")
# WordNet lemmatizer used by lemmatize_tweet
lemm = WordNetLemmatizer()

In [12]:
# Checking the NLTK Stop Words
# stpWords is the English stop-word list that remove_stopwords_pos filters
# tokens against; it is printed here for inspection.
from nltk.corpus import stopwords
stpWords = stopwords.words('english')
print("Stop words: ")
print(', '.join(stpWords))

# List of Parts-of-Speech to remove
# Entries are the one-character codes produced by pos_for_lem (the lowercased
# first character of each POS tag); remove_stopwords_pos drops any token whose
# code appears here.
# NOTE(review): presumably these stand for the universal-tagset DET, X, '.',
# PRON/PRT and CONJ tags -- confirm against the tagset documentation.
stpPOS = ['d', 'x', '.', 'p', 'c']

Stop words: 
i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't

#### Helper Functions

In [13]:
# Extracts the usernames mentioned in a tweet
def extract_usernames(text):
    """Return every @-mention found in ``text`` as one space-separated string.

    A mention is an '@', optional whitespace, then a run of word characters.
    Only the word characters are returned: the '@' and any whitespace between
    it and the name are dropped, so "@ bob" yields "bob" rather than " bob"
    (which previously leaked stray spaces into the joined result).

    Returns an empty string when the text contains no mentions.
    """
    # With a single capture group, re.findall returns just the captured name.
    return " ".join(re.findall(r'@\s*(\w+)', text))

# Applied to the whole text column to replace @user mentions
def sub_usernames(text):
    """Return ``text`` with every '@' + word-characters run replaced by '@username'."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('@username', text)

# Extracts the Hashtags and returns them as a single string
def extract_hashtags(text):
    """Return every hashtag found in ``text`` as one space-separated string.

    A hashtag is a '#', optional whitespace, then a run of word characters.
    Only the word characters are returned: the '#' and any whitespace between
    it and the tag are dropped, so "# tag" yields "tag" rather than " tag"
    (which previously leaked stray spaces into the joined result).

    Returns an empty string when the text contains no hashtags.
    """
    # With a single capture group, re.findall returns just the captured tag.
    return " ".join(re.findall(r'#\s*(\w+)', text))

# Swaps each hashtag for the generic placeholder #hashtag
def remove_hashtags(text):
    """Return ``text`` with every '#' + word-characters run replaced by '#hashtag'."""
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('#hashtag', text)

# Strip punctuation by tokenizing the text
def remove_punctuation(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the tokens.

    The result is whatever ``tokenizer.tokenize`` returns (a token list, not a
    joined string), which is the input format tag_pos expects.
    """
    return tokenizer.tokenize(text)

# Helper function to prep the POS tagged words for lemmatizing
# Code sampled from my NLTK Repo:
# https://github.com/JacksonCown/Workshop-NLTK/blob/main/NLTK%20Short%20Examples-checkpoint.ipynb
def pos_for_lem(words_pos):
    """Convert (word, tag) pairs into (word, one-character code) pairs.

    The code is the lowercased first character of the tag, except for the
    adverb tag 'ADV', which maps to 'r'. WordNet uses 'r' for adverbs, and the
    first-letter rule would otherwise turn 'ADV' into 'a' (the adjective code),
    so adverbs were being lemmatized as adjectives.

    Parameters:
        words_pos: iterable of (word, tag) pairs; tags must be non-empty strings.

    Returns:
        A new list of (word, code) tuples in the same order as the input.
    """
    newTags = []
    for tup in words_pos:
        tag = tup[1]
        if tag.upper() == "ADV":
            converted = "r"  # WordNet adverb code
        else:
            converted = tag[0].lower()  # first character of the tag, lowercased
        newTags.append((tup[0], converted))
    return newTags

# Part-of-speech tagging step that feeds the lemmatizer
# Expects an already tokenized tweet (a list of tokens)
# Run this before stop words are removed
def tag_pos(words):
    """Tag ``words`` with the universal tagset and return pos_for_lem's converted pairs."""
    tagged = nltk.pos_tag(words, tagset='universal')
    # pos_for_lem turns the tags into the short codes used downstream
    return pos_for_lem(tagged)

# Stop-word removal based on spaCy's is_stop flag
def remove_stopwords(text):
    """Run ``text`` through the spaCy pipeline and re-join the tokens not flagged as stop words."""
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# NLTK-based stop-word removal. Slower than remove_stopwords for now, but it
# avoids the issue where "#hashtag" gets split into "# hashtag".
# Must run after part-of-speech tagging.
def remove_stopwords_pos(text_pos):
    """Lowercase each (word, code) pair and drop stop words and unwanted POS codes.

    A pair is kept only when its lowercased word is not in ``stpWords`` and
    its code is not in ``stpPOS``. Kept pairs are returned as
    (lowercased word, code) tuples in their original order.
    """
    lowered_pairs = ((pair[0].lower(), pair[1]) for pair in text_pos)
    return [
        (token, code)
        for token, code in lowered_pairs
        if token not in stpWords and code not in stpPOS
    ]

# Lemmatization step for a POS-tagged tweet
def lemmatize_tweet(text_pos):
    """Lemmatize each (word, code) pair with ``lemm`` and join the lemmas with single spaces."""
    lemmas = []
    for token, code in text_pos:
        lemmas.append(lemm.lemmatize(token, code))
    return ' '.join(lemmas)

# Detect Language of Tweet Text
def detect_language(text):
    """Return the language code that ``detect`` reports for ``text``.

    Returns the sentinel string "lang_error" when detection raises, so that a
    single undetectable tweet does not abort a whole-column ``apply``.
    """
    # NOTE(review): langdetect's README describes detection as non-deterministic
    # unless DetectorFactory.seed is set -- consider seeding for reproducible labels.
    try:
        return detect(text)
    # Catch Exception rather than using a bare except, so that KeyboardInterrupt
    # and SystemExit still propagate and a long run can be interrupted.
    except Exception:
        return "lang_error"

#### Execution Order Per Tweet:

1. Detect and Filter Language
2. Clean Usernames
3. Clean Hashtags
4. Clean Punctuation
5. Tag Parts-Of-Speech
6. Clean Stop Words
7. Lemmatize

#### Main Cleaning Functions

In [14]:
# Adds a "lang" column holding the detected language of each tweet
def df_lang_detect(df):
    """Apply detect_language to df["tweet_text"], store the result in df["lang"] and return df."""
    detected = df["tweet_text"].apply(detect_language)
    df["lang"] = detected
    return df

# Substitute the usernames in the dataframe tweet text
# and store the extracted usernames in a new column
def df_clean_usernames(df):
    """Fill df["mentioned_users"] via extract_usernames, then rewrite df["tweet_text"] via sub_usernames.

    Both steps read the tweet text as it was on entry. The same frame is
    modified and returned.
    """
    tweets = df["tweet_text"]
    df["mentioned_users"] = tweets.apply(extract_usernames)
    df["tweet_text"] = tweets.apply(sub_usernames)
    return df

# Run remove_hashtags over the dataframe tweet text
# and store the extracted hashtags in a new column
def df_clean_hashtags(df):
    """Fill df["hashtags"] via extract_hashtags, then rewrite df["tweet_text"] via remove_hashtags.

    Both steps read the tweet text as it was on entry. The same frame is
    modified and returned.
    """
    tweets = df["tweet_text"]
    df["hashtags"] = tweets.apply(extract_hashtags)
    df["tweet_text"] = tweets.apply(remove_hashtags)
    return df

# Punctuation removal over the tweet text column
def df_clean_punctuation(df):
    """Replace df["tweet_text"] with the result of remove_punctuation for each row and return df."""
    tokenized = df["tweet_text"].apply(remove_punctuation)
    df["tweet_text"] = tokenized
    return df

# Part-of-speech tagging over the tweet text column
def df_pos_tag(df):
    """Replace df["tweet_text"] with the result of tag_pos for each row and return df."""
    tagged = df["tweet_text"].apply(tag_pos)
    df["tweet_text"] = tagged
    return df

# Stop-word removal over the tweet text column.
# Uses remove_stopwords_pos, which is much slower than remove_stopwords,
# until the hashtag tokenization issue is fixed
def df_clean_stopwords(df):
    """Replace df["tweet_text"] with the result of remove_stopwords_pos for each row and return df."""
    filtered = df["tweet_text"].apply(remove_stopwords_pos)
    df["tweet_text"] = filtered
    return df

# Lemmatization over the tweet text column
# Parts of speech must be tagged and stop words removed beforehand
def df_lemmatize(df):
    """Replace df["tweet_text"] with the result of lemmatize_tweet for each row and return df."""
    lemmatized = df["tweet_text"].apply(lemmatize_tweet)
    df["tweet_text"] = lemmatized
    return df

# Entire Cleaning Pipeline
def df_clean(df):
    """Run the full cleaning pipeline on ``df`` and return the cleaned English rows.

    Steps, in order: detect language and keep only rows labelled "en",
    substitute usernames, clean hashtags, strip punctuation, tag parts of
    speech, remove stop words, lemmatize. A progress line is printed after
    each step.

    Parameters:
        df: DataFrame with a "tweet_text" column. It is not modified; the
            pipeline works on a copy.

    Returns:
        A new DataFrame holding the rows detected as English, with cleaned
        "tweet_text" plus the "lang", "mentioned_users" and "hashtags" columns.
    """
    print("Cleaning DataFrame: Starting\n")
    # Copy first so the caller's frame is left untouched and the notebook cell
    # that calls this stays re-runnable.
    df = df_lang_detect(df.copy())
    # Filter to English samples right after detection, so the remaining
    # (expensive) steps do not process rows that would be discarded anyway.
    # .copy() makes the later column assignments target an independent frame
    # instead of a slice of the unfiltered one.
    df = df[df["lang"] == "en"].copy()
    print("Language Filtering: Complete")
    df = df_clean_usernames(df)
    print("Substitute Usernames: Complete")
    df = df_clean_hashtags(df)
    print("Clean Hashtags: Complete")
    df = df_clean_punctuation(df)
    print("Clean Punctuation: Complete")
    df = df_pos_tag(df)
    print("Tag Parts-of-Speech: Complete")
    df = df_clean_stopwords(df)
    print("Remove Stop Words: Complete")
    df = df_lemmatize(df)
    print("Lemmatize Text: Complete")
    return df

# Function to call the cleaning pipeline and write the
# final dataset to disk
def pipeline(path="data/cleaned/cleaned_lemmatized_english.csv",
             source="data/cyberbullying_tweets.csv"):
    """Read the raw tweets, clean them with df_clean and write the result as CSV.

    Parameters:
        path: destination of the cleaned CSV.
        source: location of the raw CSV; defaults to the path that was
            previously hard-coded, so existing calls behave as before.

    Prints ``path`` once the file has been written.
    """
    import os

    # Create the output directory before the long cleaning run, so a missing
    # folder cannot make to_csv fail after all the work has been done.
    if isinstance(path, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(path))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(source)
    df = df_clean(df)
    df.to_csv(path)
    print(path)

### Generating the dataset

In [15]:
# Uncomment to generate entire dataset
#pipeline()
#pipeline("data/cleaned/nohashtag_cleaned_lemmatized_english.csv")

### Testing on 30 samples

In [16]:
# Raw (uncleaned) dataset
# Only the first 30 rows, used to exercise the entire cleaning process
df = pd.read_csv("data/cyberbullying_tweets.csv").iloc[:30]
df.head(30)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@stockputout everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


In [17]:
# Test sample clean
# Clean a copy and bind the result to a new name, so this cell can be re-run
# without feeding already-cleaned text back into the pipeline.
df_cleaned = df_clean(df.copy())
df_cleaned.head(30)

Cleaning DataFrame: Starting

Language Filtering: Complete
Substitute Usernames: Complete
Clean Hashtags: Complete
Clean Punctuation: Complete
Tag Parts-of-Speech: Complete
Remove Stop Words: Complete
Lemmatize Text: Complete


Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,word #hashtag food crapilicious #hashtag,not_cyberbullying,en,,katandandre mkr
1,#hashtag white #hashtag #hashtag #hashtag #has...,not_cyberbullying,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,@username classy whore red velvet cupcake,not_cyberbullying,en,XochitlSuckkks,
3,@username meh p thanks head concern angry dude...,not_cyberbullying,en,Jason_Gio,
4,@username isi account pretend kurdish account ...,not_cyberbullying,en,RudhoeEnglish,
5,@username @username yes test god good bad indi...,not_cyberbullying,en,Raja5aab Quickieleaks,
7,karma hope bite kat butt nasty #hashtag,not_cyberbullying,en,,mkr
8,@username everything mostly priest,not_cyberbullying,en,stockputout,
9,rebecca black drop school due bullying,not_cyberbullying,en,,
10,@username http co usqinyw5gn,not_cyberbullying,en,Jord_Is_Dead,


### Testing Functions for individual tweets

In [8]:
# Testing For Singular Tweet Cleaning Functions
# Very messy sample tweet to clean
fake_tweet = "Bruh what bruh @bruh #bruh: of to did  the #ruh - #hello # @hello. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]]"
# Very simple pipeline tester
def test_pipeline(tweet=fake_tweet):
    """Run one tweet through each cleaning step and print every intermediate result.

    Parameters:
        tweet: the text to clean; defaults to the messy sample ``fake_tweet``.
    """
    print("Source Tweet: ", tweet, "\n")
    # Start from the `tweet` argument; this previously always used fake_tweet,
    # so any tweet passed in was printed but never cleaned.
    x1 = sub_usernames(tweet)
    print("Sub Usernames X1: ",x1, "\n")
    x2 = remove_hashtags(x1)
    print("Remove Hashtags X2: ",x2, "\n")
    x3 = remove_punctuation(x2)
    print("Remove Punctuation X3: ",x3, "\n")
    x4 = tag_pos(x3) # universal tagset
    print("Tag Parts-of-Speech X4: ",x4, "\n")
    x5 = remove_stopwords_pos(x4)
    print("Remove Stop Words X5: ", x5, "\n")
    x6 = lemmatize_tweet(x5)
    print("Lemmatize Tweet X6: ", x6, "\n")

test_pipeline()

Source Tweet:  Bruh what bruh @bruh #bruh: of to did  the #ruh - #hello # @hello. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]] 

Sub Usernames X1:  Bruh what bruh @username #bruh: of to did  the #ruh - #hello # @username. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]] 

Remove Hashtags X2:  Bruh what bruh @username bruh: of to did  the ruh - hello  @username. I am is wowow, gaming, a videos,: wtf !!! ????? ]]]]] 

Remove Punctuation X3:  Bruh what bruh @username bruh of to did the ruh hello @username I am is wowow gaming a videos wtf 



TypeError: tokens: expected a list of strings, got a string

In [9]:
# Draw a 20% random sample of the raw dataset; the fixed random_state makes
# the same rows come back on every run.
df = pd.read_csv("data/cyberbullying_tweets.csv").sample(frac=0.2, random_state=42)
# Preview up to the first 30 sampled rows
df.head(30)

Unnamed: 0,tweet_text,cyberbullying_type
40362,@Goree_JuhssGuns hahaha he ain't even worth my...,ethnicity
15019,RT @hsaymssik: Sucks to have the smile wiped o...,gender
46321,"Just a reminder, it's absolutely disgusting to...",ethnicity
23927,RT @BuzzFeedUK: When you accidentally open you...,other_cyberbullying
1640,Loving the look of the fritters! #mkr,not_cyberbullying
46681,Has 2 interesting events last year involving r...,ethnicity
40139,NIGGERS is the real way you dumb fuck,ethnicity
12668,things that AREN'T jokes - rape - sexism - rac...,gender
36459,"You were bullied? If so, I’m sorry that happen...",age
8442,"Haha did you watch big brother?, “Zankie” was ...",gender
