### Filtering and Mutating Dataset for Training

In [34]:
#!pip3 install langdetect

In [2]:
import pandas as pd
import re
import numpy as np

In [3]:
# Read the raw cyberbullying tweet CSV into a DataFrame and preview its first ten rows
tweets = pd.read_csv(filepath_or_buffer="data/cyberbullying_tweets.csv")
tweets.head(n=10)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@stockputout everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


In [4]:
# Pull out the row at position 2 and show its text followed by its label, one per line
prototweet = tweets.iloc[2]
print(prototweet["tweet_text"], prototweet["cyberbullying_type"], sep="\n")

@XochitlSuckkks a classy whore? Or more red velvet cupcakes?
not_cyberbullying


In [5]:
# List the distinct label values present in the cyberbullying_type column
tweets["cyberbullying_type"].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [6]:
tweets.shape  # (rows, columns): 47692 tweets at this point, before any language filtering

(47692, 2)

In [7]:
# Keep the sample tweet's text as a scratch string for trying out the regex substitution
text = prototweet["tweet_text"]

#### Using Regular Expressions to Mutate Tweet Column

Substitute every @mention with a generic `@user` token. This reduces variability in the data, since individual usernames carry no signal we want the model to learn.

In [8]:
# Try the substitution on the single sample tweet: every "@" followed by word characters becomes "@user"
text = re.sub(pattern=r'@\w+', repl='@user', string=text)
print(text)

@user a classy whore? Or more red velvet cupcakes?


In [9]:
def sub_usernames(text):
    """Replace each @mention in ``text`` with the generic token ``@user``.

    A mention is an ``@`` immediately followed by one or more word characters
    (letters, digits, underscore). Intended to be applied to every value of the
    tweet text column.
    """
    mention = re.compile(r'@\w+')
    return mention.sub('@user', text)

In [10]:
# Overwrite the text column with its username-masked version, then preview the result
tweets["tweet_text"] = tweets["tweet_text"].map(sub_usernames)
tweets.head(n=10)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@user a classy whore? Or more red velvet cupca...,not_cyberbullying
3,"@user meh. :P thanks for the heads up, but no...",not_cyberbullying
4,@user This is an ISIS account pretending to be...,not_cyberbullying
5,"@user @user Yes, the test of god is that good ...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@user everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


In [27]:
#Checking the language of the tweet
from langdetect import detect

def detect_tweet(tweets, detector=None):
    """Find the rows of ``tweets`` whose text is not detected as English.

    Prints a progress line every 1000 rows and a summary at the end.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``tweet_text`` column holding the text to classify.
    detector : callable, optional
        Function taking a string and returning a language code such as
        ``'en'``. Defaults to the notebook-level ``detect`` imported from
        ``langdetect``.

    Returns
    -------
    list of int
        Positional (``iloc``) row numbers of the tweets whose detected
        language is not ``'en'``. Assign the result, e.g.
        ``list_non_english = detect_tweet(tweets)``.

    Notes
    -----
    Rows for which the detector raises are skipped, i.e. they are NOT reported
    as non-English; the number of such rows is printed in the summary.
    Per the langdetect README, detection is non-deterministic unless
    ``DetectorFactory.seed`` is set, so counts may vary slightly between runs.
    """
    if detector is None:
        # Resolve the default outside the try block so that a missing import
        # fails loudly instead of being swallowed once per row.
        detector = detect

    total_tweets = len(tweets)
    list_non_english = []
    undetected = 0

    # Iterate the column directly: same values as tweets.iloc[i]['tweet_text'],
    # without building a full row Series on every iteration.
    for i, tweet_text in enumerate(tweets['tweet_text']):
        if i % 1000 == 0:
            print(f"Status: {round((i / total_tweets) * 100, 2)}%")
        try:
            lang = detector(tweet_text)
        except Exception:
            # Detection can fail on text with no usable features (e.g. empty
            # strings). `Exception` rather than a bare except keeps
            # KeyboardInterrupt working during this long loop.
            undetected += 1
            continue
        if lang != 'en':
            list_non_english.append(i)

    print(f"Total tweets: {total_tweets}")
    print(f"Non-English tweets: {len(list_non_english)}")
    if undetected:
        print(f"Tweets skipped (language could not be detected): {undetected}")
    return list_non_english

In [14]:
# Split the frame using the positional row numbers of the non-English tweets.
# NOTE(review): `list_non_english` must already exist at notebook scope; in the visible
# cells it is only ever created as a local variable inside detect_tweet — confirm it is
# assigned before this cell runs.
non_english_tweets = tweets.iloc[list_non_english]
english_tweets = tweets.drop(index=tweets.index[list_non_english])

print(f"Non-English tweets have: {len(non_english_tweets)} tweets")
print(f"English tweets have: {len(english_tweets)} tweets")

Non-English tweets have: 3115 tweets
English tweets have: 44577 tweets


In [17]:
# Persist both subsets to the working directory, leaving the DataFrame index out of the files
english_tweets.to_csv(path_or_buf='english_tweets.csv', index=False)
non_english_tweets.to_csv(path_or_buf='non_english_tweets.csv', index=False)

### Lowercasing and Removing Stop Words

In [20]:
import nltk
#import spacy -- Spacy is generally faster than nltk


In [24]:
from multiprocessing import Pool

In [23]:
# spaCy has to be imported in this cell: the only other `import spacy` in the notebook
# is commented out, so `spacy.load` would raise NameError on a fresh kernel.
import spacy

# Load the medium English pipeline. Only token text and stop-word flags are used
# downstream, so the listed components are disabled.
nlp = spacy.load('en_core_web_md', disable=['ner', 'parser', 'textcat'])

In [None]:
def remove_stopwords(text):
    """Return ``text`` with every token that spaCy flags as a stop word removed.

    The text is tokenised with the notebook-level ``nlp`` pipeline; the surviving
    token texts are joined with single spaces, so the original spacing between
    tokens is not preserved.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# Store the stop-word-free text in a new 'tweets' column, leaving 'tweet_text' as it was
english_tweets['tweets'] = english_tweets["tweet_text"].map(remove_stopwords)
english_tweets.head(n=10)