In [9]:
#!python -m spacy download en_core_web_md
#nltk.download('stopwords')
!pip3 install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=f8b69fa9a587827eeb44818f6955f15c5770b78b405768494b8d8a026fd51e65
  Stored in directory: c:\users\cownj\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [10]:
import pandas as pd
import re
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer
import spacy

from langdetect import detect

# Init Tools
# spaCy pipeline loaded with the 'ner', 'parser' and 'textcat' components disabled.
nlp = spacy.load('en_core_web_md', disable=['ner', 'parser', 'textcat'])
# A token is an @mention, a #hashtag or a plain run of word characters, so
# #words and @words are left untouched while everything else is dropped.
# The alternation is written without capturing parentheses, as the NLTK
# RegexpTokenizer documentation requires; the tokens produced are the same.
tokenizer = RegexpTokenizer(r"@\w+|#\w+|\w+")

In [11]:
from nltk.corpus import stopwords
# NLTK's English stop-word list; removeStopWords uses it as its default.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (see the commented nltk.download call in the first cell) - confirm.
stpWords = stopwords.words('english')
# Show the list so the reader can see which words will be filtered out.
print(stpWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Helper Functions

In [12]:
# Extracts the usernames mentioned in a tweet
def extract_usernames(text):
    """Return every @-mentioned name in ``text`` as one space-separated string.

    A mention is an ``@``, optional whitespace, then a run of word characters.
    Only the word characters are captured, so a mention written as ``@ name``
    yields ``name`` with no leading blank (stripping just the ``@`` would keep
    the whitespace and produce doubled spaces in the joined result).

    Args:
        text: Tweet text to scan.

    Returns:
        The mentioned names joined by single spaces, or "" when there are none.
    """
    # With exactly one capture group, findall returns the group contents.
    return " ".join(re.findall(r'@\s*(\w+)', text))

# Mention masking, meant to be applied to a whole text column
def sub_usernames(text):
    """Replace each ``@`` followed by word characters with ``@username``.

    An ``@`` that is not directly followed by a word character is left as is.
    """
    mention = re.compile(r'@\w+')
    return mention.sub('@username', text)

# Extracts the Hashtags and returns them as a single string
def extract_hashtags(text):
    """Return every hashtag in ``text`` (without the ``#``) as one string.

    A hashtag is a ``#``, optional whitespace, then a run of word characters.
    Only the word characters are captured, so ``# tag`` yields ``tag`` with no
    leading blank (stripping just the ``#`` would keep the whitespace and
    produce doubled spaces in the joined result).

    Args:
        text: Tweet text to scan.

    Returns:
        The tag names joined by single spaces, or "" when there are none.
    """
    # With exactly one capture group, findall returns the group contents.
    return " ".join(re.findall(r'#\s*(\w+)', text))

# Masks every hashtag with the generic token #hashtag
def remove_hashtags(text):
    """Replace each ``#`` followed by word characters with ``#hashtag``.

    Despite the name nothing is deleted: the tag is swapped for a placeholder.
    A ``#`` that is not directly followed by a word character is left as is.
    """
    tag = re.compile(r'#\w+')
    return tag.sub('#hashtag', text)

# Strip punctuation by keeping only what the tokenizer recognises
def remove_punctuation(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and re-join.

    Whatever the tokenizer does not emit as a token is dropped; the tokens
    that remain are separated by single spaces.
    """
    return ' '.join(tokenizer.tokenize(text))

# Drop the tokens that spaCy flags as stop words
def remove_stopwords(text):
    """Run ``text`` through the notebook-level ``nlp`` pipeline and keep the
    text of every token whose ``is_stop`` flag is false, joined by spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# Stop-word removal based on the NLTK list. It goes through the regexp
# tokenizer instead of spaCy because spaCy splits "#hashtag" into "# hashtag".
def removeStopWords(words, stop_words=stpWords):
    """Lower-case ``words``, tokenize it and drop every stop word.

    Args:
        words: Text to clean (a single string, not a list of words).
        stop_words: Container of lower-case stop words; defaults to the
            notebook-level ``stpWords`` list.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    # Membership in a list/tuple is a linear scan for every token; a frozenset
    # gives the same answers in constant time. Other container types are used
    # as passed so their own membership semantics are preserved.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)
    # Lower-case once per token; the comparison and the output both use it.
    lowered = (token.lower() for token in tokenizer.tokenize(words))
    return ' '.join(token for token in lowered if token not in stop_words)


# Detect Language of Tweet Text
def detect_language(tweet):
    """Return the language code that ``detect`` reports for ``tweet``.

    Args:
        tweet: Text of a single tweet.

    Returns:
        Whatever ``detect`` returns, or the sentinel string "lang_error" when
        detection raises an ordinary exception.
    """
    try:
        return detect(tweet)
    except Exception:
        # Best effort: one undetectable tweet must not abort a whole run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long loop can still be interrupted.
        return "lang_error"

#### Execution Order Per Tweet:

1. Clean Usernames
2. Clean Hashtags
3. Clean Punctuation
4. Clean Stop Words
5. Lemmatize (not implemented yet: `df_clean` currently stops after step 4)

#### Main Cleaning Functions

In [13]:
# Mask all @mentions in the dataframe tweet text
# Also create a new column containing the extracted usernames
def df_clean_usernames(df):
    """Add a ``mentioned_users`` column and mask mentions in ``tweet_text``.

    Both columns are computed from the ``tweet_text`` values as they were on
    entry. ``df`` is modified in place and the same object is returned.
    """
    tweet_text = df["tweet_text"]
    df["mentioned_users"] = tweet_text.apply(extract_usernames)
    df["tweet_text"] = tweet_text.apply(sub_usernames)
    return df

# Mask all hashtags in the dataframe tweet text
# Also create a new column containing the extracted hashtags
def df_clean_hashtags(df):
    """Add a ``hashtags`` column and mask hashtags in ``tweet_text``.

    Both columns are computed from the ``tweet_text`` values as they were on
    entry. ``df`` is modified in place and the same object is returned.
    """
    tweet_text = df["tweet_text"]
    df["hashtags"] = tweet_text.apply(extract_hashtags)
    df["tweet_text"] = tweet_text.apply(remove_hashtags)
    return df

# Strip punctuation from every tweet in the data frame
def df_clean_punctuation(df):
    """Overwrite ``tweet_text`` with ``remove_punctuation`` applied per row.

    ``df`` is modified in place and the same object is returned.
    """
    without_punctuation = df["tweet_text"].apply(remove_punctuation)
    df["tweet_text"] = without_punctuation
    return df

# Drop stop words from every tweet in the data frame.
# Uses removeStopWords (NLTK list) rather than the spaCy-based
# remove_stopwords until the hashtag tokenization issue is fixed;
# the file's own note says this variant is the slower of the two.
def df_clean_stopwords(df):
    """Overwrite ``tweet_text`` with ``removeStopWords`` applied per row.

    ``df`` is modified in place and the same object is returned.
    """
    without_stopwords = df["tweet_text"].apply(removeStopWords)
    df["tweet_text"] = without_stopwords
    return df


def df_clean(df):
    """Run the cleaning stages on ``df`` in a fixed order and return the result.

    Order: usernames, hashtags, punctuation, stop words. Each stage receives
    the value returned by the previous one.
    """
    stages = (
        df_clean_usernames,
        df_clean_hashtags,
        df_clean_punctuation,
        df_clean_stopwords,
    )
    for stage in stages:
        df = stage(df)
    return df

# Detect the language of every tweet and report how many are not English
def detect_tweet(tweets):
    """Run language detection over ``tweets['tweet_text']`` and print a summary.

    Every tweet is checked; a progress line is printed once per 1000 rows.
    A tweet counts as non-English whenever ``detect_language`` returns
    anything other than 'en'.

    Args:
        tweets: DataFrame with a 'tweet_text' column.

    Returns:
        List of the positional row indices (0-based) of the non-English tweets.
    """
    total_tweets = len(tweets)
    list_non_english = []

    for i, text in enumerate(tweets['tweet_text']):
        # Progress report only; detection below must run for every row.
        if i % 1000 == 0:
            print(f"Status: {round((i/total_tweets) * 100, 2)}%")
        if detect_language(text) != 'en':
            list_non_english.append(i)

    non_english_tweets = len(list_non_english)
    print(f"Total tweets: {total_tweets}")
    print(f"Non-English tweets: {non_english_tweets}")
    return list_non_english


### Testing

In [6]:
# Load the tweet dataset; the path is relative to the working directory.
df = pd.read_csv("data/cyberbullying_tweets.csv")
# Preview the first rows.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [7]:

# Run the whole cleaning pipeline. The cleaning steps assign into the frame
# they are given, so after this cell `df` no longer holds the original text.
df = df_clean(df)
# Preview the cleaned rows together with the two extracted columns.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,mentioned_users,hashtags
0,words #hashtag food crapilicious #hashtag,not_cyberbullying,,katandandre mkr
1,#hashtag white #hashtag #hashtag #hashtag #has...,not_cyberbullying,,aussietv MKR theblock ImACelebrityAU today sun...
2,@username classy whore red velvet cupcakes,not_cyberbullying,XochitlSuckkks,
3,@username meh p thanks heads concerned another...,not_cyberbullying,Jason_Gio,
4,@username isis account pretending kurdish acco...,not_cyberbullying,RudhoeEnglish,


In [10]:
# Looking at one tweet
prototweet = df.iloc[12]["tweet_text"]
# The value printed is the tweet text, so it is labelled as such.
print("Tweet:", prototweet)

Class: Ughhhh #MKR


In [11]:
# Hashtags found in the sample tweet selected above.
print(extract_hashtags(prototweet))

MKR


In [23]:
# Testing
# Walk a hand-written tweet through the text-cleaning helpers one stage at a
# time, printing the intermediate string after each stage.
fake_tweet = "Bruh what bruh @bruh #bruh: of to did  the #ruh - #hello # @hello. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]]"
print(fake_tweet, "\n")
# Stage 1: mask @mentions.
x1 = sub_usernames(fake_tweet)
print(x1, "\n")
# Stage 2: mask #hashtags.
x2 = remove_hashtags(x1)
print(x2, "\n")
# Stage 3: drop punctuation.
x3 = remove_punctuation(x2)
print(x3, "\n")
# Stage 4: drop stop words (NLTK variant; the spaCy variant is kept for reference).
#x4 = remove_stopwords(x3)
x4 = removeStopWords(x3, stpWords)
print(x4, "\n")

Bruh what bruh @bruh #bruh: of to did  the #ruh - #hello # @hello. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]] 

Bruh what bruh @username #bruh: of to did  the #ruh - #hello # @username. I am is wowow, #gaming, a videos,: wtf !!! ????? ]]]]] 

Bruh what bruh @username #hashtag: of to did  the #hashtag - #hashtag # @username. I am is wowow, #hashtag, a videos,: wtf !!! ????? ]]]]] 

Bruh what bruh @username #hashtag of to did the #hashtag #hashtag @username I am is wowow #hashtag a videos wtf 

bruh bruh @username #hashtag #hashtag #hashtag @username wowow #hashtag videos wtf 



In [96]:
# Hashtag masking on its own.
# NOTE(review): the saved output of this cell does not match the fake_tweet
# defined above (different words), so it looks stale - re-run to confirm.
remove_hashtags(fake_tweet)

'Bruh bruh @bruh #hashtag: #hashtag - #hashtag # @hello. I am gaming, #hashtag, gaming,: wtf !!! ????? ]]]]]'

In [117]:
# Hashtag masking followed by punctuation removal.
# NOTE(review): the saved output of this cell does not match the fake_tweet
# defined above (different words), so it looks stale - re-run to confirm.
remove_punctuation(remove_hashtags(fake_tweet))

'Bruh what bruh @bruh #hashtag of to did the #hashtag #hashtag @hello I am is gaming #hashtag a gaming wtf'

In [76]:
# Reload the CSV so the two steps below start from the text as stored on disk.
df = pd.read_csv("data/cyberbullying_tweets.csv")
# Show one tweet before any cleaning for comparison.
print(df["tweet_text"].iloc[1])
# Apply only the username and hashtag steps (no punctuation/stop-word removal).
df = df_clean_usernames(df)
df = df_clean_hashtags(df)
# NOTE(review): the saved table shows "@user" while sub_usernames writes
# "@username", so the output presumably predates the current helper - re-run.
df.head(10)

Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc


Unnamed: 0,tweet_text,cyberbullying_type,mentioned_users,hashtags
0,"In other words #hashtag, your food was crapili...",not_cyberbullying,,katandandre mkr
1,Why is #hashtag so white? #hashtag #hashtag #h...,not_cyberbullying,,aussietv MKR theblock ImACelebrityAU today sun...
2,@user a classy whore? Or more red velvet cupca...,not_cyberbullying,XochitlSuckkks,
3,"@user meh. :P thanks for the heads up, but no...",not_cyberbullying,Jason_Gio,
4,@user This is an ISIS account pretending to be...,not_cyberbullying,RudhoeEnglish,
5,"@user @user Yes, the test of god is that good ...",not_cyberbullying,Raja5aab Quickieleaks,
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,,
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,,mkr
8,@user everything but mostly my priest,not_cyberbullying,stockputout,
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,,
