In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the raw tweet/label table from the working directory and show the
# first five rows as the cell's rich output.
data = pd.read_csv("cyberbullying_tweets.csv")
data.head(5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
# Missing-value audit: count the null entries in every column.
# The printed counts are all zero, so no rows have to be dropped here.
missing_per_column = data.isna().sum()
print(missing_per_column)

tweet_text            0
cyberbullying_type    0
dtype: int64


In [4]:
# First de-duplication pass, on the raw tweet text. Every repeat after the
# first occurrence of a text is discarded. The row count is printed before
# and after, together with a flag saying whether any repeats were found.
is_repeat = data["tweet_text"].duplicated()
print(len(data))
print(is_repeat.any())
data = data[~is_repeat]
print(len(data))

47692
True
46017


In [5]:
# Text normalisation, done in a single pass per tweet:
#   1. lowercase the raw text,
#   2. split it into tokens with NLTK's word_tokenize,
#   3. keep only tokens that are purely alphanumeric, longer than one
#      character, and not in NLTK's English stop-word list.
# After this cell `tweet_text` holds a list of tokens instead of a string.
#
# Requires the NLTK tokenizer models and the stopwords corpus to be
# available locally.
#
# NOTE(review): word_tokenize separates the leading "@" / "#" from a handle
# or hashtag, so the bare user name / tag word passes the alphanumeric
# filter and stays in the text (visible in the preview a few cells below).
# Confirm that this is intended.
stop_words = set(stopwords.words('english'))


def _normalise_tweet(text):
    """Return the filtered, lowercased token list for one raw tweet string."""
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and len(token) > 1 and token not in stop_words
    ]


data["tweet_text"] = data["tweet_text"].apply(_normalise_tweet)

In [6]:
# Reduce every token to its WordNet lemma (default part of speech), then
# re-apply the alphanumeric and minimum-length filters, because a lemma can
# differ from the token it was derived from.
# Requires the NLTK WordNet corpus to be available locally.
lemmitizer = WordNetLemmatizer()

data["tweet_text"] = data["tweet_text"].apply(
    lambda tokens: [
        lemma
        for lemma in map(lemmitizer.lemmatize, tokens)
        if lemma.isalnum() and len(lemma) > 1
    ]
)

In [7]:
# Collapse each token list back into one space-separated string, then
# preview the cleaned text.
data["tweet_text"] = data["tweet_text"].apply(" ".join)
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,word katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,xochitlsuckkks classy whore red velvet cupcake,not_cyberbullying
3,meh thanks head concerned another angry dude t...,not_cyberbullying
4,rudhoeenglish isi account pretending kurdish a...,not_cyberbullying


In [8]:
# Second de-duplication pass. Tweets that differed in their raw form can
# become identical once cleaned, so the check is repeated on the cleaned
# text. Prints the row count before, whether repeats exist, and the row
# count after.
first_occurrence = ~data.duplicated(subset=["tweet_text"], keep="first")
print(len(data.index))
print((~first_occurrence).any())
data = data[first_occurrence]
print(len(data.index))

46017
True
45247


In [9]:
# Class balance after cleaning: overall row count plus rows per label.
# The second message deliberately ends in "per " so that the first line of
# the printed Series (its index name) completes the heading.
grouped = data.groupby("cyberbullying_type")
rows_per_class = grouped.size()

print(f"Total Number of Observations: {len(data)}")
print(f"Total Obs per {rows_per_class}")

Total Number of Observations: 45247
Total Obs per cyberbullying_type
age                    7887
ethnicity              7817
gender                 7704
not_cyberbullying      7814
other_cyberbullying    6066
religion               7959
dtype: int64


In [10]:
# Distinct labels in order of first appearance. The "other_cyberbullying"
# rows are still present at this point; `classes` is computed again further
# down, after that label has been filtered out.
classes = pd.unique(data["cyberbullying_type"])

In [11]:
# Discard every row labelled "other_cyberbullying" and keep only the text
# and label columns, then preview the result.
not_other = data["cyberbullying_type"].ne("other_cyberbullying")
data = data.loc[not_other, ["tweet_text", "cyberbullying_type"]]
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,word katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,xochitlsuckkks classy whore red velvet cupcake,not_cyberbullying
3,meh thanks head concerned another angry dude t...,not_cyberbullying
4,rudhoeenglish isi account pretending kurdish a...,not_cyberbullying


In [12]:
# Cleaning can strip a tweet down to nothing; keep only the rows whose text
# still contains at least one whitespace-separated word.
has_any_word = data["tweet_text"].apply(lambda text: bool(text.split()))
data = data[has_any_word]


In [13]:
# Final label set, recomputed now that "other_cyberbullying" has been
# removed; displayed as the cell output.
classes = pd.unique(data["cyberbullying_type"])
classes

array(['not_cyberbullying', 'gender', 'religion', 'age', 'ethnicity'],
      dtype=object)

In [14]:
# Spot check of one cleaned row, selected by position. Rows were dropped
# earlier without resetting the index, so the positional offset and the
# row's index label (shown as `Name` in the output) are not the same number.
spot_check_position = 307
data.iloc[spot_check_position]

tweet_text            manu camping involved would show either mkr
cyberbullying_type                              not_cyberbullying
Name: 308, dtype: object

### Outline


Done:
* Imports and Data Loading
* EDA
* Preprocess Data
    * Remove Duplicates and empty values
    * Tokenize
    * Lowercase
    * Remove non-alphanumeric chars
    * Remove single-character words
    * Remove Stop Words
    * Lemmatize
    * Randomize order

ToDo:
* Preprocess Data
    * Padding (If we use BERT/Word2Vec)
    * Train/Valid/Test
* BERT/Word2Vec Embeddings (if we want)
* Build Net Architecture (LSTM, BiLSTM, ...)
* Write Train/Test Functions
* Define Hyperparameters
* Plot Results

### Preprocessing

In [15]:
# Persist the cleaned text/label table to the working directory.
# The DataFrame index is written as the first, header-less column (pandas'
# default, spelled out here).
# NOTE(review): if downstream readers do not need the original row labels,
# index=False would avoid an "Unnamed: 0" column on re-load. Confirm with
# the consumers of this file before changing it.
data.to_csv("cyber_bully_cleaned.csv", index=True)