In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
data = pd.read_csv("cyberbullying_tweets.csv")
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [20]:
grouped = data.groupby("cyberbullying_type")

print(f"Total Number of Observations: {data.shape[0]}")
print(f"Total Obs per {grouped.size()}")

Total Number of Observations: 47692
Total Obs per cyberbullying_type
age                    7992
ethnicity              7961
gender                 7973
not_cyberbullying      7945
other_cyberbullying    7823
religion               7998
dtype: int64


In [21]:
classes = data['cyberbullying_type'].unique()
classes

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

### Outline


Done:
* Imports and Data Loading
* EDA
* Preprocess Data
    * Remove Duplicates and empty values
    * Tokenize
    * Lowercase
    * Remove non-alphanumeric chars
    * remove single char words
    * Remove Stop Words
    * Lemmatize
    * Randomize order

ToDo:
* Preprocess Data
    * Padding (If we use BERT/Word2Vec)
    * Train/Valid/Test
* BERT/Word2Vec Embeddings (if we want)
* Build Net Architechture (LSTM, BiLSTM, )
* Write Train/Test Functions
* Define Hyperparameters
* Plot Results

### Preprocessing

In [22]:
# Remove empty
print(data.isnull().sum())
# There are no empty values to remove

tweet_text            0
cyberbullying_type    0
dtype: int64


In [23]:
# Remove Duplicates
print(data.shape[0])
print(data["tweet_text"].duplicated().any())
data = data.drop_duplicates(subset=["tweet_text"])
print(data.shape[0])

47692
True
46017


In [24]:
# Lowercase
data["tweet_text"] = data["tweet_text"].apply(lambda x: x.lower())

# Tokenize
data["tweet_text"] = data["tweet_text"].apply(lambda x: word_tokenize(x))

# Remove non-alphanum & single-char
data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if word.isalnum()])
data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if len(word)>1])

# Remove stop words
stop_words = set(stopwords.words('english'))
data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if word not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet_text"] = data["tweet_text"].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet_text"] = data["tweet_text"].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet_text"] = data["tweet_text"].apply(lambda x: [word fo

In [25]:
# Lemmatize
lemmitizer = WordNetLemmatizer()
data["tweet_text"] = data["tweet_text"].apply(lambda x: [lemmitizer.lemmatize(word) for word in x])

# In case these things change with lemmatization
data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if word.isalnum()])
data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if len(word)>1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet_text"] = data["tweet_text"].apply(lambda x: [lemmitizer.lemmatize(word) for word in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet_text"] = data["tweet_text"].apply(lambda x: [word for word in x if word.isalnum()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["twe

In [26]:
# data is initially organized in order of class, which will not be suitable for train/valid/test split
data = data.sample(frac=1, replace=False, random_state=42)
data.head(10)

Unnamed: 0,tweet_text,cyberbullying_type
26156,"[andyeaston85, love, teaching, bullshitters, l...",other_cyberbullying
15596,"[gyuk, youtuber, double, vile, jess, phillips,...",gender
19412,"[islam, declared, war, mankind, 1400, year, ag...",religion
44068,"[broconfessional, asheton3195, weak, fuck, lma...",ethnicity
7305,"[kat, andre, behaving, spiteful, vindictive, m...",not_cyberbullying
6137,"[hear, snowing, north, glad, made, snow, started]",not_cyberbullying
13332,"[real, men, call, woman, bitch, really, sorry,...",gender
26754,"[oshit, bully, part, ok, lol, goin, try, slid,...",other_cyberbullying
36065,"[jazzruma, must, talk, ella, sha, made, bigges...",age
23788,"[call, twitter, allow, best, apt, word, idiot,...",religion


In [31]:
data.to_csv("cyber_bully_cleaned.csv")

### Network