## Load Dataset and Libraries

In [None]:
#install required libraries
!pip install spacy_langdetect
!pip install swifter
!pip install ekphrasis
!pip install tweet-preprocessor
!pip install emot
!pip install catboost

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import string as letter
import spacy
import swifter
import re
import preprocessor as p
import nltk
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer, roc_auc_score, classification_report
from time import time
from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from spacy_langdetect import LanguageDetector
from random import randrange
from ekphrasis.classes.segmenter import Segmenter
from functools import partial
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from catboost import Pool, CatBoostClassifier

# Download the NLTK data used for preprocessing.
# Each resource is requested exactly once (the original fetched 'wordnet' twice).
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Connect to Google Drive data if necessary.
# The google.colab package only exists inside a Colab runtime, so the import is
# guarded: outside Colab the mount is skipped instead of aborting "Run All".
try:
  from google.colab import drive
except ImportError:
  print('google.colab is not available; skipping Google Drive mount.')
else:
  drive.mount('/content/drive')

In [None]:
# Path of the input CSV file.
PATH = '/content/cyberbullying_tweets.csv'

# Read the CSV and discard rows that are exact duplicates, in one chain.
df = pd.read_csv(PATH).drop_duplicates()

## Data Exploration

In [None]:
# Category distribution of the data: count the tweets per cyberbullying type
# once and reuse the result for both the printed table and the chart.
sorted_counts = df['cyberbullying_type'].value_counts()
# A bare expression in the middle of a cell is not rendered, so print the table explicitly.
print(sorted_counts)

# Visualize the spread of the dataset as a ring-shaped pie chart
# (wedge width 0.6), with the total tweet count written near the centre.
plt.figure(figsize = (7,7))
plt.pie(sorted_counts, labels = sorted_counts.index, startangle = 90, counterclock = False, wedgeprops = {'width' : 0.6},
       autopct='%1.1f%%', pctdistance = 0.7, textprops = {'color': 'black', 'fontsize' : 15}, shadow = False,
        colors = sns.color_palette("pastel"))
plt.text(x = -0.35, y = 0, s = 'Total Tweets: {}'.format(df.shape[0]))
plt.title('Distribution of Tweets in the Dataset', fontsize = 16);

In [None]:
# Build the spaCy pipeline and append the language detector as its last component.
# spaCy 2.x accepts the 'en' shortcut and a component *instance* in add_pipe.
# spaCy 3.x removed shortcut links and only accepts the *name* of a registered
# factory, so the detector is registered as a factory there first.
if int(spacy.__version__.split('.')[0]) < 3:
  nlp = spacy.load('en')
  nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
else:
  @spacy.language.Language.factory('language_detector')
  def _create_language_detector(nlp, name):
    """Factory hook: spaCy calls this to build the detector component."""
    return LanguageDetector()

  # NOTE(review): assumes the en_core_web_sm model package is installed in
  # the runtime (it is what the 'en' shortcut normally pointed to) - confirm.
  nlp = spacy.load('en_core_web_sm')
  nlp.add_pipe('language_detector', last=True)

# Explore the languages found in the tweets
def detect_language(s):
  """Return the language code detected for the text ``s``.

  The text is run through the module-level ``nlp`` pipeline; the detector
  component stores its result in the ``language`` extension attribute of the
  document, and the value under its ``'language'`` key is returned.
  """
  return nlp(s)._.language['language']

# Tag every tweet with its detected language (applied through swifter).
df['language'] = df['tweet_text'].swifter.apply(detect_language)

# Visualize the spread of the languages as a ring-shaped pie chart.
sorted_counts = df['language'].value_counts()
pie_style = dict(
  labels = sorted_counts.index,
  startangle = 90,
  counterclock = False,
  wedgeprops = {'width' : 0.6},
  autopct = '%1.1f%%',
  pctdistance = 0.7,
  textprops = {'color': 'black', 'fontsize' : 15},
  shadow = False,
  colors = sns.color_palette("pastel"),
)

plt.figure(figsize = (7,7))
plt.pie(sorted_counts, **pie_style)
# Total number of tweets, written near the centre of the ring.
plt.text(x = -0.35, y = 0, s = 'Total Tweets: {}'.format(len(df)))
plt.title('Distribution of Languages in Tweets', fontsize = 16);

In [None]:
# For simplicity's sake, keep only the tweets whose detected language is English.
is_english = df['language'].eq('en')
df = df[is_english]

## Preprocessing

### String Manipulation

### One-Hot Encoding

### Train & Test Datasets

## TF-IDF & Feature Extraction

## Models

### Bagging

#### Parameter Tuning

### GradientBoost

#### Parameter Tuning

### SGD

#### Parameter Tuning

### AdaBoost

#### Parameter Tuning

In [None]:
# Candidate values for the number of AdaBoost estimators.
param_grid = dict(n_estimators=[20, 50, 100, 150, 200])

# Grid search with 3-fold cross-validation on the TF-IDF training features.
grid = GridSearchCV(
  estimator=AdaBoostClassifier(random_state=123),
  param_grid=param_grid,
  cv=3,
  verbose=3,
)
grid.fit(X_train_tfidf, y_train)

# Evaluate the fitted search on the test split, then report the test score,
# the best cross-validation score and the winning parameters (one per line).
score = grid.score(X_test_tfidf, y_test)
y_test_pred = grid.predict(X_test_tfidf)
print(score, grid.best_score_, grid.best_params_, sep='\n')

# Per-class metrics as a dictionary; shown as the cell output.
report = classification_report(y_test, y_test_pred, output_dict=True)
report

### Decision Tree

#### Parameter Tuning

In [None]:
# Candidate split criteria and maximum depths for the decision tree.
param_grid = dict(
  criterion=['gini', 'entropy'],
  max_depth=[2, 4, 6, 8, 10, 12, 15, 20, 30, 40, 50, None],
)

# Grid search with 3-fold cross-validation on the TF-IDF training features.
grid = GridSearchCV(
  estimator=DecisionTreeClassifier(random_state=123),
  param_grid=param_grid,
  cv=3,
  verbose=3,
)
grid.fit(X_train_tfidf, y_train)

# Evaluate the fitted search on the test split, then report the test score,
# the best cross-validation score and the winning parameters (one per line).
score = grid.score(X_test_tfidf, y_test)
y_test_pred = grid.predict(X_test_tfidf)
print(score, grid.best_score_, grid.best_params_, sep='\n')

# Per-class metrics as a dictionary; shown as the cell output.
report = classification_report(y_test, y_test_pred, output_dict=True)
report

### Random Forest

#### Parameter Tuning

### CatBoost

#### Parameter Tuning

### One-vs-Rest

#### Parameter Tuning