## Load Dataset and Libraries

In [None]:
#install required libraries
!pip install spacy_langdetect
!pip install swifter
!pip install ekphrasis
!pip install tweet-preprocessor
!pip install emot
!pip install catboost

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import string as letter
import spacy
import swifter
import re
import preprocessor as p
import nltk
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer, roc_auc_score, classification_report
from time import time
from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from spacy_langdetect import LanguageDetector
from random import randrange
from ekphrasis.classes.segmenter import Segmenter
from functools import partial
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from catboost import Pool, CatBoostClassifier

# Download the NLTK resources used during preprocessing:
# WordNet (lemmatizer), punkt (word_tokenize) and the English stop-word list.
# Each resource is requested once (the original requested 'wordnet' twice).
for resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(resource)

In [None]:
# Connect to Google Drive if the input files live there.
# This uses the google.colab package and mounts the Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the input CSV file
PATH = '/content/cyberbullying_tweets.csv'
# Read the tweets and discard duplicated rows in a single expression
df = pd.read_csv(PATH).drop_duplicates()

## Data Exploration

In [None]:
# Category distribution of the data (computed once and reused for the chart)
sorted_counts = df['cyberbullying_type'].value_counts()

# Visualize the spread of the dataset as a donut chart
plt.figure(figsize=(7, 7))
plt.pie(
    sorted_counts,
    labels=sorted_counts.index,
    startangle=90,
    counterclock=False,
    wedgeprops={'width': 0.6},
    autopct='%1.1f%%',
    pctdistance=0.7,
    textprops={'color': 'black', 'fontsize': 15},
    shadow=False,
    colors=sns.color_palette("pastel"),
)
# Total number of tweets written in the hole of the donut
plt.text(x=-0.35, y=0, s='Total Tweets: {}'.format(df.shape[0]))
plt.title('Distribution of Tweets in the Dataset', fontsize=16);

In [None]:
# Load spaCy's English pipeline and append a language-detection component as
# its last step; detect_language() below reads the result from `doc._.language`.
# NOTE(review): `spacy.load('en')` and passing a component instance to
# `add_pipe` look like the spaCy v2 API — confirm the installed spaCy version.
nlp = spacy.load('en') 
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# Explore the languages found in the tweets
def detect_language(s):
  """Run `s` through the spaCy pipeline and return the 'language' entry of the detector result."""
  return nlp(s)._.language['language']

# Detect the language of every tweet (swifter parallelises the apply when it can)
df['language'] = df['tweet_text'].swifter.apply(detect_language)

# Visualize the spread of the languages as a donut chart
sorted_counts = df['language'].value_counts()
plt.figure(figsize=(7, 7))
plt.pie(
    sorted_counts,
    labels=sorted_counts.index,
    startangle=90,
    counterclock=False,
    wedgeprops={'width': 0.6},
    autopct='%1.1f%%',
    pctdistance=0.7,
    textprops={'color': 'black', 'fontsize': 15},
    shadow=False,
    colors=sns.color_palette("pastel"),
)
plt.text(x=-0.35, y=0, s='Total Tweets: {}'.format(df.shape[0]))
plt.title('Distribution of Languages in Tweets', fontsize=16);

In [None]:
# For simplicity's sake, keep only the English tweets.
# .copy() makes `df` an independent frame, so the columns assigned to it in
# later cells do not raise pandas' SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()

## Preprocessing

In [None]:
# Numeric id assigned to each cyberbullying type
type_to_id = {
    'not_cyberbullying': 0,
    'gender': 1,
    'ethnicity': 2,
    'age': 3,
    'religion': 4,
    'other_cyberbullying': 5,
}
# Map cyberbullying types to numbers
df['cyberbullying_type_id'] = df['cyberbullying_type'].map(type_to_id)

In [None]:
df['cyberbullying_type_id'].value_counts()

4    7969
3    7946
1    7636
2    7521
5    6913
0    6638
Name: cyberbullying_type_id, dtype: int64

### String Manipulation

In [None]:
# Apostrophe variants seen in tweets: typographic (’) and straight (').
_APOSTROPHE = "[’']"

# (compiled pattern, replacement) pairs, applied in this order.
# Specific words come before the generic suffix rules so that, for example,
# "can't" becomes "can not" rather than "ca not". The suffix rules require a
# preceding word character and a trailing word boundary, so an apostrophe used
# as a quotation mark ('the) is not mistaken for a contraction.
_CONTRACTION_RULES = [
  (re.compile(r"\bwon" + _APOSTROPHE + r"t\b"), "will not"),
  (re.compile(r"\bcan" + _APOSTROPHE + r"t\b"), "can not"),
  (re.compile(r"\bcannot\b"), "can not"),
  (re.compile(r"n" + _APOSTROPHE + r"t\b"), " not"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"re\b"), " are"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"s\b"), " is"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"d\b"), " would"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ll\b"), " will"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"t\b"), " not"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ve\b"), " have"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"m\b"), " am"),
]

# One translation table that maps accented letters to their plain form and
# deletes every ASCII punctuation character.
_CHAR_TABLE = str.maketrans('áàâéèêëîïöôòóùûüç', 'aaaeeeeiioooouuuc', letter.punctuation)


def contractions(string):
  """Expand contractions, strip accents and remove ASCII punctuation.

  The patterns are lower-case, so the text is expected to be lower-cased
  already. Each substitution works on the result of the previous one; the
  original code restarted from the unmodified input every time and then
  discarded the result, so nothing was ever expanded.

  Args:
    string: text to normalise.

  Returns:
    The text with contractions expanded ("can't" -> "can not"), the accented
    letters áàâéèêëîïöôòóùûüç replaced by their unaccented form, and every
    character of `string.punctuation` removed.
  """
  for pattern, replacement in _CONTRACTION_RULES:
    string = pattern.sub(replacement, string)

  # Accents and punctuation are handled in a single pass
  return string.translate(_CHAR_TABLE)

# hashtags
def deal_with_hashtags(string):
  """Replace every hashtag in `string` with its word-segmented form.

  Each `#tag` (a '#' followed by word characters) is replaced by
  `seg_tw.segment(tag)`.

  NOTE(review): `seg_tw` is a module-level segmenter that is not defined in
  the visible part of the notebook — presumably an ekphrasis `Segmenter`
  instance; confirm it is created before this function runs.

  Args:
    string: text that may contain hashtags.

  Returns:
    The text with every hashtag replaced by its segmentation.
  """
  # A single pass with a callback replaces each tag exactly where it was found.
  # The previous per-tag global substitution also rewrote the beginning of any
  # longer tag sharing the same prefix (#foo inside #foobar), and treated the
  # segmented text as a regex replacement template.
  return re.sub(r"#(\w+)", lambda match: str(seg_tw.segment(match.group(1))), string)

# ats
def deal_with_ats(string):
  """Replace every @mention in `string` with its word-segmented form.

  Each `@name` (an '@' followed by word characters) is replaced by
  `seg_tw.segment(name)`.

  NOTE(review): `seg_tw` is a module-level segmenter that is not defined in
  the visible part of the notebook — presumably an ekphrasis `Segmenter`
  instance; confirm it is created before this function runs.

  Args:
    string: text that may contain mentions.

  Returns:
    The text with every mention replaced by its segmentation.
  """
  # The previous version searched for "#" + name instead of "@" + name, so no
  # mention was ever replaced. A single pass with a callback fixes that and
  # also avoids rewriting the start of a longer mention with the same prefix.
  return re.sub(r"@(\w+)", lambda match: str(seg_tw.segment(match.group(1))), string)

# replace emojis with words
# NOTE: unpickling can execute arbitrary code — only load a trusted file here.
with open('/content/drive/MyDrive/DSBA/EL/Project/Emoji_Dict.p', 'rb') as fp:
    # Swap keys and values of the pickled mapping so that Emoji_Dict is keyed
    # by the strings that convert_emojis_to_word() searches for in the text.
    Emoji_Dict = {value: key for key, value in pickle.load(fp).items()}

def convert_emojis_to_word(string):
    """Replace every Emoji_Dict key found in `string` by a space plus its mapped value without colons."""
    for symbol, name in Emoji_Dict.items():
        string = string.replace(symbol, " " + name.replace(":", ""))
    return string
    
# replace emoticons with words

def convert_emoticons(string):
    """Replace every EMOTICONS_EMO key found in `string` by a space plus its mapped description."""
    for emoticon, description in EMOTICONS_EMO.items():
        string = string.replace(emoticon, " " + description)
    return string

# Unicode
def removeUnicode(text):
    r"""Remove literal \uXXXX escape sequences and all non-ASCII characters.

    Args:
        text: string to clean.

    Returns:
        `text` without textual escapes such as "\u002c" (backslash, 'u' and
        four hex digits) and without any character outside \x00-\x7f.
    """
    # Exactly four hex digits: an unbounded quantifier would also swallow the
    # hex-looking letters (a-f) of a word that directly follows the escape.
    text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)
    text = re.sub(r'[^\x00-\x7f]', '', text)
    return text

# URLs
def replaceURL(text):
    """Replace URLs with the token "url" and drop the '#' in front of hashtags.

    Args:
        text: string to clean.

    Returns:
        `text` where every run starting with "www." or "http(s)://" up to the
        next whitespace is replaced by "url", and every "#word" becomes "word".
    """
    # Raw strings keep the regex escapes (\. and \s) from being read as
    # invalid Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

# replace slang
# Build a dictionary of slang terms and their equivalents, plus a function
# that replaces them in a string.
with open('/content/drive/MyDrive/DSBA/EL/Project/slang.txt') as file:
    # Every non-blank line is "<slang><TAB><equivalent>"
    slang_map = {
        slang.strip(): equivalent.strip()
        for slang, _tab, equivalent in (row.partition('\t') for row in file if row.strip())
    }

# Longest terms first, so the alternation prefers the longest possible match
slang_words = sorted(slang_map, key=len, reverse=True)
regex = re.compile(r"\b({})\b".format("|".join(re.escape(term) for term in slang_words)))

# replaceSlang(text) -> text with every whole-word slang term substituted
replaceSlang = partial(regex.sub, lambda found: slang_map[found.group(1)])

In [None]:
# one single function to call for string manipulation

def preprocess(df):
    """Create the `pre_processed` column of `df` from its `tweet_text` column.

    Every tweet goes through the same steps, in the same order as before:
    lower-casing, contraction/accents/punctuation handling, hashtag and
    mention segmentation, emoji and emoticon conversion, unicode removal, URL
    replacement, slang replacement, removal of non-alphabetic characters,
    whitespace collapsing, stop-word removal and lemmatization.

    All steps now run in one pass over the column instead of eleven separate
    passes, and stop words are looked up in a set instead of a list.

    Args:
        df: DataFrame with a `tweet_text` column.

    Returns:
        The same DataFrame object (modified in place) with the added
        `pre_processed` column.
    """
    # A set gives constant-time membership tests for the stop-word filter
    stop = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def clean(tweet):
        """Apply every cleaning step to one tweet and return the cleaned text."""
        # lower case and normalise whitespace
        text = ' '.join(token.lower() for token in str(tweet).split())

        # NOTE(review): contractions() also strips ASCII punctuation ('#', '@',
        # ':', '/', ...), so the hashtag, mention, emoticon and URL steps below
        # receive text whose markers are already gone. Consider moving it
        # after those steps — confirm the intended order before changing it.
        text = contractions(text)
        text = deal_with_hashtags(text)
        text = deal_with_ats(text)
        text = convert_emojis_to_word(text)
        text = convert_emoticons(text)
        text = removeUnicode(text)
        text = replaceURL(text)
        text = replaceSlang(text)

        # remove non-alphabet characters token by token
        text = " ".join(re.sub("[^A-Za-z]+", "", token) for token in nltk.word_tokenize(text))
        # remove extra spaces left behind by emptied tokens
        text = re.sub(' +', " ", text)
        # remove stop words
        text = " ".join(word for word in text.split() if word not in stop)
        # lemmatization
        return " ".join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text))

    df['pre_processed'] = df['tweet_text'].swifter.apply(clean)

    return df

In [None]:
# apply pre-processing
df2 = preprocess(df)

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

In [None]:
# Keep only the rows whose pre-processed text is not the empty string.
# (No NaN filtering happens here; this line only compares against ''.)
df2 = df2[df2['pre_processed'] != '']

### One-Hot Encoding

In [None]:
# Apostrophe variants seen in tweets: typographic (’) and straight (').
_APOSTROPHE = "[’']"

# (compiled pattern, replacement) pairs, applied in this order.
# Specific words come before the generic suffix rules so that, for example,
# "can't" becomes "can not" rather than "ca not". The suffix rules require a
# preceding word character and a trailing word boundary, so an apostrophe used
# as a quotation mark ('the) is not mistaken for a contraction.
_CONTRACTION_RULES = [
  (re.compile(r"\bwon" + _APOSTROPHE + r"t\b"), "will not"),
  (re.compile(r"\bcan" + _APOSTROPHE + r"t\b"), "can not"),
  (re.compile(r"\bcannot\b"), "can not"),
  (re.compile(r"n" + _APOSTROPHE + r"t\b"), " not"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"re\b"), " are"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"s\b"), " is"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"d\b"), " would"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ll\b"), " will"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"t\b"), " not"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ve\b"), " have"),
  (re.compile(r"(?<=\w)" + _APOSTROPHE + r"m\b"), " am"),
]

# One translation table that maps accented letters to their plain form and
# deletes every ASCII punctuation character.
_CHAR_TABLE = str.maketrans('áàâéèêëîïöôòóùûüç', 'aaaeeeeiioooouuuc', letter.punctuation)


def contractions(string):
  """Expand contractions, strip accents and remove ASCII punctuation.

  The patterns are lower-case, so the text is expected to be lower-cased
  already. Each substitution works on the result of the previous one; the
  original code restarted from the unmodified input every time and then
  discarded the result, so nothing was ever expanded.

  Args:
    string: text to normalise.

  Returns:
    The text with contractions expanded ("can't" -> "can not"), the accented
    letters áàâéèêëîïöôòóùûüç replaced by their unaccented form, and every
    character of `string.punctuation` removed.
  """
  for pattern, replacement in _CONTRACTION_RULES:
    string = pattern.sub(replacement, string)

  # Accents and punctuation are handled in a single pass
  return string.translate(_CHAR_TABLE)

# hashtags
def deal_with_hashtags(string):
  """Replace every hashtag in `string` with its word-segmented form.

  Each `#tag` (a '#' followed by word characters) is replaced by
  `seg_tw.segment(tag)`.

  NOTE(review): `seg_tw` is a module-level segmenter that is not defined in
  the visible part of the notebook — presumably an ekphrasis `Segmenter`
  instance; confirm it is created before this function runs.

  Args:
    string: text that may contain hashtags.

  Returns:
    The text with every hashtag replaced by its segmentation.
  """
  # A single pass with a callback replaces each tag exactly where it was found.
  # The previous per-tag global substitution also rewrote the beginning of any
  # longer tag sharing the same prefix (#foo inside #foobar), and treated the
  # segmented text as a regex replacement template.
  return re.sub(r"#(\w+)", lambda match: str(seg_tw.segment(match.group(1))), string)

# ats
def deal_with_ats(string):
  """Replace every @mention in `string` with its word-segmented form.

  Each `@name` (an '@' followed by word characters) is replaced by
  `seg_tw.segment(name)`.

  NOTE(review): `seg_tw` is a module-level segmenter that is not defined in
  the visible part of the notebook — presumably an ekphrasis `Segmenter`
  instance; confirm it is created before this function runs.

  Args:
    string: text that may contain mentions.

  Returns:
    The text with every mention replaced by its segmentation.
  """
  # The previous version searched for "#" + name instead of "@" + name, so no
  # mention was ever replaced. A single pass with a callback fixes that and
  # also avoids rewriting the start of a longer mention with the same prefix.
  return re.sub(r"@(\w+)", lambda match: str(seg_tw.segment(match.group(1))), string)

# replace emojis with words
# NOTE: unpickling can execute arbitrary code — only load a trusted file here.
with open('/content/Emoji_Dict.p', 'rb') as fp:
    # Swap keys and values of the pickled mapping so that Emoji_Dict is keyed
    # by the strings that convert_emojis_to_word() searches for in the text.
    Emoji_Dict = {value: key for key, value in pickle.load(fp).items()}

def convert_emojis_to_word(string):
    """Replace every Emoji_Dict key found in `string` by a space plus its mapped value without colons."""
    for symbol, name in Emoji_Dict.items():
        string = string.replace(symbol, " " + name.replace(":", ""))
    return string
    
# replace emoticons with words

def convert_emoticons(string):
    """Replace every EMOTICONS_EMO key found in `string` by a space plus its mapped description."""
    for emoticon, description in EMOTICONS_EMO.items():
        string = string.replace(emoticon, " " + description)
    return string

# Unicode
def removeUnicode(text):
    r"""Remove literal \uXXXX escape sequences and all non-ASCII characters.

    Args:
        text: string to clean.

    Returns:
        `text` without textual escapes such as "\u002c" (backslash, 'u' and
        four hex digits) and without any character outside \x00-\x7f.
    """
    # Exactly four hex digits: an unbounded quantifier would also swallow the
    # hex-looking letters (a-f) of a word that directly follows the escape.
    text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)
    text = re.sub(r'[^\x00-\x7f]', '', text)
    return text

# URLs
def replaceURL(text):
    """Replace URLs with the token "url" and drop the '#' in front of hashtags.

    Args:
        text: string to clean.

    Returns:
        `text` where every run starting with "www." or "http(s)://" up to the
        next whitespace is replaced by "url", and every "#word" becomes "word".
    """
    # Raw strings keep the regex escapes (\. and \s) from being read as
    # invalid Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

# replace slang
# Build a dictionary of slang terms and their equivalents, plus a function
# that replaces them in a string.
with open('/content/slang.txt') as file:
    # Every non-blank line is "<slang><TAB><equivalent>"
    slang_map = {
        slang.strip(): equivalent.strip()
        for slang, _tab, equivalent in (row.partition('\t') for row in file if row.strip())
    }

# Longest terms first, so the alternation prefers the longest possible match
slang_words = sorted(slang_map, key=len, reverse=True)
regex = re.compile(r"\b({})\b".format("|".join(re.escape(term) for term in slang_words)))

# replaceSlang(text) -> text with every whole-word slang term substituted
replaceSlang = partial(regex.sub, lambda found: slang_map[found.group(1)])

In [None]:
# one single function to call for string manipulation

def preprocess(df):
    """Create the `pre_processed` column of `df` from its `tweet_text` column.

    Every tweet goes through the same steps, in the same order as before:
    lower-casing, contraction/accents/punctuation handling, hashtag and
    mention segmentation, emoji and emoticon conversion, unicode removal, URL
    replacement, slang replacement, removal of non-alphabetic characters,
    whitespace collapsing, stop-word removal and lemmatization.

    All steps now run in one pass over the column instead of eleven separate
    passes, and stop words are looked up in a set instead of a list.

    Args:
        df: DataFrame with a `tweet_text` column.

    Returns:
        The same DataFrame object (modified in place) with the added
        `pre_processed` column.
    """
    # A set gives constant-time membership tests for the stop-word filter
    stop = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def clean(tweet):
        """Apply every cleaning step to one tweet and return the cleaned text."""
        # lower case and normalise whitespace
        text = ' '.join(token.lower() for token in str(tweet).split())

        # NOTE(review): contractions() also strips ASCII punctuation ('#', '@',
        # ':', '/', ...), so the hashtag, mention, emoticon and URL steps below
        # receive text whose markers are already gone. Consider moving it
        # after those steps — confirm the intended order before changing it.
        text = contractions(text)
        text = deal_with_hashtags(text)
        text = deal_with_ats(text)
        text = convert_emojis_to_word(text)
        text = convert_emoticons(text)
        text = removeUnicode(text)
        text = replaceURL(text)
        text = replaceSlang(text)

        # remove non-alphabet characters token by token
        text = " ".join(re.sub("[^A-Za-z]+", "", token) for token in nltk.word_tokenize(text))
        # remove extra spaces left behind by emptied tokens
        text = re.sub(' +', " ", text)
        # remove stop words
        text = " ".join(word for word in text.split() if word not in stop)
        # lemmatization
        return " ".join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text))

    df['pre_processed'] = df['tweet_text'].swifter.apply(clean)

    return df

In [None]:
# apply pre-processing
df2 = preprocess(df)

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/44623 [00:00<?, ?it/s]

In [None]:
# Keep only the rows whose pre-processed text is not the empty string.
# (No NaN filtering happens here; this line only compares against ''.)
df2 = df2[df2['pre_processed'] != '']

## One-Hot Encoding

In [None]:
# One-hot encode the numeric label column: one indicator column per class id
one_hot = pd.get_dummies(df2['cyberbullying_type_id'])
# Append the indicator columns to the frame (aligned on the index)
df2 = df2.join(one_hot)

In [None]:
df

Unnamed: 0,tweet_text,cyberbullying_type,language,cyberbullying_type_id,pre_processed
0,"In other words #katandandre, your food was cra...",not_cyberbullying,en,0,word katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,en,0,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,en,0,xochitlsuckkks classy whore red velvet cupcake
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,en,0,jasongio meh p thanks head concerned another a...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,en,0,rudhoeenglish isi account pretending kurdish a...
...,...,...,...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity,en,2,black people arent expected anything depended ...
47688,Turner did not withhold his disappointment. Tu...,ethnicity,en,2,turner withhold disappointment turner called c...
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity,en,2,swear god dumb nigger bitch got bleach hair re...
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity,en,2,yea fuck rt therealexel youre nigger fucking u...


In [None]:
# save full df
df2.to_csv('/content/v2.csv')

In [None]:
# load full pre-processed df
df2 = pd.read_csv('/content/v2.csv')

In [None]:
df2

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,language,cyberbullying_type_id,pre_processed,0,1,2,3,4,5
0,0,"In other words #katandandre, your food was cra...",not_cyberbullying,en,0,word katandandre food crapilicious mkr,1,0,0,0,0,0
1,1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,en,0,aussietv white mkr theblock imacelebrityau tod...,1,0,0,0,0,0
2,2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,en,0,xochitlsuckkks classy whore red velvet cupcake,1,0,0,0,0,0
3,3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,en,0,jasongio meh p thanks head concerned another a...,1,0,0,0,0,0
4,4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,en,0,rudhoeenglish isi account pretending kurdish a...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
44615,47687,"Black ppl aren't expected to do anything, depe...",ethnicity,en,2,black people arent expected anything depended ...,0,0,1,0,0,0
44616,47688,Turner did not withhold his disappointment. Tu...,ethnicity,en,2,turner withhold disappointment turner called c...,0,0,1,0,0,0
44617,47689,I swear to God. This dumb nigger bitch. I have...,ethnicity,en,2,swear god dumb nigger bitch got bleach hair re...,0,0,1,0,0,0
44618,47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity,en,2,yea fuck rt therealexel youre nigger fucking u...,0,0,1,0,0,0


In [None]:
# String class labels, kept for CatBoost
y_cb = df2['cyberbullying_type']
# Distinct class names, in order of first appearance
cat_features = df2['cyberbullying_type'].unique().tolist()

In [None]:
# Keep the label id, the cleaned text and the one-hot columns.
# Selecting by label instead of by position gives the same columns whether or
# not df2 went through the CSV round trip (which prepends a saved index column).
df_input = df2.loc[:, 'cyberbullying_type_id':]

In [None]:
# save df after pre-processing for easy loading

df_input.to_csv('/content/input2.csv')

In [None]:
df_input

Unnamed: 0,cyberbullying_type_id,pre_processed,0,1,2,3,4,5
0,0,word katandandre food crapilicious mkr,1,0,0,0,0,0
1,0,aussietv white mkr theblock imacelebrityau tod...,1,0,0,0,0,0
2,0,xochitlsuckkks classy whore red velvet cupcake,1,0,0,0,0,0
3,0,jasongio meh p thanks head concerned another a...,1,0,0,0,0,0
4,0,rudhoeenglish isi account pretending kurdish a...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
44615,2,black people arent expected anything depended ...,0,0,1,0,0,0
44616,2,turner withhold disappointment turner called c...,0,0,1,0,0,0
44617,2,swear god dumb nigger bitch got bleach hair re...,0,0,1,0,0,0
44618,2,yea fuck rt therealexel youre nigger fucking u...,0,0,1,0,0,0


### Train & Test Datasets

In [None]:
# Columns are selected by name so that a change in column order cannot
# silently swap features and targets.
X = df_input['pre_processed']              # cleaned tweet text
y = df_input['cyberbullying_type_id']      # numeric class id
# Everything that is neither the text nor the id: the one-hot label columns
y_encoded = df_input.drop(columns=['cyberbullying_type_id', 'pre_processed'])

In [None]:
# Legacy names for the X / y data, kept for older cells that may still use them.
# They previously sliced the raw `df` by position (raw tweet text, language
# column, ...); they now point at the objects the original inline comments
# named as their intended replacements.
X_data = X            # pre-processed text
y_data = y_encoded    # one-hot labels (for OneVsRestClassifier + Logistic Regression)
y_data_2 = y          # numeric class ids (not one-hot encoded, needed by the other methods)

In [None]:
# With validation, one-hot encoded targets: 60% train / 20% validation / 20% test.
# random_state makes the split reproducible across runs.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, test_size=0.2, train_size=0.8, random_state=123)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, train_size=0.75, random_state=123)

# The total is taken from X (the data actually split), not from the raw df
print('Number of rows in the total set: {}'.format(X.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_val.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 44623
Number of rows in the training set: 26772
Number of rows in the validation set: 8924
Number of rows in the test set: 8924


In [None]:
# Without validation, one-hot encoded targets: 80% train / 20% test.
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, train_size=0.8, random_state=123)

# The total is taken from X (the data actually split), not from the raw df
print('Number of rows in the total set: {}'.format(X.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 44623
Number of rows in the training set: 35696
Number of rows in the test set: 8924


In [None]:
# Without validation, numeric class ids (NOT one-hot encoded): 80% train / 20% test.
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=123)

# The total is taken from X (the data actually split), not from the raw df
print('Number of rows in the total set: {}'.format(X.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 44623
Number of rows in the training set: 35696
Number of rows in the test set: 8924


## TF-IDF & Feature Extraction

In [None]:
# TF-IDF vectorizer WITH VALIDATION
print("TFIDF Vectorizer……")

def tfidf_features(X_train, X_val, X_test):
    """Vectorize the three text splits with a TF-IDF model fitted on the training split.

    Args:
        X_train, X_val, X_test: text samples of the train / validation / test splits.

    Returns:
        (X_train_tfidf, X_val_tfidf, X_test_tfidf, vocabulary): the TF-IDF
        representation of each split and the fitted term -> column index mapping.
    """
    # Unigrams and bigrams; terms in more than 90% of the documents or in fewer
    # than 5 documents are dropped; a token is any run of non-whitespace.
    # The pattern is a raw string so that \S is not read as a string escape.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')

    # Fit on the training split only; the other splits are just transformed
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_

X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
# Reverse lookup: column index -> term
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

print(" done!")

TFIDF Vectorizer……
 done!


In [None]:
# TF-IDF vectorizer WITH NO VALIDATION
print("TFIDF Vectorizer……")

def tfidf_features(X_train, X_test):
    """Vectorize the two text splits with a TF-IDF model fitted on the training split.

    Args:
        X_train, X_test: text samples of the train / test splits.

    Returns:
        (X_train_tfidf, X_test_tfidf, vocabulary): the TF-IDF representation of
        each split and the fitted term -> column index mapping.
    """
    # Unigrams and bigrams; terms in more than 90% of the documents or in fewer
    # than 5 documents are dropped; a token is any run of non-whitespace.
    # The pattern is a raw string so that \S is not read as a string escape.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')

    # Fit on the training split only; the test split is just transformed
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_

X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)
# Reverse lookup: column index -> term
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

print(" done!")

TFIDF Vectorizer……
 done!


## Models

###Bagging

####Parameter Tuning

In [None]:
# Candidate fractions of samples / features drawn for each base estimator.
# NOTE(review): the integer 2 is an absolute count (2 samples / 2 features),
# not a fraction — confirm that this candidate is intended.
max_sample = [0.5, 0.7, 1.0, 2]
max_features = [0.5, 0.7, 1.0, 2]
param_grid = {'max_features': max_features, 'max_samples': max_sample}
grid = GridSearchCV(BaggingClassifier(random_state=123), param_grid=param_grid ,verbose=3)
grid.fit(X_train_tfidf, y_train)
# (A test-set prediction whose result was discarded has been removed.)
print(grid.best_score_)
print(grid.best_params_)

In [None]:
# Bagging classifier with the selected hyper-parameters, fitted on the training set
bagging = BaggingClassifier(max_features=0.7, max_samples=1.0, random_state=123).fit(X_train_tfidf, y_train)

# Score on the test set, then the per-class report
score = bagging.score(X_test_tfidf, y_test)
y_test_pred = bagging.predict(X_test_tfidf)
print(score)
report = classification_report(y_test, y_test_pred, output_dict=True)
report

###GradientBoost

####Parameter Tuning

###SGD

####Parameter Tuning

In [None]:
# Hyper-parameter grid: regularisation type and strength
param_grid = dict(
    penalty=['l2', 'l1', 'elasticnet'],
    alpha=[0.0001, 0.001, 0.01, 0.1],
)
grid = GridSearchCV(SGDClassifier(random_state=123), param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train_tfidf, y_train)

# Test-set score of the refit best model, followed by the CV score and parameters
score = grid.score(X_test_tfidf, y_test)
y_test_pred = grid.predict(X_test_tfidf)
print(score, grid.best_score_, grid.best_params_, sep='\n')

report = classification_report(y_test, y_test_pred, output_dict=True)
report

###AdaBoost

####Parameter Tuning

In [None]:
# Test values for the number of estimators
param_grid = dict(n_estimators=[20, 50, 100, 150, 200])

grid = GridSearchCV(AdaBoostClassifier(random_state=123), param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train_tfidf, y_train)

# Test-set score of the refit best model, followed by the CV score and parameters
score = grid.score(X_test_tfidf, y_test)
y_test_pred = grid.predict(X_test_tfidf)
print(score, grid.best_score_, grid.best_params_, sep='\n')

report = classification_report(y_test, y_test_pred, output_dict=True)
report

###Decision Tree

####Parameter Tuning

In [None]:
# Hyper-parameter grid: split criterion and maximum tree depth (None = unlimited)
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 4, 6, 8, 10, 12, 15, 20, 30, 40, 50, None],
)
grid = GridSearchCV(DecisionTreeClassifier(random_state=123), param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train_tfidf, y_train)

# Test-set score of the refit best model, followed by the CV score and parameters
score = grid.score(X_test_tfidf, y_test)
y_test_pred = grid.predict(X_test_tfidf)
print(score, grid.best_score_, grid.best_params_, sep='\n')

report = classification_report(y_test, y_test_pred, output_dict=True)
report

###Random Forest

####Parameter Tuning

In [None]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [20, 30, 40, 50, 100],
    'max_features': [3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
    'n_estimators': [100, 500, 1000],
    'criterion': ['entropy']
}
# Create a base model; the seed matches the other estimators in this notebook
# so that the search is reproducible
rf = RandomForestClassifier(random_state=123)
# Instantiate the grid search model (3-fold CV, 5 parallel jobs)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = 5, verbose = True)

In [None]:
# Fit the grid search to the data and display the best parameter combination.
grid_search.fit(X_train_tfidf, y_train)
grid_search.best_params_
# Result recorded from a previous run (kept for reference):
# {'bootstrap': True,
#  'criterion': 'entropy',
#  'max_depth': 100,
#  'max_features': 5,
#  'min_samples_leaf': 1,
#  'min_samples_split': 2,
#  'n_estimators': 1000}

Fitting 3 folds for each of 2250 candidates, totalling 6750 fits




{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 100,
 'max_features': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
# pick best model
best_rf = grid_search.best_estimator_

In [None]:
# Load a previously saved model (pickle is imported at the top of the notebook).
# NOTE: unpickling can execute arbitrary code — only load a trusted file here.
filename = '/content/best_rfc_final1.sav'
# The context manager closes the file handle, which the bare open() call leaked
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Fit the selected model on the training data.
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# when refit is left at its default, so this call may be redundant — confirm.
best_rf.fit(X_train_tfidf, y_train)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    1.6s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:    7.2s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:   16.6s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:   30.1s
[Parallel(n_jobs=5)]: Done 1000 out of 1000 | elapsed:   37.6s finished


RandomForestClassifier(criterion='entropy', max_depth=100, max_features=5,
                       n_estimators=1000, n_jobs=5, verbose=True)

In [None]:
# predicting the test data
y_test_pred = best_rf.predict(X_test_tfidf)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    0.1s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:    0.2s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:    0.5s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:    0.9s
[Parallel(n_jobs=5)]: Done 1000 out of 1000 | elapsed:    1.2s finished


In [None]:
# analyze results
report = classification_report(y_test, y_test_pred, output_dict=True)
report

{'0': {'f1-score': 0.4048780487804878,
  'precision': 0.532051282051282,
  'recall': 0.32677165354330706,
  'support': 1270},
 '1': {'f1-score': 0.8595446584938705,
  'precision': 0.9002201027146002,
  'recall': 0.8223860589812333,
  'support': 1492},
 '2': {'f1-score': 0.9528013582342955,
  'precision': 0.9622770919067215,
  'recall': 0.9435104236718225,
  'support': 1487},
 '3': {'f1-score': 0.9244368406045053,
  'precision': 0.8645333333333334,
  'recall': 0.9932598039215687,
  'support': 1632},
 '4': {'f1-score': 0.9238095238095237,
  'precision': 0.8748010610079575,
  'recall': 0.9786350148367953,
  'support': 1685},
 '5': {'f1-score': 0.5764342150463758,
  'precision': 0.5385109114249037,
  'recall': 0.6201034737620104,
  'support': 1353},
 'accuracy': 0.8021078596255186,
 'macro avg': {'f1-score': 0.7736507741615098,
  'precision': 0.7787322970731331,
  'recall': 0.7807777381194562,
  'support': 8919},
 'weighted avg': {'f1-score': 0.7914190869561406,
  'precision': 0.7919388954

###CatBoost

####Parameter Tuning

In [None]:
# Set up Grid Search: base CatBoost model (one-vs-all multiclass loss, GPU training)
CB = CatBoostClassifier(
    task_type="GPU",
    loss_function='MultiClassOneVsAll',
    iterations=100,
    learning_rate=0.1,
    depth=2,
    l2_leaf_reg=10,
)

# Training data wrapped in a CatBoost Pool
grid_dataset = Pool(data=X_train_tfidf, label=y_train)

# Candidate values explored by the search
grid = dict(
    learning_rate=[0.1, 0.6, 0.7, 0.9, 1],
    depth=[5, 10, 6, 4, 3],
    l2_leaf_reg=[5, 10, 8, 3],
)

# 3-fold stratified search; the model is refit with the best parameters
grid_search_result = CB.grid_search(
    grid, grid_dataset, cv=3, stratified=True, refit=True, plot=False)

In [None]:
# Show the best parameters found by the grid search
best_parameters = grid_search_result['params']
best_depth, best_learning_rate, best_leaf = (
    best_parameters[name] for name in ('depth', 'learning_rate', 'l2_leaf_reg'))
print(best_depth, best_learning_rate, best_leaf)

# parameters and performance
# with MultiClassOneVsAll: 10 1 5; weighted avg': {'f1-score': 0.8146414039211765,
# with MultiClass: 

###OnevsRest 

####Parameter Tuning

In [None]:
# WITH VALIDATION
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def train_classifier(X_train, y_train, C, regularisation):
    """Fit a one-vs-rest logistic regression on the training data.

    X_train, y_train — training features and targets
    C — value passed to LogisticRegression as `C`
    regularisation — value passed to LogisticRegression as `penalty`

    return: the fitted OneVsRestClassifier
    """
    # One logistic regression per class, wrapped by OneVsRestClassifier
    base_estimator = LogisticRegression(penalty=regularisation, C=C, max_iter=10000)
    model = OneVsRestClassifier(base_estimator)
    model.fit(X_train, y_train)
    return model

# classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')
# Train the one-vs-rest classifier on the TF-IDF features (C=4, L2 penalty)
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 4, regularisation = 'l2')

# Predict labels of validation
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
# Decision-function scores of the validation set (not passed to the evaluation calls in this cell)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def print_evaluation_scores(y_test, predicted):
    """Print accuracy, F1 and precision (macro / micro / weighted) for predicted labels.

    y_test — true labels
    predicted — predicted labels, in the same format as y_test
    """
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    # precision_score (imported at the top of the notebook) is the metric these
    # lines are labelled with. average_precision_score summarises a
    # precision-recall curve computed from scores and rejects multiclass 1-D targets.
    print('Precision macro: ', precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', precision_score(y_test, predicted, average='weighted'))
    

# Validation set: print the metrics of the labels predicted above
print('\nTfidf\n')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

# Predict labels of test
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)
# Decision-function scores of the test set (not passed to the evaluation call below)
y_test_predicted_scores_tfidf = classifier_tfidf.decision_function(X_test_tfidf)

# Test set: print the same metrics
print('\nTfidf\n')
print_evaluation_scores(y_test, y_test_predicted_labels_tfidf)

In [None]:
# WITH NO VALIDATION
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
def train_classifier(X_train, y_train, C, regularisation):
    """Fit a one-vs-rest logistic regression on the training data.

    X_train, y_train — training features and targets
    C — value passed to LogisticRegression as `C`
    regularisation — value passed to LogisticRegression as `penalty`

    return: the fitted OneVsRestClassifier
    """
    # One logistic regression per class, wrapped by OneVsRestClassifier
    base_estimator = LogisticRegression(penalty=regularisation, C=C, max_iter=10000)
    model = OneVsRestClassifier(base_estimator)
    model.fit(X_train, y_train)
    return model

# classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')
# Train the one-vs-rest classifier on the TF-IDF features (C=4, L2 penalty)
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 4, regularisation = 'l2')


from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def print_evaluation_scores(y_test, predicted):
    """Print accuracy, F1 and precision (macro / micro / weighted) for predicted labels.

    y_test — true labels
    predicted — predicted labels, in the same format as y_test
    """
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    # precision_score (imported at the top of the notebook) is the metric these
    # lines are labelled with. average_precision_score summarises a
    # precision-recall curve computed from scores and rejects multiclass 1-D targets.
    print('Precision macro: ', precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', precision_score(y_test, predicted, average='weighted'))
    


# Predict labels of test
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)
# Decision-function scores of the test set (not passed to the evaluation call below)
y_test_predicted_scores_tfidf = classifier_tfidf.decision_function(X_test_tfidf)

# Test set: print the evaluation metrics of the predicted labels
print('\nTfidf\n')
print_evaluation_scores(y_test, y_test_predicted_labels_tfidf)

In [None]:
# analyze results
from sklearn.metrics import classification_report
report = classification_report(y_test, y_test_predicted_labels_tfidf, output_dict=True)
report