In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

#from sklearn.naive_bayes import MultinomialNB

In [2]:
# Loading Clean Data
df = pd.read_csv("data/cleaned/cleaned_lemmatized_english.csv")#.iloc[0:30]
print(df.shape)
# Remove empty trainnig examples
df = df[df["tweet_text"] != ""] # there are none???
print(df.shape)
df.head(10)

(44677, 6)
(44677, 6)


Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,0,word #hashtag food crapilicious #hashtag,not_cyberbullying,en,,katandandre mkr
1,1,#hashtag white #hashtag #hashtag #hashtag #has...,not_cyberbullying,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,2,@username classy whore red velvet cupcake,not_cyberbullying,en,XochitlSuckkks,
3,3,@username meh p thanks head concern angry dude...,not_cyberbullying,en,Jason_Gio,
4,4,@username isi account pretend kurdish account ...,not_cyberbullying,en,RudhoeEnglish,
5,5,@username @username yes test god good bad indi...,not_cyberbullying,en,Raja5aab Quickieleaks,
6,7,karma hope bite kat butt nasty #hashtag,not_cyberbullying,en,,mkr
7,8,@username everything mostly priest,not_cyberbullying,en,stockputout,
8,9,rebecca black drop school due bullying,not_cyberbullying,en,,
9,10,@username http co usqinyw5gn,not_cyberbullying,en,Jord_Is_Dead,


In [3]:
print(df["cyberbullying_type"].value_counts())

# Splitting the dataset by cyberbullying_types to build vocabularies
df_not = df[df["cyberbullying_type"] == "not_cyberbullying"]
df_religion = df[df["cyberbullying_type"] == "religion"]
df_age = df[df["cyberbullying_type"] == "age"]
df_gender = df[df["cyberbullying_type"] == "gender"]
df_ethn = df[df["cyberbullying_type"] == "ethnicity"]
df_other = df[df["cyberbullying_type"] == "other_cyberbullying"]
print("---------------------------------------")
print("full dataset shape: ", df.shape)
print("religion shape: ", df_religion.shape)
print("age shape: ", df_age.shape)
print("gender shape: ", df_gender.shape)
print("ethnicity shape: ", df_ethn.shape)
print("other_cyberbullying shape: ", df_other.shape)
print("not_cyberbullying shape: ", df_not.shape)

religion               7968
age                    7944
gender                 7670
ethnicity              7525
other_cyberbullying    6927
not_cyberbullying      6643
Name: cyberbullying_type, dtype: int64
---------------------------------------
full dataset shape:  (44677, 6)
religion shape:  (7968, 6)
age shape:  (7944, 6)
gender shape:  (7670, 6)
ethnicity shape:  (7525, 6)
other_cyberbullying shape:  (6927, 6)
not_cyberbullying shape:  (6643, 6)


In [4]:
# This class creates a Naive Bayes model
class NaiveBayes():
    def __init__(self):
        self.df = pd.DataFrame()#(zip(x_train, y_train), columns=["tweet_text","cb_type"])
        self.freqs = []
        self.df_list = []

    # Get training dataframe used to compute model probabilities
    def print_data(self):
        return self.df
    
    def print_freqs(self):
        return(self.freqs)

    # Separate the model dataframe by cb_type
    # for building vocab frequency tables
    def split_df(self):
        # Splitting the dataset by cyberbullying_types to build vocabularies
        df = self.df
        df_not = df[df["cb_type"] == "not_cyberbullying"]
        df_religion = df[df["cb_type"] == "religion"]
        df_age = df[df["cb_type"] == "age"]
        df_gender = df[df["cb_type"] == "gender"]
        df_ethn = df[df["cb_type"] == "ethnicity"]
        df_other = df[df["cb_type"] == "other_cyberbullying"]
        self.df_list = [df_not, df_religion, df_age, df_gender, df_ethn, df_other]
        return

    # Creates the vocab frequency dictionary
    # for building vocab probability dictionary
    def freq_dict(self, df):
        text = df["tweet_text"]
        vocab_freq = {}
        for tweet in text:
            words = str(tweet).split(" ")
            for word in words:
                # Check if word in vocabulary
                if word in vocab_freq:
                    vocab_freq[word] += 1
                else:
                    vocab_freq[word] = 1
        return vocab_freq

    # Converts vocab frequency dictionary
    # to a map between vocab and its probability
    def freq_to_prob_dict(self, vocab):
        n = np.sum(list(vocab.values()))
        for word in vocab:
            vocab[word] = vocab[word] / n
        return vocab

    # Construct all of the probability mappings
    # from a list of dataframes
    def build_freqs(self):
        freqs = []
        for i, df in enumerate(self.df_list):
            self.freqs.append(self.freq_to_prob_dict(self.freq_dict(df)))
        return

    # Load and split training data
    # Construct probability mappings
    def fit(self, x_train, y_train):
        self.df = pd.DataFrame(zip(x_train, y_train), columns=["tweet_text","cb_type"])
        self.split_df()
        self.build_freqs()
        return

    # Compute the probability scores
    # for comparison
    def get_scores(self, tweet):
        scores = np.zeros(6)
        words = str(tweet).split(" ")
        for i, freq in enumerate(self.freqs):
            log_prob = np.zeros(len(words))
            for j, word in enumerate(words):
                if word in freq:
                    log_prob[j] = np.log(freq[word])
                else:
                    log_prob[j] = np.log(0.00001)
            scores[i] = np.sum(log_prob)
        return scores

    # Get the class corresponding to
    # the largest score
    def get_class(self, scores):
        mapping = {0.0:"not",
                   1.0:"religion",
                   2.0:"age",
                   3.0:"gender",
                   4.0:"ethn",
                   5.0:"other"}
        return mapping[np.argmax(scores)]

    # Predict the classes of the test set
    def predict(self, x_test):
        preds = []
        for i, tweet in enumerate(x_test):
            scores = self.get_scores(tweet)
            preds.append(self.get_class(scores))
        return preds

In [5]:
# Splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(df['tweet_text'], df['cyberbullying_type'], shuffle=True, random_state=42,
                                                   test_size=0.2)
x_test = x_test
y_test = y_test
model = NaiveBayes()
model.fit(x_train, y_train)
predictions = model.predict(x_test)

correct = np.zeros(len(y_test))
for i, (prediction, label) in enumerate(zip(predictions, y_test)):
    if prediction == label:
        correct[i] = 1
    else:
        correct[i] = 0

print(sum(correct), " / ", len(correct))
print(sum(correct) / len(correct))

4113.0  /  8936
0.4602730528200537
