## Cyberbullying Dataset - Random Forest Classifier

### Read the dataset

In [3]:
import pandas as pd

# Location of the labelled message dataset, relative to the notebook.
dataset_path = 'datasets/anti-bully-data.csv'

# The first row of the CSV holds the column names.
data = pd.read_csv(dataset_path, header=0)

# Preview the first rows of the frame.
data.head()

Unnamed: 0,label_bullying,text_message
0,0,yeah I got 2 backups for all that. I just hate...
1,0,I hate using my BB but love my iPhone. Haven'...
2,1,Get fucking real dude.
3,1,She is as dirty as they come and that crook ...
4,1,why did you fuck it up. I could do it all day...


### Exploring the data

In [25]:
# What is the shape of the dataset?
n_rows, n_cols = data.shape
print("Input data has {} rows and {} columns".format(n_rows, n_cols))

Input data has 8817 rows and 2 columns


In [26]:
# How many bullying/non bullying are there?
# Summing a boolean mask counts the rows where the condition holds.
n_bullying = int((data['label_bullying'] == 1).sum())
n_not_bullying = int((data['label_bullying'] == 0).sum())
print("Out of {} rows, {} are bullying, {} are not bullying".format(len(data),
                                                                    n_bullying,
                                                                    n_not_bullying))

Out of 8817 rows, 2505 are bullying, 6312 are not bullying


In [27]:
# How much missing data is there?
# Count the nulls of both columns in one pass, then report each column.
null_counts = data[['label_bullying', 'text_message']].isnull().sum()
print("Number of null in label: {}".format(null_counts['label_bullying']))
print("Number of null in text: {}".format(null_counts['text_message']))

Number of null in label: 0
Number of null in text: 0


### Cleaning the dataset

In [4]:
import string
import re
import nltk

# TODO: Try lemmatization instead of stemming to see if accuracy is improved
# TODO: Try leaving stopwords in and see if accuracy is improved

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Set copy of the stopword list: clean_text tests every token for membership,
# and a set lookup avoids scanning the whole list each time.
_STOPWORD_SET = frozenset(stopwords)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Normalise one message before it is vectorised.

    ASCII punctuation is removed, the text is lower-cased and split on runs of
    non-word characters, stopwords are dropped and every remaining token is
    stemmed with the Porter stemmer.

    Args:
        text: the raw message as a string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # NOTE(review): punctuation is removed before the stopword filter, so a
    # contraction such as "haven't" reaches the filter as "havent" and is kept
    # unless the stopword list contains that exact spelling.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split returns '' when the text starts or ends with a separator; skip
    # those so the result has no stray leading/trailing spaces.
    return " ".join(ps.stem(word) for word in tokens if word and word not in _STOPWORD_SET)

# Run every raw message through clean_text and keep the result in a new column.
cleaned_messages = data['text_message'].apply(clean_text)
data['text_message_clean'] = cleaned_messages

# Preview the raw and cleaned text side by side.
data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean
0,0,yeah I got 2 backups for all that. I just hate...,yeah got 2 backup hate happen strugglin week n...
1,0,I hate using my BB but love my iPhone. Haven'...,hate use bb love iphon havent tri new bb bb pr...
2,1,Get fucking real dude.,get fuck real dude
3,1,She is as dirty as they come and that crook ...,dirti come crook rengel dem fuck corrupt joke...
4,1,why did you fuck it up. I could do it all day...,fuck could day let hour ping later sched writ...


### Additional imports

In [31]:
import numpy as np
import pickle

### Vectorization

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings:
#   max_features=1500 - keep only the 1500 most frequently occurring words
#   min_df=5          - a word must appear in at least 5 messages to be kept
#   max_df=0.7        - drop words that appear in more than 70% of all messages
vectorizer_options = {'max_features': 1500, 'min_df': 5, 'max_df': 0.7}
count_vect = CountVectorizer(**vectorizer_options)

# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split that happens later in the notebook, so the held-out rows influence the
# features - consider fitting on the training rows only.
x_counts = count_vect.fit_transform(data['text_message_clean'])
#print(x_counts.shape)
#print(count_vect.get_feature_names())

# Densify the sparse count matrix into a DataFrame (one row per message,
# one column per vocabulary entry) and display it.
dense_counts = x_counts.toarray()
x_counts_df = pd.DataFrame(dense_counts)
x_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw word counts with TF-IDF.
tfidfconverter = TfidfTransformer()
# Read from x_counts (the count matrix produced by the vectoriser), not from
# x_counts_df: this cell rebinds x_counts_df, so reading from it would make a
# second run of the cell apply TF-IDF to values that are already weighted.
# After this line x_counts_df is a dense NumPy array, not a DataFrame.
x_counts_df = tfidfconverter.fit_transform(x_counts).toarray()
x_counts_df

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Split into test/train data

In [42]:
from sklearn.model_selection import train_test_split
# Target vector: 1 = bullying, 0 = not bullying.
y = data['label_bullying']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x_counts_df, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### Model Training

In [48]:
from sklearn.ensemble import RandomForestClassifier

# Could play around with hyperparameters here also to see if accuracy can be improved
# n_jobs=-1 builds the 1000 trees on all available CPU cores instead of one.
# Each tree's seed is still derived from random_state, so the fitted forest
# does not depend on the number of workers.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [50]:
# Predict a label (0 or 1) for every row of the held-out test features.
y_pred = classifier.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

### Model Evaluation

In [51]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class precision/recall/F1 report and the
# overall accuracy, in that order, for the test-set predictions.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[1206   63]
 [ 413   82]]
              precision    recall  f1-score   support

           0       0.74      0.95      0.84      1269
           1       0.57      0.17      0.26       495

    accuracy                           0.73      1764
   macro avg       0.66      0.56      0.55      1764
weighted avg       0.69      0.73      0.67      1764

0.7301587301587301


### Saving the Trained Model

In [52]:
# Persist the fitted classifier together with the fitted count vectoriser and
# TF-IDF transformer: the forest was trained on their output, so new text can
# only be scored after passing through the same fitted preprocessing objects.
# 'text_classifier' still contains just the classifier, as before.
artifacts = {
    'text_classifier': classifier,
    'text_count_vectorizer': count_vect,
    'text_tfidf_transformer': tfidfconverter,
}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as picklefile:
        pickle.dump(fitted_object, picklefile)


# Model can be loaded again like this (only unpickle files you trust -
# pickle.load can execute arbitrary code):
#with open('text_classifier', 'rb') as training_model:
    #model = pickle.load(training_model)