<div class="alert" style="background-color:#fff; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:15px 15px; color:#88d8b0; font-size:40px'>5.1 Classification Model - Cyberbullying vs Not Cyberbullying</h1>
</div>

<div class="alert alert-info" style="background-color:#88d8b0; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Import Libraries or Modules </h2>
</div>

In [3]:
# Import Dependencies
%matplotlib inline

# Begin Python Imports
import datetime, warnings, scipy
warnings.filterwarnings("ignore")
import pickle
import os
import glob

# Data Manipulation
import numpy as np
import pandas as pd
from scipy import sparse
pd.set_option('display.max_columns', None)

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Progress bar
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    StratifiedShuffleSplit,
    GridSearchCV,
    cross_val_score
)

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score, 
    accuracy_score, 
    confusion_matrix, 
    classification_report, 
    plot_confusion_matrix,
    plot_precision_recall_curve
)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler



<div class="alert alert-info" style="background-color:#88d8b0; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Train and Test Classifier</h2>
</div>

In [4]:
# Shared, module-level building blocks that automate_result() wires into
# its pipelines. They are re-fitted on every cross-validation fold.

# Candidate estimators (LinearSVC is seeded for reproducibility)
svc = LinearSVC(random_state=1127)
lr = LogisticRegression(n_jobs=-1)

# Feature scaling step placed in front of every estimator
scaler = MaxAbsScaler()

In [5]:
##############################
# Train and Test Classifiers #
##############################
def automate_result(df='bully_data_clean_with_stopword',sampling='original',sampling_ratio=1):
    """Cross-validate every classifier on every pickled feature set and log metrics to CSV.

    For each estimator in the classifier dictionary (the module-level ``lr``
    and ``svc``) and each feature matrix found under
    ``bully_binary_classification/<df>/features/selected/X*.pkl``, a pipeline
    (module-level ``scaler`` -> optional resampler -> estimator) is evaluated
    with 10-fold stratified cross-validation. One summary row per
    (classifier, feature) pair is appended to
    ``bully_binary_classification/<df>/results/results_<sampling>_sample.csv``.
    Any existing file at that path is deleted first, so each call starts
    from an empty results file.

    Parameters
    ----------
    df : str
        Name of the dataset folder below ``bully_binary_classification``.
        (Despite the name, this is a folder name, not a DataFrame.)
    sampling : {'original', 'oversampling', 'smote', 'downsampling'}
        Resampling strategy inserted between the scaler and the estimator.
        'original' uses no resampler.
    sampling_ratio : float
        Forwarded as ``sampling_strategy`` to the resampler; unused when
        ``sampling == 'original'``.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported strategies. Raised before
        any existing results file is touched.
    FileNotFoundError
        If no ``X*.pkl`` feature file is found for the dataset.
    """

    valid_sampling = ("original", "oversampling", "smote", "downsampling")
    if sampling not in valid_sampling:
        raise ValueError(
            "Unknown sampling method %r; expected one of %s" % (sampling, ", ".join(valid_sampling))
        )

    ####################
    # Reset Processing #
    ####################
    # Remove the results file of a previous run so the rows appended below
    # always start from an empty file (and the header is written exactly once).

    task = 'bully_binary_classification'
    results_dir = os.path.join(task, df, 'results')
    file = os.path.join(results_dir, 'results_' + sampling + '_sample.csv')
    if os.path.isfile(file):
        os.remove(file)
        print("File deleted")
    else:
        # Nothing to delete: there was no previous results file.
        print("File cleared")



    ########################
    # Train and Test Model #
    ########################

    def run_model(classifier_name, feature_name, splits, X, Y, pipeline, average_method,target_label):
        """Cross-validate ``pipeline`` on (X, Y) and append one summary row to the results CSV.

        ``classifier_name`` and ``feature_name`` only label the output row.
        ``average_method`` is passed as ``average`` to precision/recall/F1;
        ``target_label`` (if not None) is passed as their ``pos_label``.
        All metrics are stored as percentages (score * 100), as the mean and
        the standard deviation over the folds.
        """
        kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1127)

        # Positional indexing below must not depend on a pandas index.
        Y = np.asarray(Y)

        score_kwargs = {"average": average_method}
        if target_label is not None:
            score_kwargs["pos_label"] = target_label

        metric_names = ["accuracy", "precision", "recall", "f1", "auc"]
        fold_scores = {name: [] for name in metric_names}

        record_cols = ["sampling_method","classifier","feature",
                       "accuracy","accuracy_std",
                       "precision","precision_std",
                       "recall","recall_std",
                       "f1","f1_std",
                       "auc","auc_std"]

        # Run cross-validation
        print("["+ sampling + ", " + classifier_name + "] Developing Model and Generating Metrics for features: " + feature_name)
        for train, test in tqdm(kfold.split(X, Y), total=splits):
            X_test, Y_test = X[test], Y[test]

            # Train and fit model (the same pipeline object is refit on every fold)
            model_fit = pipeline.fit(X[train], Y[train])
            prediction = model_fit.predict(X_test)

            # Compute metrics; accuracy reuses the predictions instead of
            # predicting a second time through model_fit.score().
            fold_scores["accuracy"].append(accuracy_score(Y_test, prediction) * 100)
            fold_scores["precision"].append(precision_score(Y_test, prediction, **score_kwargs) * 100)
            fold_scores["recall"].append(recall_score(Y_test, prediction, **score_kwargs) * 100)
            fold_scores["f1"].append(f1_score(Y_test, prediction, **score_kwargs) * 100)

            # Rank scores for ROC AUC: prefer the decision function and fall
            # back to the second predict_proba column for estimators without
            # one (e.g. tree ensembles), whatever name they are registered under.
            if hasattr(model_fit, "decision_function"):
                ranking = model_fit.decision_function(X_test)
            else:
                ranking = model_fit.predict_proba(X_test)[:, 1]
            fold_scores["auc"].append(roc_auc_score(Y_test, ranking, average=None) * 100)

        record = {"sampling_method": sampling,
                  "classifier": classifier_name,
                  "feature": feature_name}
        for name in metric_names:
            record[name] = np.mean(fold_scores[name])
            record[name + "_std"] = np.std(fold_scores[name])

        summary = pd.DataFrame([record], columns=record_cols)

        # The file was removed above, so it is missing only for the first row:
        # that is the row that carries the header.
        summary.to_csv(file, mode='a', header=not os.path.isfile(file))



    #########################
    # Classifier Dictionary #
    #########################

    classifier_dict = { 'LogisticRegression': lr,
                        'LibSVC': svc
                        }

    def build_pipeline(estimator):
        """Return scaler -> (optional resampler) -> estimator for the chosen sampling method."""
        if sampling == "original":
            return Pipeline([('scaler', scaler),
                             ('classifier', estimator)])
        if sampling == "oversampling":
            sampler = RandomOverSampler(random_state=1127, sampling_strategy=sampling_ratio)
        elif sampling == "smote":
            sampler = SMOTE(random_state=1127, sampling_strategy=sampling_ratio)
        else:  # "downsampling" (sampling was validated at the top)
            sampler = RandomUnderSampler(random_state=1127, sampling_strategy=sampling_ratio)
        return make_pipeline(scaler, sampler, estimator)



    #######################
    # Features Dictionary #
    #######################
    # Load Pickle files for X feature vectors.
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.

    feature_dir = os.path.join(task, df, 'features', 'selected')
    feature_dict = {}

    # Sorted so the order of rows in the results file is deterministic.
    for file_ in sorted(glob.glob(os.path.join(feature_dir, "X*.pkl"))):
        # Feature name = last "_"-separated token of the file name before the
        # first dot, e.g. X_AllTextual.pkl -> AllTextual
        feature_name = os.path.basename(file_).split('.')[0].split("_")[-1]

        with open(file_,'rb') as f:
            feature_dict[feature_name] = pickle.load(f)

    if not feature_dict:
        raise FileNotFoundError("No feature files matching X*.pkl found in " + feature_dir)



    ################
    # Target Label #
    ################
    # Load Pickle file for Y label

    with open(os.path.join(task, df, 'target_class', 'Y_cyberbullying.pkl'),'rb') as f:
        Y_label = pickle.load(f)



    ########################
    # Run through the loop #
    ########################

    # Appending to the CSV fails if the results folder does not exist yet.
    os.makedirs(results_dir, exist_ok=True)

    for classifier, estimator in classifier_dict.items():

        # Selection of Pipeline by sampling method
        selected_pipeline = build_pipeline(estimator)

        for feature, X_feature in tqdm(feature_dict.items()):
            run_model(classifier_name=classifier,
                      feature_name=feature,
                      splits=10,
                      X=X_feature,
                      Y=Y_label,
                      pipeline = selected_pipeline,
                      average_method = 'binary', # macro for multiclass, binary for binary classification
                      target_label = 'Cyberbullying') # Specify Cyberbullying for binary classification
            print()

In [6]:
#############################################
# Data 1: 'bully_data_clean_with_stopword'  #
#############################################
# Baseline run: no resampling step in the pipeline.
automate_result(
    df='bully_data_clean_with_stopword',
    sampling='original',
)

File cleared


  0%|          | 0/1 [00:00<?, ?it/s]

[original, LogisticRegression] Developing Model and Generating Metrics for features: TermListsRationew



0it [00:00, ?it/s][A
1it [00:06,  6.80s/it][A
2it [00:09,  4.18s/it][A
3it [00:11,  3.30s/it][A
4it [00:13,  2.94s/it][A
5it [00:16,  2.79s/it][A
6it [00:18,  2.58s/it][A
7it [00:20,  2.43s/it][A
8it [00:22,  2.37s/it][A
9it [00:25,  2.34s/it][A
10it [00:27,  2.74s/it][A
100%|██████████| 1/1 [00:27<00:00, 27.38s/it]





  0%|          | 0/1 [00:00<?, ?it/s]

[original, LibSVC] Developing Model and Generating Metrics for features: TermListsRationew



0it [00:00, ?it/s][A
1it [00:01,  1.48s/it][A
2it [00:02,  1.35s/it][A
3it [00:03,  1.28s/it][A
4it [00:05,  1.24s/it][A
5it [00:06,  1.26s/it][A
6it [00:07,  1.22s/it][A
7it [00:08,  1.22s/it][A
8it [00:10,  1.25s/it][A
9it [00:11,  1.27s/it][A
10it [00:12,  1.27s/it][A
100%|██████████| 1/1 [00:12<00:00, 12.70s/it]







In [7]:
# Same dataset with a SMOTE step in the pipeline; sampling_ratio is
# forwarded to SMOTE as its sampling_strategy.
automate_result(
    df='bully_data_clean_with_stopword',
    sampling='smote',
    sampling_ratio=0.15,
)

File cleared


  0%|          | 0/1 [00:00<?, ?it/s]

[smote, LogisticRegression] Developing Model and Generating Metrics for features: TermListsRationew



0it [00:00, ?it/s][A
1it [00:03,  3.58s/it][A
2it [00:07,  3.58s/it][A
3it [00:10,  3.64s/it][A
4it [00:14,  3.57s/it][A
5it [00:17,  3.53s/it][A
6it [00:21,  3.45s/it][A
7it [00:23,  3.13s/it][A
8it [00:25,  2.90s/it][A
9it [00:28,  2.76s/it][A
10it [00:30,  3.10s/it][A
100%|██████████| 1/1 [00:30<00:00, 30.97s/it]





  0%|          | 0/1 [00:00<?, ?it/s]

[smote, LibSVC] Developing Model and Generating Metrics for features: TermListsRationew



0it [00:00, ?it/s][A
1it [00:02,  2.86s/it][A
2it [00:05,  2.74s/it][A
3it [00:08,  2.86s/it][A
4it [00:11,  2.84s/it][A
5it [00:14,  2.94s/it][A
6it [00:17,  2.89s/it][A
7it [00:20,  2.92s/it][A
8it [00:22,  2.80s/it][A
9it [00:25,  2.83s/it][A
10it [00:28,  2.84s/it][A
100%|██████████| 1/1 [00:28<00:00, 28.40s/it]





