<div class="alert" style="background-color:#fff; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:15px 15px; color:#7bc043; font-size:40px'>5.2 Classification Model - Role (Multiclass) </h1>
</div>

<div class="alert alert-info" style="background-color:#7bc043; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Import Libraries or Modules </h2>
</div>

In [1]:
# Import Dependencies
%matplotlib inline

# Begin Python Imports
import datetime, warnings, scipy
warnings.filterwarnings("ignore")
import pickle
import os
import glob

# Data Manipulation
import numpy as np
import pandas as pd
from scipy import sparse
pd.set_option('display.max_columns', None)

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Progress bar
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    StratifiedShuffleSplit,
    GridSearchCV,
    cross_val_score
)

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score, 
    accuracy_score, 
    confusion_matrix, 
    classification_report, 
    plot_confusion_matrix,
    plot_precision_recall_curve
)

from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MaxAbsScaler

<div class="alert alert-info" style="background-color:#7bc043; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Train and Test Classifier</h2>
</div>

In [2]:
# Shared estimator instances; the training function defined further down
# reads these module-level names when it assembles its pipelines.

# Feature scaler inserted after the sampler in the resampling pipelines.
scaler = MaxAbsScaler()

# Multinomial logistic regression, parallelised over all available cores.
lr = LogisticRegression(
    n_jobs=-1,
    multi_class='multinomial',
)

# Linear SVM in one-vs-rest mode, seeded so runs are repeatable.
svc = LinearSVC(
    random_state=1127,
    multi_class='ovr',
)

In [3]:
# mnb = MultinomialNB()
# xgb = XGBClassifier(tree_method='gpu_hist', random_state=1127)

# # get a stacking ensemble of models
# def get_stacking():
#     # define the base models
#     level0 = list()
#     level0.append(('LogisticRegression', lr))
#     level0.append(('LibSVC', svc))
#     # level0.append(('MultinomialNaiveBayes', mnb))
#     # define meta learner model
#     level1 = lr
#     # define the stacking ensemble
#     model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5, n_jobs=-1)
#     return model

# stacking = get_stacking()

In [4]:
##############################
# Train and Test Classifiers #
##############################
def automate_result(df='amica_data_clean_with_stopword',sampling='original',sampling_ratio=1):
    """Cross-validate every classifier on every pickled feature set and log the metrics.

    For each classifier (the notebook-level ``lr`` and ``svc``) and each
    ``X*.pkl`` feature matrix found under
    ``amica_binary_classification/<df>/features/selected``, a 10-fold
    stratified cross-validation is run with macro-averaged metrics.  One
    summary row per (classifier, feature set) is appended to
    ``amica_role_classification/<df>/results/results_<sampling>_sample.csv``.
    An existing results file for the same dataset / sampling method is
    removed before the run starts.

    Parameters
    ----------
    df : str
        Dataset folder name used to build the feature, label and results paths.
    sampling : {'original', 'oversampling', 'smote', 'downsampling'}
        Which pipeline to build: the bare classifier ('original'), or a
        sampler -> ``scaler`` -> classifier imbalanced-learn pipeline.
    sampling_ratio : float or dict
        Forwarded unchanged as ``sampling_strategy`` to the selected sampler.
        Ignored when ``sampling == 'original'``.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported methods.  This is checked
        before any file is touched so a typo cannot wipe earlier results.
    """
    supported_sampling = ('original', 'oversampling', 'smote', 'downsampling')
    if sampling not in supported_sampling:
        raise ValueError(
            "Unknown sampling method %r; expected one of: %s"
            % (sampling, ', '.join(supported_sampling)))

    ####################
    # Reset Processing #
    ####################
    task = 'amica_role_classification'
    # os.path.join keeps the paths valid on both Windows and POSIX systems.
    results_file = os.path.join(task, df, 'results', 'results_' + sampling + '_sample.csv')

    # Start from a clean results file: rows are appended one at a time below.
    if os.path.isfile(results_file):
        os.remove(results_file)
        print("File deleted")
    else:
        print("File cleared")
    # Appending to a CSV fails if its directory is missing, so create it up front.
    os.makedirs(os.path.dirname(results_file), exist_ok=True)

    ########################
    # Train and Test Model #
    ########################
    def run_model(classifier_name, feature_name, splits, X, Y, pipeline,
                  average_method, target_label, write_header):
        """Run stratified k-fold CV for one pipeline / feature set and append a CSV row.

        ``write_header`` must be True only for the first row written to the
        results file so the column names appear exactly once.
        """
        # kfold = StratifiedShuffleSplit(n_splits=splits, test_size=0.1, random_state=1127)
        kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=1127)

        record_cols = ["sampling_method","classifier","feature",
                       "accuracy","accuracy_std",
                       "precision","precision_std",
                       "recall","recall_std",
                       "f1","f1_std",
                       "auc","auc_std"]

        # pos_label is only forwarded when a target label is given (binary case).
        metric_kwargs = {'average': average_method}
        if target_label is not None:
            metric_kwargs['pos_label'] = target_label

        accuracy, precision, recall, f1 = [], [], [], []

        print("["+ sampling + ", " + classifier_name + "] Developing Model and Generating Metrics for features: " + feature_name)
        # assumes X and Y accept NumPy integer index arrays (e.g. sparse matrix /
        # ndarray) -- TODO confirm for the pickled inputs
        for train, test in tqdm(kfold.split(X, Y), total=splits):

            # Train on the fold and predict the held-out part once.
            model_fit = pipeline.fit(X[train], Y[train])
            prediction = model_fit.predict(X[test])

            # Classifier .score() is accuracy on predict(); reuse the existing
            # predictions instead of predicting the test fold a second time.
            accuracy.append(accuracy_score(Y[test], prediction) * 100)
            precision.append(precision_score(Y[test], prediction, **metric_kwargs)*100)
            recall.append(recall_score(Y[test], prediction, **metric_kwargs)*100)
            f1.append(f1_score(Y[test], prediction, **metric_kwargs)*100)

        # AUC is not computed in this notebook; the columns are kept (as NaN)
        # so the CSV schema stays the same as in earlier result files.
        summary = pd.DataFrame(
            [[sampling, classifier_name, feature_name,
              np.mean(accuracy), np.std(accuracy),
              np.mean(precision), np.std(precision),
              np.mean(recall), np.std(recall),
              np.mean(f1), np.std(f1),
              np.nan, np.nan]],
            columns=record_cols)

        summary.to_csv(results_file, mode='a', header=write_header)

    def build_pipeline(estimator):
        """Return the pipeline matching the requested sampling method."""
        if sampling == "original":
            return Pipeline([('classifier', estimator)])

        if sampling == "oversampling":
            # Imported here because the notebook's import cell only provides
            # RandomUnderSampler.
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(random_state=1127, sampling_strategy=sampling_ratio)
        elif sampling == "smote":
            from imblearn.over_sampling import SMOTE
            sampler = SMOTE(random_state=1127, sampling_strategy=sampling_ratio)
        else:  # "downsampling" (validated at the top of automate_result)
            sampler = RandomUnderSampler(random_state=1127, sampling_strategy=sampling_ratio)

        return make_pipeline(sampler, scaler, estimator)

    #########################
    # Classifier Dictionary #
    #########################
    classifier_dict = {'LogisticRegression': lr,
                       'LibSVC': svc}

    #######################
    # Features Dictionary #
    #######################
    # NOTE(review): features are read from the binary-classification tree while
    # labels and results live under the role-classification tree -- confirm
    # this is intentional.
    feature_location = 'amica_binary_classification'
    feature_dir = os.path.join(feature_location, df, 'features', 'selected')
    feature_dict = {}

    for feature_path in glob.glob(os.path.join(feature_dir, "X*.pkl")):
        # e.g. X_AllTextual.pkl -> 'AllTextual'
        feature_name = os.path.basename(feature_path).split('.')[0].split("_")[-1]

        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(feature_path, 'rb') as f:
            feature_dict[feature_name] = pickle.load(f)

    if not feature_dict:
        print("No feature files matching X*.pkl found in " + feature_dir)

    ################
    # Target Label #
    ################
    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(os.path.join(task, df, 'target_class', 'Y_role.pkl'), 'rb') as f:
        Y_label = pickle.load(f)

    # Show the class distribution (separate name so the `df` argument is not shadowed).
    label_frame = pd.DataFrame(list(Y_label))
    print(label_frame[0].value_counts())

    ########################
    # Run through the loop #
    ########################
    first_row = True  # the CSV header is written with the first row only
    for classifier_name, estimator in classifier_dict.items():
        selected_pipeline = build_pipeline(estimator)

        for feature_name, X_feature in tqdm(feature_dict.items()):
            run_model(classifier_name=classifier_name,
                      feature_name=feature_name,
                      splits=10,
                      X=X_feature,
                      Y=Y_label,
                      pipeline=selected_pipeline,
                      average_method='macro', # macro for multiclass, binary for binary classification
                      target_label=None, # Specify Cyberbullying for binary classification
                      write_header=first_row)
            first_row = False
            print()

In [5]:
# ------------------------------------------------------------------
# Data 1: 'amica_data_clean_with_stopword' -- downsampling run
# ------------------------------------------------------------------
automate_result(
    df='amica_data_clean_with_stopword',
    sampling='downsampling',
    sampling_ratio={0: 33363},  # forwarded to the under-sampler as sampling_strategy
)

File deleted
0    106872
1      3596
2      1354
3       425
Name: 0, dtype: int64


  0%|          | 0/2 [00:00<?, ?it/s]

[downsampling, LogisticRegression] Developing Model and Generating Metrics for features: CountVecWordCharAllTextStatSentimentAllDistilBertEmbeddingPycholinguisticLIWC22EmpathTermListsRatioToxicityMBTI



0it [00:00, ?it/s][A
1it [01:51, 111.57s/it][A
2it [03:38, 109.01s/it][A
3it [05:20, 105.81s/it][A
4it [07:04, 104.85s/it][A
5it [08:44, 103.26s/it][A
6it [10:24, 102.06s/it][A
7it [12:01, 100.38s/it][A
8it [13:41, 100.42s/it][A
9it [15:21, 100.13s/it][A
10it [17:02, 102.29s/it][A
 50%|█████     | 1/2 [17:03<17:03, 1023.05s/it]


[downsampling, LogisticRegression] Developing Model and Generating Metrics for features: CountVecWordCharAllTextStatSentimentAllDistilBertEmbeddingPycholinguisticLIWC22EmpathToxicityMBTI



0it [00:00, ?it/s][A
1it [01:44, 104.06s/it][A
2it [03:21, 100.17s/it][A
3it [04:58, 98.79s/it] [A
4it [06:36, 98.38s/it][A
5it [08:14, 98.40s/it][A
6it [09:50, 97.61s/it][A
7it [11:27, 97.25s/it][A
8it [13:06, 97.70s/it][A
9it [14:42, 97.15s/it][A
10it [16:20, 98.03s/it][A
100%|██████████| 2/2 [33:23<00:00, 1001.70s/it]



