In [109]:
##CHANGE FILE NAMES AS PER THE MODEL.

#!/usr/bin/env python3

import csv
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix

In [110]:
# Names of the label columns, in the order used for file names and
# positional column lookups throughout this notebook.
cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [111]:
# Read the reference labels for the test set from ./data/test_labels.csv.
true_labels = pd.read_csv('./data/test_labels.csv')

In [112]:
# Keep only the rows whose `toxic` value is not -1.
# NOTE(review): presumably -1 marks rows that carry no usable label — confirm
# against the dataset description.
true_labels = true_labels[true_labels.toxic != -1]

In [113]:
# Switch to a plain NumPy array; from here on columns are addressed by position.
true_labels = true_labels.values

In [114]:
# Read the model's predictions. The file name is model-specific and has to be
# changed by hand when evaluating a different model.
pred_labels = pd.read_csv('./NBSVM_results.csv')

In [115]:
# Switch to a plain NumPy array so the id and score columns can be sliced apart.
pred_labels = pred_labels.values

In [116]:
# First column of the predictions, kept as an (n, 1) column so it can be
# concatenated back next to the score columns.
ids = pred_labels[:, 0].reshape(-1, 1)

In [117]:
# Everything after the first column: the per-label prediction scores.
labels = pred_labels[:, 1:]

In [118]:
# Binarize the scores: strictly greater than 0.5 counts as a positive prediction.
labels = labels > 0.5

In [119]:
# Turn the boolean mask into 0/1 integers.
labels = labels.astype(int)

In [120]:
# Re-attach the id column in front of the binarized predictions.
pred_labels = np.concatenate((ids, labels), axis=1)

In [121]:
# Wrap the array in a DataFrame again (integer column names 0..k) so it can be merged.
true_labels_df = pd.DataFrame(data=true_labels)

In [122]:
# Same for the predictions: integer column names, column 0 holds the ids.
pred_labels_df = pd.DataFrame(data=pred_labels)

In [123]:
# Join the two frames on column 0 so each row pairs the reference labels with
# the predictions for the same id (pd.merge defaults to an inner join, so ids
# present in only one frame are dropped).
# NOTE(review): assumes column 0 of the reference-label file is the id as well — confirm.
all_labels = pd.merge(true_labels_df, pred_labels_df, on=[0])

In [124]:
# Drop the first column (the merge key); only label columns remain.
all_labels = all_labels.drop(all_labels.columns[0], axis=1)

In [125]:
# Final evaluation matrix as an int8 NumPy array: reference-label columns come
# first, followed by the prediction columns from the merge.
all_labels = np.array(all_labels.values, dtype=np.int8)

In [126]:
def make_cm(modelname, labels):
    """Write one confusion-matrix CSV per label column.

    For each name in the module-level ``cols`` list (position ``i``), column
    ``i`` of ``labels`` is used as the reference labels and column
    ``len(cols) + i`` as the predictions. The file
    ``<modelname>_<label>.csv`` is (over)written in the current working
    directory with a single row: ``tn, fp, fn, tp, accuracy``.

    Parameters
    ----------
    modelname : str
        Prefix used for the output file names.
    labels : 2-D array
        Reference labels in the first ``len(cols)`` columns, predictions in
        the following ``len(cols)`` columns; values are expected to be 0/1.
    """
    n_labels = len(cols)
    for i, col in enumerate(cols):
        y_true = labels[:, i]
        y_pred = labels[:, n_labels + i]
        # Pin the class set so the matrix is always 2x2. Without it, a column
        # in which only one class occurs yields a 1x1 matrix and the four-way
        # unpack below raises.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        accuracy = (tp + tn) / (tn + fp + fn + tp)
        filename = modelname + '_' + col + '.csv'
        # newline='' is what the csv module expects for files it writes;
        # otherwise extra blank lines appear on some platforms.
        with open(filename, 'w', newline='') as f:
            csv.writer(f).writerow([tn, fp, fn, tp, accuracy])

In [127]:
# Write the per-label confusion-matrix files (NBSVM_<label>.csv) for these predictions.
make_cm('NBSVM', all_labels)

In [128]:
def take_average(path, modelname):
    """Average the per-label accuracies and start the model's summary file.

    Reads the fifth field of the first row (the accuracy written by
    ``make_cm``) from every ``<path>/<modelname>_<label>.csv``, with one file
    per entry of the module-level ``cols`` list, and writes their mean as the
    only row of ``<path>/average_<modelname>.csv``. The file is overwritten.

    The output name is derived from ``modelname`` and ``path`` so that it is
    the same file the other summary functions append to; it is no longer
    hard-coded to ``average_NBSVM.csv`` in the working directory.

    Parameters
    ----------
    path : str or Path
        Directory that holds the per-label files and receives the summary.
    modelname : str
        Prefix of the per-label files and suffix of the summary file.
    """
    base = Path(path)
    total = 0
    for col in cols:
        file = base.joinpath(modelname + '_' + col + '.csv')
        # Only the first row is needed; nrows=1 keeps this independent of any
        # rows appended to the file later on.
        first_row = pd.read_csv(file, header=None, nrows=1)
        total += first_row.iloc[0, 4]
    average = total / len(cols)
    filename = base.joinpath('average_' + modelname + '.csv')
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerow([average])

In [129]:
# Average the per-label accuracies and write them as the first row of the summary file.
take_average('./', 'NBSVM')

In [130]:
# Load the training set and take its six label columns as an array.
# NOTE(review): this rebinds `labels`, which earlier held the binarized
# predictions, and the next two cells read the same file and build the same
# array again — one of the two loads looks redundant.
train = pd.read_csv('./data/train.csv')
labels = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [131]:
# Read the training set from ./data/train.csv.
labels_df = pd.read_csv('./data/train.csv')
# (The filter for rows with at least one label set is applied in a later cell.)

In [132]:
# Label columns of the training set as a NumPy array, in `cols` order.
labels = labels_df[cols].values

In [133]:
# Rows in which at least one label is non-zero (i.e. drop the all-zero rows).
nonempty = labels[~np.all(labels == 0, axis=1)]

In [134]:
# Number of training rows that carry at least one label.
print(len(nonempty))

16225


In [135]:
def calculate_precision_and_recall(modelname, labels, path):
    """Append precision, recall and balanced accuracy to each per-label file.

    For every entry of the module-level ``cols`` list, the confusion-matrix
    counts are read from the first row of ``path + modelname + '_' + label +
    '.csv'`` and one row ``precision, recall, balanced_accuracy`` is appended
    to that same file.

    Balanced accuracy is the mean of recall (true-positive rate) and
    specificity (true-negative rate). The previous version averaged
    *precision* with specificity, which is not balanced accuracy.

    A ratio whose denominator is zero is written as ``nan``.

    Parameters
    ----------
    modelname : str
        Prefix of the per-label files.
    labels : any
        Not used; the metrics come from the counts stored in the files.
        Kept so existing calls keep working.
    path : str
        Directory prefix, concatenated as-is (include the trailing slash).
    """
    def _ratio(numerator, denominator):
        # Explicit guard so an empty class gives nan without a NumPy warning.
        return numerator / denominator if denominator else float('nan')

    for col in cols:
        filename = path + modelname + '_' + col + '.csv'
        # Read just the first row: it holds tn, fp, fn, tp, accuracy. Rows
        # appended by earlier runs of this function are ignored, so running
        # it again no longer breaks the unpack.
        counts = pd.read_csv(filename, header=None, nrows=1)
        tn, fp, fn, tp = counts.iloc[0, :4]
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        specificity = _ratio(tn, tn + fp)
        balanced_accuracy = (recall + specificity) / 2
        with open(filename, 'a', newline='') as f:
            csv.writer(f).writerow([precision, recall, balanced_accuracy])

In [136]:
# Append a precision / recall / balanced-accuracy row to every per-label file.
calculate_precision_and_recall('NBSVM', all_labels, './')

In [137]:
def take_avg_acc(modelname, path):
    """Append the mean balanced accuracy to the model's summary file.

    For every entry of the module-level ``cols`` list, reads the third field
    of the second row of ``path + modelname + '_' + label + '.csv'`` (the
    balanced accuracy appended by ``calculate_precision_and_recall``),
    averages the values and appends the mean as one row to
    ``path + 'average_' + modelname + '.csv'``.

    Parameters
    ----------
    modelname : str
        Prefix of the per-label files and suffix of the summary file.
    path : str
        Directory prefix, concatenated as-is (include the trailing slash).
    """
    total = 0
    for col in cols:
        filename = path + modelname + '_' + col + '.csv'
        # pandas opens the file itself; no separate handle or writer is needed.
        df = pd.read_csv(filename, header=None)
        total += df.iloc[1, 2]
    mean_balanced_accuracy = total / len(cols)
    save_file = path + 'average_' + modelname + '.csv'
    with open(save_file, 'a', newline='') as f:
        csv.writer(f).writerow([mean_balanced_accuracy])

In [138]:
# Append the mean balanced accuracy to the summary file.
take_avg_acc('NBSVM', './')

In [139]:
from sklearn.metrics import log_loss

def cal_log_loss(modelname, labels, path):
    """Append the mean per-label log loss to the model's summary file.

    For each position ``i`` in the module-level ``cols`` list, column ``i``
    of ``labels`` is the reference and column ``len(cols) + i`` the
    prediction. The per-label losses are averaged and the mean is appended
    as one row to ``path + 'average_' + modelname + '.csv'``.

    NOTE(review): the notebook passes predictions that were already
    thresholded to 0/1. Log loss is normally computed on predicted
    probabilities, so the value may not mean what is intended — confirm.

    Parameters
    ----------
    modelname : str
        Suffix of the summary file name.
    labels : 2-D array
        Reference labels followed by predictions, ``len(cols)`` columns each.
    path : str
        Directory prefix, concatenated as-is (include the trailing slash).
    """
    n_labels = len(cols)
    total = 0
    for i in range(n_labels):
        y_true = np.array(labels[:, i], dtype=np.int8)
        y_pred = labels[:, n_labels + i]
        # Name both classes explicitly: log_loss raises when y_true contains
        # a single class and no label set is given.
        total += log_loss(y_true, y_pred, labels=[0, 1])
    average_log_loss = total / n_labels
    save_file = path + 'average_' + modelname + '.csv'
    with open(save_file, 'a', newline='') as f:
        csv.writer(f).writerow([average_log_loss])

In [140]:
# Append the mean log loss to the summary file.
# NOTE(review): all_labels holds predictions already thresholded to 0/1, not probabilities.
cal_log_loss('NBSVM', all_labels, './')

In [141]:
from sklearn.metrics import hamming_loss

def cal_hamming_loss(modelname, labels, path):
    """Append the mean per-label Hamming loss to the model's summary file.

    For each position ``i`` in the module-level ``cols`` list, column ``i``
    of ``labels`` is the reference and column ``len(cols) + i`` the
    prediction. The per-label losses are averaged and the mean is appended
    as one row to ``path + 'average_' + modelname + '.csv'``.

    Parameters
    ----------
    modelname : str
        Suffix of the summary file name.
    labels : 2-D array
        Reference labels followed by predictions, ``len(cols)`` columns each.
    path : str
        Directory prefix, concatenated as-is (include the trailing slash).
    """
    n_labels = len(cols)
    total = 0
    for i in range(n_labels):
        y_true = labels[:, i]
        y_pred = labels[:, n_labels + i]
        total += hamming_loss(y_true, y_pred)
    average_hamming_loss = total / n_labels
    save_file = path + 'average_' + modelname + '.csv'
    # newline='' is what the csv module expects for files it writes.
    with open(save_file, 'a', newline='') as f:
        csv.writer(f).writerow([average_hamming_loss])

In [142]:
# Append the mean Hamming loss to the summary file.
cal_hamming_loss('NBSVM', all_labels, './')