In [1]:
import pickle
import pandas as pd
import numpy as np
import os
from scipy.sparse import hstack
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from multiprocessing import Pool

# Change the boolean flags in the cell below, then run the entire notebook.

In [2]:
# Run-control flags. All three default to False, in which case the notebook
# only loads data and renders the metric table from results already on disk.

# When True, fit one (vectorizer, classifier) bundle per entry of feature_dict
# in a process pool and pickle each bundle into the training save folder.
do_classifier_and_vectorizer_dumping = False
# Only consulted when do_metric_table_dumping is True: True recomputes the
# validation predictions from the pickled bundles and re-writes
# binary_ensemble/prediction.pickle; False loads that file instead.
do_predictions_again = False #if false, will pick up from a dumped file
# When True, evaluate every (OAG feature, NAG feature, general feature)
# combination and write one metrics pickle per combination.
do_metric_table_dumping = False

In [3]:
%%time

#dictionary format for dumping classifiers and vectorizers
# (reference only: these are the keys of every pickled bundle; the 0 values
# are placeholders and this dict is not read anywhere in the visible notebook)
dump_dictionary_format = {
    'OAG_singular_vectorizer': 0,
    'OAG_singular_classifier' : 0,
    'NAG_singular_vectorizer': 0,
    'NAG_singular_classifier' : 0,
    'general_vectorizer': 0,
    'general_classifier' : 0,
}

#aggression label to label_nums
dic_aggression_level = {
    'NAG' : 1,
    'CAG' : 2,
    'OAG' : 3
}

# Inverse lookup (label_num -> label). Built with a comprehension so that no
# loop variable is left behind in the notebook's global namespace.
dic_reverse_aggression_level = {
    label_num: label for label, label_num in dic_aggression_level.items()
}

print(dic_aggression_level, '\n', dic_reverse_aggression_level)

#declaring the required TfidfVectorizer's
max_features = 20000


def _make_tfidf(ngram_range, **overrides):
    """Build an unfitted TfidfVectorizer with the settings shared by every feature set.

    All vectorizers cap the vocabulary at ``max_features`` and use ``min_df=3``;
    ``overrides`` carries the per-family differences (``stop_words`` or ``analyzer``).
    """
    return TfidfVectorizer(ngram_range=ngram_range, max_features=max_features,
                           min_df=3, **overrides)


# Word-level n-gram families, in the order they appear in feature_dict.
_WORD_GRAM_RANGES = (
    ('unigram', (1, 1)),
    ('1-2_gram', (1, 2)),
    ('1-3_gram', (1, 3)),
    ('1-4_gram', (1, 4)),
)

# Character-level (min_n, max_n) ranges, in the order they appear in feature_dict.
_CHAR_GRAM_RANGES = (
    (1, 4), (1, 5), (1, 6), (1, 7), (1, 8),
    (2, 4), (2, 5), (2, 6), (2, 7), (2, 8),
    (3, 4), (3, 7),
    (4, 4), (4, 5),
)

# feature label -> unfitted vectorizer; insertion order is word n-grams,
# word n-grams without English stop words, then character n-grams.
feature_dict = {}
feature_dict.update(
    (label, _make_tfidf(bounds)) for label, bounds in _WORD_GRAM_RANGES
)
feature_dict.update(
    (label + '_without_stopwords', _make_tfidf(bounds, stop_words='english'))
    for label, bounds in _WORD_GRAM_RANGES
)
feature_dict.update(
    ('{}-{}_char_gram'.format(low, high), _make_tfidf((low, high), analyzer='char'))
    for low, high in _CHAR_GRAM_RANGES
)


#train data
# The balancing slices below keep only the first N rows per label, so the
# shuffle decides which rows are trained on. Seeding it makes the selected
# subset (and therefore the pickled models) reproducible across runs.
RANDOM_SEED = 42
train_pd = pd.read_csv("train.csv")
train_pd = train_pd.drop('ID', axis=1)
train_pd = shuffle(train_pd, random_state=RANDOM_SEED)

#data for general classifier
# .copy() gives an independent frame, so adding a column neither triggers
# SettingWithCopyWarning nor depends on view-vs-copy behaviour of the pandas version.
general_data = train_pd[['Data', 'Label']].copy()
general_data['Label_num'] = general_data['Label'].map(dic_aggression_level)
print("\n\ngeneral data\n", general_data['Label'].value_counts())

#data for OAG singular classifier
# Binary task "OAG vs rest": CAG is folded into NAG.
OAG_singular_data = general_data[['Data', 'Label']].copy()
OAG_singular_data['Label'] = OAG_singular_data['Label'].replace('CAG', "NAG")
OAG_singular_data['Label_num'] = OAG_singular_data['Label'].map(dic_aggression_level)
oag_rows = OAG_singular_data[OAG_singular_data['Label'] == 'OAG']
rest_rows = OAG_singular_data[OAG_singular_data['Label'] == 'NAG']
# Oversample the minority class: repeat the first 500 OAG rows after the
# originals, cap OAG at 3200 rows, and pair it with the first 3000 "rest" rows.
OAG_singular_data = pd.concat([
    pd.concat([oag_rows, oag_rows[:500]])[:3200],
    rest_rows[:3000],
])
print("\n\nOAG_singular_data\n", OAG_singular_data['Label'].value_counts())

#data for NAG singular classifier
# Binary task "NAG vs rest": CAG is folded into OAG.
NAG_singular_data = general_data[['Data', 'Label']].copy()
NAG_singular_data['Label'] = NAG_singular_data['Label'].replace('CAG', "OAG")
NAG_singular_data['Label_num'] = NAG_singular_data['Label'].map(dic_aggression_level)
NAG_singular_data = pd.concat([
    NAG_singular_data[NAG_singular_data['Label'] == 'NAG'][:5000],
    NAG_singular_data[NAG_singular_data['Label'] == 'OAG'][:4700],
])
print("\n\nNAG_singular_data\n", NAG_singular_data['Label'].value_counts())

# Folder the trained vectorizer/classifier bundles are pickled into.
save_folder = 'binary_ensemble/'

#dumper function
from copy import deepcopy
def dump_vectorizer_and_classifier(vectorizer, save_path):
    """Fit three linear-SVM classifiers on one feature definition and pickle them.

    For each of the module-level frames ``OAG_singular_data``,
    ``NAG_singular_data`` and ``general_data`` an independent deep copy of
    ``vectorizer`` is fitted on the frame's ``Data`` column, and an
    ``SVC(kernel='linear')`` is trained on the resulting matrix against
    ``Label_num``.

    NOTE: the three frames are read from notebook globals at call time. A later
    cell rebinds the same names to the validation data, so this must run while
    they still hold the training frames.

    Parameters
    ----------
    vectorizer : unfitted vectorizer (e.g. an entry of ``feature_dict``)
        Never fitted itself; only its deep copies are.
    save_path : str
        Destination of the pickled bundle. The bundle is a dict with the keys
        ``OAG_singular_vectorizer``, ``OAG_singular_classifier``,
        ``NAG_singular_vectorizer``, ``NAG_singular_classifier``,
        ``general_vectorizer`` and ``general_classifier``.

    Returns
    -------
    None
    """
    print("Started", save_path)
    # (key prefix, training frame) — listed in the key order of the pickled dict.
    training_sets = (
        ('OAG_singular', OAG_singular_data),
        ('NAG_singular', NAG_singular_data),
        ('general', general_data),
    )
    bundle = {}
    for prefix, frame in training_sets:
        fitted_vectorizer = deepcopy(vectorizer)
        classifier = SVC(kernel='linear')
        classifier.fit(fitted_vectorizer.fit_transform(frame['Data']),
                       frame['Label_num'])
        bundle[prefix + '_vectorizer'] = fitted_vectorizer
        bundle[prefix + '_classifier'] = classifier
    #dumping
    # Context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open(save_path, 'wb') as handle:
        pickle.dump(bundle, handle)
    print("Completed", save_path)

# Train and pickle one bundle per feature definition, in parallel.
if do_classifier_and_vectorizer_dumping:
    # Workers write straight into save_folder; create it up front so a missing
    # directory does not fail every task.
    os.makedirs(save_folder, exist_ok=True)
    process_pool = Pool(processes=30)
    pending_results = [
        (label, process_pool.apply_async(
            dump_vectorizer_and_classifier,
            args=(vectorizer, os.path.join(save_folder, label + '.pickle'))))
        for label, vectorizer in feature_dict.items()
    ]
    process_pool.close()
    process_pool.join()
    # apply_async stores a worker's exception inside its AsyncResult; without
    # calling .get() a failed training run would pass silently and simply
    # leave its pickle missing.
    failed_labels = []
    for label, result in pending_results:
        try:
            result.get()
        except Exception as error:
            failed_labels.append(label)
            print("Failed {}: {!r}".format(label, error))
    if failed_labels:
        raise RuntimeError("Training failed for: {}".format(", ".join(failed_labels)))

{'NAG': 1, 'CAG': 2, 'OAG': 3} 
 {1: 'NAG', 2: 'CAG', 3: 'OAG'}


general data
 NAG    5052
CAG    4240
OAG    2708
Name: Label, dtype: int64


OAG_singular_data
 OAG    3200
NAG    3000
Name: Label, dtype: int64


NAG_singular_data
 NAG    5000
OAG    4700
Name: Label, dtype: int64
Didn't do jack!
CPU times: user 132 ms, sys: 16 ms, total: 148 ms
Wall time: 144 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [4]:
%%time

import warnings
warnings.filterwarnings('ignore')

read_folder = 'binary_ensemble/'
# NOTE: rebinds save_folder from the training bundle folder to the results folder.
save_folder = 'binary_ensemble/results/'
# Trained bundles only: the cached predictions live in the same folder and are
# excluded. Sorted because os.listdir order is arbitrary, which would otherwise
# make the numbering of the result files differ between runs.
feature_file_paths = sorted(
    name for name in os.listdir(read_folder)
    if name.endswith('.pickle') and name != "prediction.pickle"
)
# print(feature_file_paths)
#test data
test_pd = pd.read_csv("valid.csv")
test_pd = test_pd.drop('ID', axis=1)
# Deliberately NOT shuffled: predictions may be loaded from
# binary_ensemble/prediction.pickle (do_predictions_again = False), and they
# only line up with y_test if the row order is identical on every run.
# A cache written while the rows were still being shuffled is misaligned and
# must be regenerated once with do_predictions_again = True.
test_pd['Label_num'] = test_pd.Label.map(dic_aggression_level)
y_test = test_pd['Label_num']
# NOTE: the three names below are rebound from the training frames to the
# validation frames; dump_vectorizer_and_classifier reads them as globals.
#test data for general classifier
general_data = test_pd[['Data', 'Label', 'Label_num']]
print("\n\ngeneral data\n", general_data['Label'].value_counts())
#test data for OAG singular classifier
# .copy() + plain assignment instead of a chained in-place replace, which
# warns on older pandas and is a silent no-op under Copy-on-Write.
OAG_singular_data = general_data[['Data', 'Label']].copy()
OAG_singular_data['Label'] = OAG_singular_data['Label'].replace('CAG', "NAG")
OAG_singular_data['Label_num'] = OAG_singular_data['Label'].map(dic_aggression_level)
print("\n\nOAG_singular_data\n", OAG_singular_data['Label'].value_counts())
#test data for NAG singular classifier
NAG_singular_data = general_data[['Data', 'Label']].copy()
NAG_singular_data['Label'] = NAG_singular_data['Label'].replace('CAG', "OAG")
NAG_singular_data['Label_num'] = NAG_singular_data['Label'].map(dic_aggression_level)
print("\n\nNAG_singular_data\n", NAG_singular_data['Label'].value_counts())

def evaluate(OAG_singular_prediction, NAG_singular_prediction, general_prediction, save_path, feature_label_list):
    """Combine the three classifiers' votes, score them against ``y_test`` and pickle the metrics.

    Parameters
    ----------
    OAG_singular_prediction : sequence of label numbers
        Output of the OAG-vs-rest classifier; every value is the OAG or the
        NAG label number (NAG standing for "not OAG").
    NAG_singular_prediction : sequence of label numbers
        Output of the NAG-vs-rest classifier; every value is the NAG or the
        OAG label number (OAG standing for "not NAG").
    general_prediction : sequence of label numbers
        Output of the three-class classifier (NAG, CAG or OAG).
    save_path : str
        File the metrics dict is pickled to.
    feature_label_list : sequence of three str
        Feature labels used for the OAG, NAG and general classifier, in that order.

    Returns
    -------
    dict
        The metrics that were written to ``save_path`` (feature labels,
        accuracy, weighted precision/recall/F1 and the confusion matrix).

    Raises
    ------
    KeyError
        If a vote triple is not one of the twelve combinations in the table below.

    Notes
    -----
    The ground truth is read from the module-level ``y_test``; the predictions
    must be in the same row order.
    """
    OAG = dic_aggression_level['OAG']
    NAG = dic_aggression_level['NAG']
    CAG = dic_aggression_level['CAG']

    # (OAG-vs-rest vote, NAG-vs-rest vote, general vote) -> final label.
    #  * both binary classifiers say OAG, or both say NAG: that label wins
    #    whatever the general classifier predicts;
    #  * "OAG" from the first and "NAG" from the second contradict each other:
    #    the general classifier breaks the tie;
    #  * "not OAG" from the first and "not NAG" from the second: always CAG.
    map_dict = {
        (OAG, OAG, OAG): OAG,
        (NAG, NAG, OAG): NAG,
        (OAG, NAG, OAG): OAG,
        (NAG, OAG, OAG): CAG,

        (OAG, OAG, NAG): OAG,
        (NAG, NAG, NAG): NAG,
        (OAG, NAG, NAG): NAG,
        (NAG, OAG, NAG): CAG,

        (OAG, OAG, CAG): OAG,
        (NAG, NAG, CAG): NAG,
        (OAG, NAG, CAG): CAG,
        (NAG, OAG, CAG): CAG,
    }
    vote_triples = zip(OAG_singular_prediction, NAG_singular_prediction, general_prediction)
    y_pred = np.array([map_dict[votes] for votes in vote_triples])
    metrics = {
        "OAG_singular_feature": feature_label_list[0],
        "NAG_singular_feature": feature_label_list[1],
        "general_feature": feature_label_list[2],
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted'),
        "recall": recall_score(y_test, y_pred, average='weighted'),
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    }
    # This function is called once per feature combination; close each file
    # deterministically instead of leaving the handles to the garbage collector.
    with open(save_path, 'wb') as handle:
        pickle.dump(metrics, handle)
    return metrics

# Predict (or load cached predictions) for every feature set, then score every
# (OAG feature, NAG feature, general feature) combination.
if do_metric_table_dumping:
    prediction_cache_path = "binary_ensemble/prediction.pickle"
    # evaluate() writes one file per combination into save_folder.
    os.makedirs(save_folder, exist_ok=True)
    if do_predictions_again:
        data_dict = {}
        for count, path in enumerate(feature_file_paths, start=1):
            label = path.split('.')[0]
            # The bundles are pickles written by this notebook; never point
            # read_folder at untrusted files, since unpickling executes code.
            with open(os.path.join(read_folder, path), 'rb') as handle:
                data = pickle.load(handle)
            data_dict[label] = {
                'OAG_singular_predictions' : data['OAG_singular_classifier'].predict(
                    data['OAG_singular_vectorizer'].transform(OAG_singular_data['Data'])),
                'NAG_singular_predictions' : data['NAG_singular_classifier'].predict(
                    data['NAG_singular_vectorizer'].transform(NAG_singular_data['Data'])),
                'general_predictions' : data['general_classifier'].predict(
                    data['general_vectorizer'].transform(general_data['Data'])),
            }
            print("Completed predicting {} out of {}.".format(count, len(feature_file_paths)))
        # Cache the predictions before the long evaluation loop, so a failure
        # there does not throw away the prediction work.
        with open(prediction_cache_path, 'wb') as handle:
            pickle.dump(data_dict, handle)
    else:
        with open(prediction_cache_path, 'rb') as handle:
            data_dict = pickle.load(handle)
    print("Finished predicting.")
    # count doubles as the result file name: <save_folder>/<count>.pickle
    count = 0
    for oag_feature in data_dict:
        for nag_feature in data_dict:
            for general_feature in data_dict:
                evaluate(data_dict[oag_feature]['OAG_singular_predictions'],
                         data_dict[nag_feature]['NAG_singular_predictions'],
                         data_dict[general_feature]['general_predictions'],
                         os.path.join(save_folder, str(count)+'.pickle'),
                         [oag_feature, nag_feature, general_feature])
                count += 1
    del data_dict



general data
 NAG    1233
CAG    1057
OAG     711
Name: Label, dtype: int64


OAG_singular_data
 NAG    2290
OAG     711
Name: Label, dtype: int64


NAG_singular_data
 OAG    1768
NAG    1233
Name: Label, dtype: int64
CPU times: user 124 ms, sys: 4 ms, total: 128 ms
Wall time: 124 ms


In [5]:
# Collect the per-combination metric pickles into one table and show the ten
# combinations with the highest weighted F1 score.
results_folder = "binary_ensemble/results/"
metric_columns = [
    "OAG_singular_feature",
    "NAG_singular_feature",
    "general_feature",
    "accuracy",
    "precision",
    "recall",
    "f1_score",
    "confusion_matrix",
]

metric_records = []
for file_name in os.listdir(results_folder):
    # Skip anything that is not a result pickle (for example a checkpoint
    # directory), which would otherwise make open()/pickle.load fail.
    if not file_name.endswith('.pickle'):
        continue
    # One file per feature combination: close each handle as soon as it is read.
    with open(os.path.join(results_folder, file_name), 'rb') as handle:
        metric_records.append(pickle.load(handle))

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 60)
# One row per result file; columns= fixes the column order.
metric_table = pd.DataFrame(metric_records, columns=metric_columns)
display(metric_table.sort_values("f1_score", ascending=False).head(10))

Unnamed: 0,OAG_singular_feature,NAG_singular_feature,general_feature,accuracy,precision,recall,f1_score,confusion_matrix
6187,3-4_char_gram,1-3_gram_without_stopwords,4-5_char_gram,0.571809,0.57614,0.571809,0.564828,"[[855, 210, 168], [306, 391, 360], [111, 130, 470]]"
3195,3-4_char_gram,1-3_gram_without_stopwords,4-4_char_gram,0.571476,0.575882,0.571476,0.564238,"[[857, 208, 168], [307, 389, 361], [115, 127, 469]]"
9656,4-4_char_gram,1-3_gram_without_stopwords,4-5_char_gram,0.57081,0.574602,0.57081,0.563911,"[[855, 212, 166], [307, 391, 359], [111, 133, 467]]"
5868,4-4_char_gram,1-3_gram_without_stopwords,4-4_char_gram,0.57081,0.574658,0.57081,0.563873,"[[856, 212, 165], [306, 391, 360], [114, 131, 466]]"
8799,3-4_char_gram,1-4_gram_without_stopwords,4-5_char_gram,0.57081,0.574729,0.57081,0.563691,"[[855, 211, 167], [308, 389, 360], [111, 131, 469]]"
9606,3-4_char_gram,1-3_gram_without_stopwords,2-7_char_gram,0.570477,0.574602,0.570477,0.563681,"[[850, 214, 169], [307, 391, 359], [107, 133, 471]]"
2196,3-4_char_gram,1-3_gram_without_stopwords,1-6_char_gram,0.570477,0.574201,0.570477,0.563638,"[[849, 216, 168], [309, 391, 357], [105, 134, 472]]"
10388,3-4_char_gram,1-3_gram_without_stopwords,1-5_char_gram,0.570477,0.574502,0.570477,0.563575,"[[850, 213, 170], [309, 390, 358], [105, 134, 472]]"
8240,3-4_char_gram,unigram_without_stopwords,4-5_char_gram,0.57081,0.574033,0.57081,0.563555,"[[858, 210, 165], [315, 393, 349], [123, 126, 462]]"
8833,4-4_char_gram,1-3_gram_without_stopwords,1-5_char_gram,0.570143,0.573594,0.570143,0.563552,"[[850, 217, 166], [308, 393, 356], [106, 137, 468]]"
