In [1]:
import os
import pickle
from itertools import combinations

import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 60)

# Load the pickled metrics table (one row per feature set / classifier pair).
metric_dump_file = 'metric_table.pickle'
if not os.path.isfile(metric_dump_file):
    # FileNotFoundError is still an Exception, so existing handlers keep working.
    raise FileNotFoundError("File " + metric_dump_file + " NOT FOUND!!")
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(metric_dump_file, 'rb') as metric_fh:
    # The context manager closes the handle even if unpickling fails.
    table_matrix = pd.DataFrame.from_dict(pickle.load(metric_fh))
del metric_dump_file, metric_fh

# Keep only the columns used below; .copy() makes the subset an independent
# frame before new columns are assigned to it.
table_matrix = table_matrix[['Feature', 'Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'Confusion Matrix']].copy()
a = table_matrix['Confusion Matrix']
# Number of correctly classified samples = sum of the confusion-matrix diagonal.
# np.trace handles any number of classes instead of a hard-coded 3x3 matrix.
table_matrix['True Count'] = [np.trace(np.asarray(matrix)) for matrix in a]
def feature_type(stringy):
    """Classify a feature-set name into one of four feature families.

    The name is split on underscores and checked in this order:
    last token ``'alphabet'`` -> ``'punctuation'``; first token
    ``'capitals'`` -> ``'capital'``; any token equal to ``'char'`` ->
    ``'character'``; anything else -> ``'word'``.
    """
    tokens = stringy.split('_')
    if tokens[-1] == 'alphabet':
        family = 'punctuation'
    elif tokens[0] == 'capitals':
        family = 'capital'
    elif 'char' in tokens:
        family = 'character'
    else:
        family = 'word'
    return family
# Tag every row with the feature family derived from its feature name.
table_matrix['Feature type'] = [feature_type(name) for name in table_matrix['Feature']]
# The confusion-matrix alias is no longer needed.
del a

# Top char n-grams

In [2]:
# Settings shared by the "top ..." ranking cells below.
evaluation_critera = 'F1 score'  # metric column the feature sets are ranked by
top_number = 4                   # number of feature sets kept per feature type
classifier_type = 'SVM'          # value matched against the 'Classifier' column

In [3]:
# Best `top_number` character n-gram feature sets for the chosen classifier,
# ranked by the chosen metric (highest first).
top_char_features = (
    table_matrix
    .loc[(table_matrix['Feature type'] == 'character')
         & (table_matrix['Classifier'] == classifier_type)]
    .sort_values(evaluation_critera, ascending=False)
    .head(top_number)
)
top_char_features

Unnamed: 0,Feature,Classifier,Accuracy,Precision,Recall,F1 score,Confusion Matrix,True Count,Feature type
126,1-5_char_gram,SVM,0.585472,0.588197,0.585472,0.579502,"[[875, 292, 66], [329, 614, 114], [138, 305, 268]]",1757,character
124,1-6_char_gram,SVM,0.582139,0.585279,0.582139,0.576301,"[[871, 300, 62], [334, 607, 116], [144, 298, 269]]",1747,character
97,2-5_char_gram,SVM,0.58014,0.580481,0.58014,0.574088,"[[876, 292, 65], [331, 596, 130], [150, 292, 269]]",1741,character
88,2-6_char_gram,SVM,0.577807,0.579623,0.577807,0.571627,"[[869, 303, 61], [333, 600, 124], [155, 291, 265]]",1734,character


# Top word n-grams

In [4]:
# Best `top_number` word n-gram feature sets for the chosen classifier,
# ranked by the chosen metric (highest first).
top_word_features = (
    table_matrix
    .loc[(table_matrix['Feature type'] == 'word')
         & (table_matrix['Classifier'] == classifier_type)]
    .sort_values(evaluation_critera, ascending=False)
    .head(top_number)
)
display(top_word_features)

Unnamed: 0,Feature,Classifier,Accuracy,Precision,Recall,F1 score,Confusion Matrix,True Count,Feature type
115,1-2_gram,SVM,0.576141,0.57679,0.576141,0.569729,"[[872, 295, 66], [329, 599, 129], [142, 311, 258]]",1729,word
130,1-4_gram_without_stopwords,SVM,0.571809,0.570333,0.571809,0.564258,"[[874, 289, 70], [334, 593, 130], [159, 303, 249]]",1716,word
120,1-2_gram_without_stopwords,SVM,0.571143,0.570002,0.571143,0.563551,"[[873, 291, 69], [337, 592, 128], [161, 301, 249]]",1714,word
116,unigram,SVM,0.570477,0.571231,0.570477,0.563347,"[[872, 301, 60], [341, 588, 128], [154, 305, 252]]",1712,word


# Top punctuation n-grams

In [5]:
# Best `top_number` punctuation (non-alphabet) n-gram feature sets for the
# chosen classifier, ranked by the chosen metric (highest first).
top_punctuation_features = (
    table_matrix
    .loc[(table_matrix['Feature type'] == 'punctuation')
         & (table_matrix['Classifier'] == classifier_type)]
    .sort_values(evaluation_critera, ascending=False)
    .head(top_number)
)
display(top_punctuation_features)

Unnamed: 0,Feature,Classifier,Accuracy,Precision,Recall,F1 score,Confusion Matrix,True Count,Feature type
66,char_2_gram_non_alphabet,SVM,0.425858,0.364967,0.425858,0.333259,"[[1059, 171, 3], [834, 217, 6], [571, 138, 2]]",1278,punctuation
1,char_3_gram_non_alphabet,SVM,0.422859,0.373381,0.422859,0.326939,"[[1082, 136, 15], [863, 176, 18], [558, 142, 11]]",1269,punctuation
21,char_1_gram_non_alphabet,SVM,0.426524,0.392286,0.426524,0.324395,"[[1100, 129, 4], [874, 176, 7], [610, 97, 4]]",1280,punctuation
71,char_4_gram_non_alphabet,SVM,0.421193,0.402914,0.421193,0.32069,"[[1099, 113, 21], [898, 146, 13], [593, 99, 19]]",1264,punctuation


# Top capital n-grams

In [6]:
# Best `top_number` capital-letter n-gram feature sets for the chosen
# classifier, ranked by the chosen metric (highest first).
top_capital_features = (
    table_matrix
    .loc[(table_matrix['Feature type'] == 'capital')
         & (table_matrix['Classifier'] == classifier_type)]
    .sort_values(evaluation_critera, ascending=False)
    .head(top_number)
)
display(top_capital_features)

Unnamed: 0,Feature,Classifier,Accuracy,Precision,Recall,F1 score,Confusion Matrix,True Count,Feature type
36,capitals_char_2_gram,SVM,0.422526,0.422978,0.422526,0.312094,"[[1134, 83, 16], [936, 107, 14], [597, 87, 27]]",1268,capital
11,capitals_char_3_gram,SVM,0.427857,0.431222,0.427857,0.310054,"[[1163, 57, 13], [946, 96, 15], [603, 83, 25]]",1284,capital
6,capitals_char_1_gram,SVM,0.415861,0.307224,0.415861,0.307479,"[[1099, 134, 0], [908, 149, 0], [603, 108, 0]]",1248,capital
41,capitals_char_4_gram,SVM,0.415861,0.430207,0.415861,0.257282,"[[1222, 6, 5], [1033, 16, 8], [687, 14, 10]]",1248,capital


Load test labels for comparison

In [7]:
# Read the test labels from the 'y_test' entry of one feature pickle.
# NOTE(review): presumably every feature pickle carries the same y_test - confirm.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('features/1-1_char_gram.pickle', 'rb') as labels_fh:
    # The context manager closes the handle instead of leaking it.
    y_test = pickle.load(labels_fh)['y_test']
y_test

1510    1
1802    1
1740    2
2878    1
2719    3
2704    3
1160    1
566     3
1893    2
26      1
1941    1
806     1
2813    2
1853    2
819     1
1026    3
216     3
1125    2
2530    1
1565    2
449     3
1666    3
1691    2
2635    1
1486    2
2020    1
2657    2
2936    1
692     3
2312    2
821     2
2662    1
420     1
2986    3
2542    3
505     2
2127    2
1634    1
1245    1
2188    1
2470    2
1970    3
2346    1
107     1
2595    2
551     2
366     1
2857    1
1737    3
661     1
2209    2
2830    3
2116    2
2502    2
921     2
2539    1
2797    1
1643    3
845     1
1693    1
109     2
1468    1
2451    1
1102    1
1826    2
21      3
1701    2
1332    2
1817    1
80      3
892     3
417     2
2334    1
173     2
104     2
1039    3
2685    3
940     3
55      1
646     2
1684    2
2133    2
1193    1
1519    1
2787    2
1985    1
190     3
2992    1
2370    1
1190    1
975     1
1900    1
781     2
1124    2
761     2
409     2
2921    2
827     2
2434    3
700     3


data_dict = {}
count = 0
total = len(list(top_char_features['Feature']) + \
            list(top_word_features['Feature']) + \
            list(top_punctuation_features['Feature']) + \
            list(top_capital_features['Feature']))

for feature in list(top_char_features['Feature']) + \
            list(top_word_features['Feature']) + \
            list(top_punctuation_features['Feature']) + \
            list(top_capital_features['Feature']):
    file = feature+'.pickle'
    print(feature)
    if os.path.isfile(os.path.join('results/', file)):
        #load the classifier
        data_dict[feature] = [pickle.load(open(os.path.join('results/', file), 'rb'))['SVM'][0]]
        print("Loading of classfier successful!")
        #load the X_train
        if os.path.isfile(os.path.join('features/', file)):
            X_test = pickle.load(open(os.path.join('features/', file), 'rb'))['X_test']
        elif os.path.isfile(os.path.join('extra_features/', file)):
            X_test = pickle.load(open(os.path.join('extra_features/', file), 'rb'))['X_test']
        print("Loading of X_test successful!")
        %time data_dict[feature].append(data_dict[feature][0].predict(X_test.toarray()))
    elif os.path.isfile(os.path.join('results/SVM/', file)):
        #load the classifier
        data_dict[feature] = [pickle.load(open(os.path.join('results/SVM/', file), 'rb'))['SVM'][0]]
        print("Loading of classfier successful!")
        #load the X_train
        if os.path.isfile(os.path.join('features/', file)):
            X_test = pickle.load(open(os.path.join('features/', file), 'rb'))['X_test']
        elif os.path.isfile(os.path.join('extra_features/', file)):
            X_test = pickle.load(open(os.path.join('extra_features/', file), 'rb'))['X_test']
        print("Loading of X_test successful!")
        %time data_dict[feature].append(data_dict[feature][0].predict(X_test.toarray()))
    else:
        raise Exception(file + " not found in either folder.")
    count += 1
    print("Done {} out of {}".format(count, total), end='\n\n\n')

In [8]:
%time data_dict = pickle.load(open("bagging.pickle", 'rb'))

CPU times: user 6.56 s, sys: 24.4 s, total: 31 s
Wall time: 31 s


In [9]:
def most_likely_prediction(prediction_list, likelihood):
    """Return the label that wins a weighted vote.

    Parameters
    ----------
    prediction_list : sequence
        One predicted label per voter. Labels may be any hashable value
        (the tally is a dict, so they need not be small non-negative ints).
    likelihood : sequence of numbers
        Voting weights; ``likelihood[i]`` is added to the tally of
        ``prediction_list[i]``. Entries beyond ``len(prediction_list)``
        are ignored.

    Returns
    -------
    The label with the largest accumulated weight. On a tie the label that
    reached the top score first is kept. ``0`` is returned when no label
    accumulates a weight greater than zero.

    Raises
    ------
    ValueError
        If ``prediction_list`` is empty.
    IndexError
        If ``likelihood`` is shorter than ``prediction_list``.
    """
    if len(prediction_list) == 0:
        raise ValueError("prediction_list must not be empty")

    tally = {}
    maxy = 0  # sentinel winner whose score counts as 0 until a label beats it
    for i, label in enumerate(prediction_list):
        tally[label] = tally.get(label, 0) + likelihood[i]
        # Strict comparison: an equal score never displaces the current leader.
        if tally[label] > tally.get(maxy, 0):
            maxy = label
    return maxy
    

def analyse_and_show(prediction_list, y_true=None):
    """Score an accuracy-weighted vote over several classifiers' predictions.

    Each classifier's share of correct predictions is used as its voting
    weight; for every sample the votes are combined with
    ``most_likely_prediction`` and the combined prediction is compared
    against the true labels.

    Parameters
    ----------
    prediction_list : sequence of array-likes
        One prediction array per classifier. Any number of classifiers is
        accepted (all of them vote, not just the first three).
        NOTE(review): assumes every array supports positional ``[i]``
        indexing and is in the same order as the labels - confirm.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(ensemble_correct, individual_correct)``: the number of samples the
        combined vote gets right, and a list with the number each individual
        classifier gets right.
    """
    if y_true is None:
        y_true = y_test

    # Per-classifier correctness masks and the resulting accuracy weights.
    truth_serum_list = [(prediction == y_true) for prediction in prediction_list]
    likelihood_list = [np.sum(serum)/len(serum) for serum in truth_serum_list]

    # Weighted vote per sample across *all* supplied classifiers.
    n_samples = len(prediction_list[0])
    final_serum = [
        most_likely_prediction([prediction[i] for prediction in prediction_list], likelihood_list)
        for i in range(n_samples)
    ]
    final_serum = (np.array(final_serum) == y_true)
    return np.sum(final_serum), [np.sum(serum) for serum in truth_serum_list]

In [13]:
%%time

from itertools import combinations

for combo in combinations(data_dict.keys(), 3):
    result = analyse_and_show([data_dict[feature][1] for feature in combo])
    if result[0] > 1760:
        print(combo)
        print(result)

('1-5_char_gram', '1-6_char_gram', '2-5_char_gram')
(1761, [1757, 1747, 1741])
('1-5_char_gram', 'char_4_gram_non_alphabet', 'unigram')
(1761, [1757, 1264, 1712])
CPU times: user 17.4 s, sys: 60 ms, total: 17.4 s
Wall time: 17.5 s


In [11]:
# Report each loaded feature set's individual performance on the test labels.
for feature in data_dict:
    y_pred = data_dict[feature][1]
    print(feature)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("F1 score: ", f1_score(y_test, y_pred, average='weighted'))
    # Fraction of matching predictions, followed by the number of test samples.
    correct_fraction = np.sum(y_pred == y_test)/len(y_pred)
    print(correct_fraction, '/', len(y_pred))
    # Three blank lines between feature sets.
    print(end='\n\n\n')

capitals_char_2_gram
Accuracy:  0.422525824725
F1 score:  0.31209357838
0.422525824725 / 3001



1-5_char_gram
Accuracy:  0.585471509497
F1 score:  0.579501600226
0.585471509497 / 3001



1-6_char_gram
Accuracy:  0.582139286904
F1 score:  0.57630130703
0.582139286904 / 3001



capitals_char_1_gram
Accuracy:  0.41586137954
F1 score:  0.307479134525
0.41586137954 / 3001



2-5_char_gram
Accuracy:  0.580139953349
F1 score:  0.574087567708
0.580139953349 / 3001



char_4_gram_non_alphabet
Accuracy:  0.421192935688
F1 score:  0.320689528553
0.421192935688 / 3001



char_1_gram_non_alphabet
Accuracy:  0.426524491836
F1 score:  0.324395387394
0.426524491836 / 3001



char_3_gram_non_alphabet
Accuracy:  0.422859046984
F1 score:  0.326939126116
0.422859046984 / 3001



1-2_gram_without_stopwords
Accuracy:  0.571142952349
F1 score:  0.563551092852
0.571142952349 / 3001



capitals_char_4_gram
Accuracy:  0.41586137954
F1 score:  0.257282314998
0.41586137954 / 3001



unigram
Accuracy:  0.57047650

  'precision', 'predicted', average, warn_for)
