# Imports

In [14]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from multiprocessing import Pool
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Get data locations(path)

In [15]:
path_to_features_list = ['features/'+i for i in os.listdir('features')] + \
                         ['extra_features/'+i for i in os.listdir('extra_features')] + \
                        ['pos_features/'+i for i in os.listdir('pos_features')] 
print(len(path_to_features_list))

32


In [16]:
data = pickle.load(open(path_to_features_list[0], 'rb'))
X_train = data['X_train']
y_train = data['y_train']
X_test = data['X_test']
y_test = data['y_test']

print(y_train.value_counts(), "\n\n", y_test.value_counts())

3    8716
1    6285
Name: Label_num, dtype: int64 

 1    630
3    286
Name: Label_num, dtype: int64


# RUN models and save accuracy

In [17]:
def pickler_function(estimator, grid_label, save_path, feature_path):
    model = estimator
    data = pickle.load(open(feature_path, 'rb'))
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']
#     print(model)
#     return
    print("Trying to fit", save_path, X_train.shape)
    %time model.fit(X_train, y_train)
    evaluation_dict = {'Criteria': ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'Confusion Matrix']}
    print("Finished fitting", save_path)
    y_pred = model.predict(X_test)
    print("Finished predicting", save_path)
    evaluation_dict[grid_label] = [
                                    model,
                                    accuracy_score(y_test, y_pred),
                                    precision_score(y_test, y_pred, average='weighted'),
                                    recall_score(y_test, y_pred, average='weighted'),
                                    f1_score(y_test, y_pred, average='weighted'),
                                    confusion_matrix(y_test, y_pred)
                                    ]
    print("Finished evaluating", save_path)
    pickle.dump(evaluation_dict, open(save_path, 'wb'))
    print("Dumping successful for", save_path)


# Change for different classifiers

In [18]:
%%time

num_load_features_at_once = 50
# copy_of_path_to_features_list = list(path_to_features_list)
force_write = True
process_pool = Pool(processes=num_load_features_at_once)
c_value = {'rbf': 1, 'linear': 1}
write_folder = 'results/SVM_brown/'

#empty directory
for path in os.listdir(write_folder):
    if os.path.isfile(os.path.join(write_folder, path)):
        os.remove(os.path.join(write_folder, path))
        print("Removed", path)
    
for read_path in path_to_features_list:
    write_path = write_folder + read_path.split('/')[-1]
    if os.path.isfile(write_path) and not force_write:
        print(write_path, "already exists. Moving on........\n\n")
        continue
    process_pool.apply_async(pickler_function, args=(
                                                    SVC(C=c_value['linear'], kernel='linear'),
                                                    'SVM_linear',
                                                    write_path,
                                                    read_path
                                                    )
                            )
process_pool.close()
process_pool.join()
print("Completed.")

Trying to fit results/SVM_brown/trigram.pickle (15001, 11240)
Trying to fit results/SVM_brown/quadgram.pickle (15001, 5320)
Trying to fit results/SVM_brown/3-4_gram.pickle (15001, 16560)
Trying to fit results/SVM_brown/fivegram.pickle (15001, 4265)
Trying to fit results/SVM_brown/bigram.pickle (15001, 20000)
Trying to fit results/SVM_brown/unigram.pickle (15001, 8656)
Removed char_3_gram_non_alphabet.pickle
Removed capitals_char_1_gram.pickle
Removed 1-3_gram_without_stopwords.pickle
Removed 7-7_char_gram.pickle
Removed capitals_char_3_gram.pickle
Removed quadgram.pickle
Removed char_7_gram_non_alphabet.pickle
Removed 5-6_char_gram.pickle
Removed char_1_gram_non_alphabet.pickle
Removed 1-1_char_gram.pickle
Removed capitals_char_8_gram.pickle
Removed trigram.pickle
Removed capitals_char_6_gram.pickle
Removed 3-4_gram.pickle
Removed capitals_char_2_gram.pickle
Removed capitals_char_4_gram.pickle
Removed bigram_without_stopwords.pickle
Removed quadgram_without_stopwords.pickle
Removed 3-4

Wall time: 6min 18s
Finished fitting results/SVM_brown/2-3_char_gram.pickle
Finished predicting results/SVM_brown/2-3_char_gram.pickle
Finished evaluating results/SVM_brown/2-3_char_gram.pickle
Dumping successful for results/SVM_brown/2-3_char_gram.pickle
CPU times: user 6min 47s, sys: 1.21 s, total: 6min 48s
Wall time: 6min 49s
Finished fitting results/SVM_brown/3-4_char_gram.pickle
Finished predicting results/SVM_brown/3-4_char_gram.pickle
Finished evaluating results/SVM_brown/3-4_char_gram.pickle
Dumping successful for results/SVM_brown/3-4_char_gram.pickle
CPU times: user 7min 5s, sys: 1.36 s, total: 7min 7s
Wall time: 7min 7s
Finished fitting results/SVM_brown/1-3_char_gram.pickle
Finished predicting results/SVM_brown/1-3_char_gram.pickle
Finished evaluating results/SVM_brown/1-3_char_gram.pickle
Dumping successful for results/SVM_brown/1-3_char_gram.pickle
CPU times: user 7min 37s, sys: 1.04 s, total: 7min 38s
Wall time: 7min 38s
Finished fitting results/SVM_brown/3-5_char_gram.p