# Imports

In [1]:
import os
import pandas as pd
from multiprocessing import Pool
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Get data locations(path)

In [2]:
#folder that contains the features and to-be-dumped files
folder = 'combined_features'
#path to the features dumped file
path_to_features_list = list(map(lambda z: os.path.join(folder, z), list(filter(lambda x: x.split('.')[-1] == 'pickle', os.listdir(folder)))))
#create a new folder for dumping new files
count = 0
folder_to_be_made = True
while os.path.exists(os.path.join(folder, 'folder_'+str(count))):
    if len(os.listdir(os.path.join(folder, 'folder_'+str(count)))) == 0:
        folder_to_be_made = False
        break
    count += 1
write_folder = os.path.join(folder, 'folder_'+str(count))
if folder_to_be_made:
    os.makedirs(write_folder)
print("Path to features: {}\n\nWrite Folder: {}".format(path_to_features_list, write_folder))

Path to features: ['combined_features/part_features_3.pickle', 'combined_features/part_features_2.pickle', 'combined_features/part_features_7.pickle', 'combined_features/part_features_10.pickle', 'combined_features/part_features_4.pickle', 'combined_features/part_features_9.pickle', 'combined_features/part_features_0.pickle', 'combined_features/part_features_8.pickle', 'combined_features/part_features_6.pickle', 'combined_features/part_features_5.pickle', 'combined_features/part_features_1.pickle']

Write Folder: combined_features/folder_0


# RUN models and save accuracy

In [3]:
def pickler_function(save_path, feature_list, data):
    model = SVC(kernel='linear')
#     model = GridSearchCV(estimator=model_classifier, param_grid={'kernel': ['rbf', 'linear']}, n_jobs=1)
#     model = RandomForestClassifier()
#     print(model)
#     return
    print("Trying to fit", feature_list)
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']
    %time model.fit(X_train, y_train)
    print("Finished fitting", feature_list)
    y_pred = model.predict(X_test)
    print("Finished predicting", feature_list)
    evaluation_dict = {
        'Feature 1': feature_list[0], 
        'Feature 2': feature_list[1], 
        'Feature 3': feature_list[2], 
        'Feature 4': feature_list[3], 
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 score': f1_score(y_test, y_pred, average='weighted'),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }
    print("Finished evaluating", feature_list)
    pickle.dump(evaluation_dict, open(save_path, 'wb'))
    print("Dumping successful at", save_path)


# Dump the classifier evaluations

In [None]:
count = 0
for path in path_to_features_list:
    data_dict = pickle.load(open(path, 'rb'))

    print(data_dict.keys())
    print(len(data_dict['Data']))

    process_pool = Pool(processes=40)

    for i in range(len(data_dict['Data'])):
        process_pool.apply_async(pickler_function, args=(os.path.join(write_folder, "SVM_"+str(count)+'.pickle'), [data_dict['Feature '+str(j)][i] for j in range(1, 5)], data_dict['Data'][i]))
        count += 1
        
    process_pool.close()
    process_pool.join()

print("\n\n\nDONE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

dict_keys(['Data', 'Feature 3', 'Feature 4', 'Feature 1', 'Feature 2'])
48
Trying to fit ['1-5_char_gram', 'unigram_without_stopwords', 'capitals_char_1_gram', '-']
Trying to fit ['1-5_char_gram', 'char_4_gram_non_alphabet', '1-4_gram', '-']
Trying to fit ['1-5_char_gram', 'char_1_gram_non_alphabet', '1-4_gram', '-']
Trying to fit ['1-5_char_gram', 'char_5_gram_non_alphabet', '1-4_gram', '-']
Trying to fit ['char_2_gram_non_alphabet', '1-5_char_gram', '1-4_gram', '-']
Trying to fit ['1-5_char_gram', 'capitals_char_2_gram', '1-4_gram', '-']
Trying to fit ['1-5_char_gram', 'capitals_char_3_gram', '1-4_gram', '-']
Trying to fit ['capitals_char_1_gram', '1-5_char_gram', '1-4_gram', '-']
Trying to fit ['1-5_char_gram', 'capitals_char_2_gram', 'char_4_gram_non_alphabet', '-']
Trying to fit ['1-5_char_gram', 'char_4_gram_non_alphabet', 'capitals_char_3_gram', '-']
Trying to fit ['1-5_char_gram', 'char_4_gram_non_alphabet', 'capitals_char_1_gram', '-']
Trying to fit ['1-5_char_gram', 'char_1_g