In [4]:
import file_manager
import numpy as np

# Label vectors for the train and test splits
y_train = np.load("./vectorizers/labels_train.npy")
y_test = np.load("./vectorizers/labels_test.npy")

# Index of the stored matrix files: vectorizer type -> split -> list of file names
file_names = {vectorizer: {"train": [], "test": []} for vectorizer in ("bow", "tfidf")}
# Files whose name contains any of these markers are not feature matrices
skipped_markers = ("labels", "vect", ".gitignore")
for matrix_file in file_manager.files_in_path("./vectorizers"):
    if any(marker in matrix_file for marker in skipped_markers):
        continue
    # The first "_"-separated token selects the split, the second the vectorizer type
    parts = matrix_file.split("_")
    file_names[parts[1]][parts[0]].append(matrix_file)

In [5]:
import datetime
import re
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Collected metrics: one dictionary is appended per (matrix file, model) combination
execution_results = []
# Define your models here. Each entry is a three-item list:
#   [model,
#    *dict* with params to be tested through grid search (empty dict -> plain fit),
#    *list* of at most ONE item with the scaler to be used (empty list -> no scaling)]
models = [
    [MultinomialNB(),
        {"alpha": [0.1, 0.2, 0.4, 0.6, 0.8, 1]},
        []],
    [ComplementNB(),
        {"alpha": [0.1, 0.2, 0.4, 0.6, 0.8, 1]},
        []],
    [BernoulliNB(),
        {"alpha": [0.1, 0.2, 0.4, 0.6, 0.8, 1]},
        []],
    # SGD shuffles the training data on every epoch; a fixed seed keeps the
    # reported scores reproducible across runs.
    [SGDClassifier(n_jobs=-1, max_iter=1000, random_state=42),
        {"alpha":[0.0001, 0.001, 0.01]},
        [StandardScaler(with_mean=False)]],
]

# Train and evaluate every model on every stored matrix, appending one result
# dictionary per (matrix file, model) combination to `execution_results`.
for vectorizer_type in file_names.keys():
    # Sort both lists before zipping so the pairing does not depend on the order
    # in which the directory was listed.
    # NOTE(review): assumes matching train/test files differ only in their
    # "train"/"test" prefix - confirm against the vectorizer export step.
    train_files = sorted(file_names[vectorizer_type]["train"])
    test_files = sorted(file_names[vectorizer_type]["test"])
    for train_file, test_file in zip(train_files, test_files):
        X_train = sparse.load_npz("./vectorizers/{}".format(train_file))
        X_test = sparse.load_npz("./vectorizers/{}".format(test_file))

        for classifier in models:
            estimator, param_grid, scalers = classifier
            scaler = scalers[0] if scalers else None
            print("Training {}...".format(type(estimator).__name__))

            begin_time = datetime.datetime.now()
            # Fit the scaler on the training matrix ONLY; the fitted scaler is
            # reused below to transform the test matrix.
            # NOTE: scaling happens before cross-validation, so the validation
            # folds share the scaling statistics of the whole training matrix.
            X_train_model = X_train if scaler is None else scaler.fit_transform(X_train)
            if param_grid:
                # Train with grid search with cross-validation if parameters are provided
                grid = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring="accuracy", verbose=True,
                                    cv=3, n_jobs=-1)
                grid.fit(X_train_model, y_train)
                # Select the best classifier obtained through grid search
                clf = grid.best_estimator_
            else:
                # Default train if no parameters provided
                clf = estimator.fit(X_train_model, y_train)
            measured_time_train = datetime.datetime.now() - begin_time

            # Predict with the test set; the test matrix is transformed with the
            # statistics learned from the training matrix (never re-fitted).
            begin_time = datetime.datetime.now()
            X_test_model = X_test if scaler is None else scaler.transform(X_test)
            y_pred = clf.predict(X_test_model)
            measured_time_test = datetime.datetime.now() - begin_time

            # Save all the results in a dictionary
            target_names = ["non-depression", "depression"]
            train_results = classification_report(y_train, clf.predict(X_train_model),
                                                  target_names=target_names, output_dict=True)
            test_results = classification_report(y_test, y_pred,
                                                 target_names=target_names, output_dict=True)
            execution_results.append({"model": str(clf),
            "train_accuracy": train_results["accuracy"],
            "test_accuracy": test_results["accuracy"],
            "train_results": train_results,
            "test_results": test_results,
            "measured_time_train": str(measured_time_train),
            "measured_time_test": str(measured_time_test),
            # The digits found in the file name are reported as the n-gram range
            "type": "{} with ({}) n-grams".format(vectorizer_type.upper(), "-".join(re.findall(r'\d+', train_file)))})

Training MultinomialNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training ComplementNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training BernoulliNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training SGDClassifier...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Training MultinomialNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training ComplementNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training BernoulliNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training SGDClassifier...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Training MultinomialNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training ComplementNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training BernoulliNB...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Training SGDClassifier...
Fitting 3 folds for each of 3 candidate

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    6.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    6.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   18.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    6.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    7.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18

In [6]:
import pandas as pd

# Build a pandas dataframe with one row per result dictionary and one column per key
execution_results_df = pd.DataFrame(execution_results)

In [7]:
# Persist the results table twice: as an Excel workbook and as a JSON list of records
execution_results_df.to_excel(excel_writer="execution_results.xlsx")
execution_results_df.to_json(path_or_buf="execution_results.json", orient="records")
