In [9]:
import file_manager
import numpy as np

# Ground-truth labels stored alongside the vectorized matrices
y_train = np.load("./vectorizers/labels_train.npy")
y_test = np.load("./vectorizers/labels_test.npy")

# Matrix file names grouped as file_names[<vectorizer>][<split>].
# The first two "_"-separated tokens of each file name pick the bucket:
# token 0 is the split ("train"/"test"), token 1 the vectorizer ("bow"/"tfidf").
file_names = {"bow": {"train": [], "test": []}, "tfidf": {"train": [], "test": []}}
# Iterate the listing in sorted order so the "train" and "test" lists are filled
# in the same relative order on every run and platform; they are later paired
# positionally, which must not depend on the directory enumeration order.
# NOTE(review): assumes matching train/test files differ only in the split prefix.
for file in sorted(file_manager.files_in_path("./vectorizers")):
    # Skip label arrays, persisted vectorizers and the .gitignore placeholder
    if "labels" not in file and "vect" not in file and ".gitignore" not in file:
        name = file.split("_")
        file_names[name[1]][name[0]].append(file)

In [10]:
import datetime
import re

from joblib import dump, load
from scipy import sparse
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import StandardScaler

# One dict per (dataset, model) run is appended here by the training loop
execution_results = []
# Define your models here:
# [model, *dict* with params to be tested, *list* of ONE item with the scaler to be used]
# An empty dict means "no grid search"; an empty list means "no scaling".
models = [
    [MultinomialNB(),
        {"alpha": [0.1, 0.2, 0.4, 0.6, 0.8, 1]},
        []],
    [ComplementNB(),
        {"alpha": [0.1, 0.2, 0.4, 0.6, 0.8, 1]},
        []],
    # random_state is pinned so SGD's data shuffling (and therefore the reported
    # scores and the selected best model) is repeatable across notebook runs.
    [SGDClassifier(n_jobs=-1, max_iter=1000, random_state=42),
        {"alpha": 10.0 ** -np.arange(1,7)},  # 1e-1 ... 1e-6
        []],
]

# Train every model spec on every (vectorizer, n-gram) dataset, record train/test
# metrics and timings, and remember the classifier with the highest test accuracy.
best_score = 0
best_clf = None
for vectorizer_type, split_files in file_names.items():
    # Train and test matrices are paired by position in their respective lists
    for train_file, test_file in zip(split_files["train"], split_files["test"]):
        X_train = sparse.load_npz("./vectorizers/{}".format(train_file))
        X_test = sparse.load_npz("./vectorizers/{}".format(test_file))

        for classifier in models:
            estimator, param_grid, scalers = classifier[:3]
            scaler = scalers[0] if scalers else None
            print("Training {}...".format(type(estimator).__name__))

            begin_time = datetime.datetime.now()
            # Fit the scaler (if any) exactly once, on the training data only
            X_train_fit = X_train if scaler is None else scaler.fit_transform(X_train)
            if param_grid:
                # Train with grid search with cross-validation if parameters are provided
                grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3,
                                    n_jobs=-1)
                grid.fit(X_train_fit, y_train)
                # Select the best classifier obtained through grid search
                clf = grid.best_estimator_
            else:
                # Default train if no parameters provided. Fit a clone so the
                # prototype stored in `models` is never refitted in place;
                # otherwise a later dataset would overwrite the model that
                # `best_clf` points to.
                clf = clone(estimator).fit(X_train_fit, y_train)
            measured_time_train = datetime.datetime.now() - begin_time

            # Predict with the test set and labels
            begin_time = datetime.datetime.now()
            # Reuse the statistics learned from the training data: transform only,
            # never refit the scaler on the test set.
            X_test_fit = X_test if scaler is None else scaler.transform(X_test)
            y_pred = clf.predict(X_test_fit)
            measured_time_test = datetime.datetime.now() - begin_time

            # Save all the results in a dictionary
            train_results = classification_report(y_train, clf.predict(X_train_fit), target_names=["non-depression", "depression"], output_dict=True)
            test_results = classification_report(y_test, y_pred, target_names=["non-depression", "depression"], output_dict=True)
            execution_results.append({"model": str(clf),
            "train_accuracy": train_results["accuracy"],
            "test_accuracy": test_results["accuracy"],
            "train_results": train_results,
            "test_results": test_results,
            "measured_time_train": str(measured_time_train),
            "measured_time_test": str(measured_time_test),
            # The digits found in the file name are reported as the n-gram range
            "type": "{} with ({}) n-grams".format(vectorizer_type.upper(), "-".join(re.findall(r'\d+', train_file)))})

            # Keep the model with the best test accuracy seen so far
            if test_results["accuracy"] > best_score:
                best_score = test_results["accuracy"]
                best_clf = clf

Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...
Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...
Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...
Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...
Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...
Training MultinomialNB...
Training ComplementNB...
Training BernoulliNB...
Training SGDClassifier...


In [11]:
import pandas as pd

# Build a pandas dataframe with one row per recorded run (dict keys become columns)
execution_results_df = pd.DataFrame(execution_results)

In [12]:
# Persist the results table: once as an Excel workbook, once as JSON records
execution_results_df.to_excel(excel_writer="execution_results.xlsx")
execution_results_df.to_json(path_or_buf="execution_results.json", orient="records")

In [None]:
# Write the selected classifier to disk; nothing is written when best_clf is None
if best_clf is not None:
    dump(value=best_clf, filename="./best_model.joblib")

In [None]:
# Reload the persisted classifier and score a single example.
# NOTE(review): the classifiers in this notebook are fitted on pre-vectorized
# sparse matrices, so the raw text presumably has to be transformed with the
# matching fitted vectorizer before it reaches the model. Which vectorizer
# belongs to best_model.joblib is not recorded here — TODO confirm and wire it in.
clf = load("./best_model.joblib")
test_sentence = ""
# Estimators take a collection of samples, so wrap the single sentence in a list
# (the same form is used for the scoring calls below).
predicted_result = clf.predict([test_sentence])

if hasattr(clf, "decision_function"):
    # Signed confidence score: the more positive, the more depressive and vice versa
    predicted_proba = clf.decision_function([test_sentence])
else:
    # Probability of the second class (presumably "depression" — confirm label order)
    predicted_proba = clf.predict_proba([test_sentence])[:, 1]

print(predicted_result)
print(predicted_proba)