In [None]:
from src.models import train_model
import pandas
import pickle
import os

Defining constants

In [None]:
# Root of the data directory, expressed relative to the notebook's working directory.
DATA_PATH = "../data/"

Loading the processed datasets (training, validation and testing splits)

In [None]:
# Load the pre-split training / validation / testing CSV files from DATA_PATH.
# The sub-path components must NOT start with "/": os.path.join discards every
# earlier component as soon as a later one is absolute, which would silently
# ignore DATA_PATH and look for "/processed_data/..." at the filesystem root.
training_set = pandas.read_csv(os.path.join(DATA_PATH, "processed_data", "training_set.csv"))
validation_set = pandas.read_csv(os.path.join(DATA_PATH, "processed_data", "validation_set.csv"))
testing_set = pandas.read_csv(os.path.join(DATA_PATH, "processed_data", "testing_set.csv"))

Choosing the best baseline model

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import make_scorer


In [None]:
# Candidate baseline classifiers, keyed by a short identifier that is reused
# as the key of every per-model result dictionary further down.
models = dict(
    lg=LogisticRegression(),
    gnb=GaussianNB(),
    bnb=BernoulliNB(),
)

# One (currently empty) hyperparameter mapping per candidate model; each entry
# is handed to train_model.fine_tune_model together with its model.
hyperparams = {model_key: {} for model_key in models}

Feature Selection

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

# Macro-averaged recall scorer; it is reused by the tuning and testing cells.
# make_scorer expects the metric *function* itself -- keyword arguments such as
# average='macro' are forwarded to it each time the scorer is invoked.
metric = make_scorer(recall_score, average='macro', greater_is_better=True)

# "category" is the target column (the same name is used when tuning/testing).
X_train = training_set.drop(columns=['category'])
y_train = training_set['category']

feature_importances = {}

for model_name, model in models.items():
    selector = RFECV(
        estimator=model,
        step=2,
        min_features_to_select=4,
        cv=StratifiedKFold(n_splits=5),
        n_jobs=-1,
        scoring=metric,
    )
    try:
        # support_ and cv_results_ only exist once the selector has been fitted.
        selector.fit(X_train, y_train)
    except ValueError as error:
        # NOTE(review): RFECV ranks features through the estimator's `coef_` or
        # `feature_importances_`. Depending on the scikit-learn version, the
        # Naive Bayes estimators may not expose either, in which case fit()
        # raises ValueError. Record the reason instead of aborting the loop.
        feature_importances[model_name] = {
            'important_features': None,
            'error': str(error),
        }
        continue

    feature_importances[model_name] = {
        # Names of the columns kept by the recursive elimination.
        'important_features': list(X_train.columns[selector.support_]),
        # Per-step cross-validation scores reported by RFECV.
        'cv_results': selector.cv_results_,
    }

feature_importances

Hyperparameter Tuning

In [None]:
# Tune every candidate model and collect the outcome per model key.
# `metric` (the macro-recall scorer built in the feature-selection cell) is the
# scoring object handed to the tuner; the former `loss_function = make_scorer()`
# line was removed because make_scorer() without a score function raises
# TypeError and the resulting name was never used.
output = {}

for model_name, model in models.items():
    # fine_tune_model lives in src.models.train_model; it is unpacked here as a
    # (score, fitted-or-selected model) pair -- presumably the best CV score and
    # the corresponding estimator; confirm against its implementation.
    score, best_model = train_model.fine_tune_model(
        k_cross=5,
        training_set=training_set,
        target_variable="category",
        hyperparams=hyperparams[model_name],
        model=model,
        loss_function_or_scorer_metric=metric,
    )
    output[model_name] = {
        "best_model": best_model,
        "best_score": score
    }

Visualizing output and choosing best baseline model

In [None]:
# Display the per-model tuning results ("best_model" / "best_score" for each key).
output

In [None]:
# Keep the tuned Gaussian Naive Bayes model ("gnb" key) as the baseline.
# NOTE(review): the key is hard-coded -- presumably picked by hand after
# inspecting `output`; re-check it whenever the scores change.
chosen_model = output['gnb']['best_model']

Testing model using cross-validation on test set

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

# Split the test set into features and target. The features must be passed
# without the "category" column, otherwise the label leaks into the inputs,
# and the target must be passed explicitly so the folds can be stratified
# and the scorer has ground truth to compare against.
X, Y = testing_set.drop(columns=['category']), testing_set['category']

# NOTE(review): cross_validate fits clones of `chosen_model` on folds of the
# test set; it does not score the already-trained estimator as-is.
cv = cross_validate(
    estimator=chosen_model,
    scoring=metric,
    X=X,
    y=Y,
    # Fixed seed so the shuffled folds (and the reported scores) are reproducible.
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)
print('test metric score: %s' % cv['test_score'])

Estimating model performance according to baseline metrics

Saving baseline model

In [None]:
# Persist the chosen baseline model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails part-way through.
with open('../models/classifier.pkl', mode='wb') as model_file:
    pickle.dump(chosen_model, model_file)