In [5]:
import sys 
sys.path.append("../")
sys.path.append("../../../")

In [6]:
from src.models import train_model
import pandas
import pickle
import os
from baseline_requirements import metrics 
from text_classification import text

Defining constants

In [7]:
DATA_PATH = "../data/"

Loading raw dataset

In [9]:
training_set = pandas.read_csv(os.path.join(DATA_PATH, "/processed_data/training_set.csv"))
validation_set = pandas.read_csv(os.path.join(DATA_PATH, "/processed_data/validation_set.csv"))
testing_set = pandas.read_csv(os.path.join(DATA_PATH, "/processed_data/testing_set.csv"))

FileNotFoundError: [Errno 2] No such file or directory: '/processed_data/training_set.csv'

Converting text datasets to TF / IDF Vectors

In [None]:
training_set = text.TFIDFVectorizedDataset(text_data=training_set).get_vectorized_df()
validation_set = text.TFIDFVectorizedDataset(text_data=validation_set).get_vectorized_df()
testing_set = text.TFIDFVectorizedDataset(text_data=testing_set).get_vectorized_df()

Choosing the best baseline model

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer
import numpy

In [None]:
models = {
    "lg": LogisticRegression(),
    "mnb": MultinomialNB(),
    "svm": LinearSVC(),
}

hyperparams = {
    "lg": {
        'penalty' : ['l1', 'l2', 'elasticnet', 'none'], # type of regularization
        'C' : numpy.logspace(-4, 4, 20),  # C parameter
        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
        'max_iter' : [100, 1000,2500, 5000] # maximum number of iterations
    },
    "mnb": {
        'alpha': [0.1, 0.5, 1.0, 1.5], # alpha regularization parameter
        'fit_prior': [True, False]
    },
    "svm": {
        'C': 1.0,  # Regularization strength
        'loss': 'squared_hinge',  # Loss function
        'dual': [True, False],  # Whether to solve the dual or primal problem
        'fit_intercept': [True, False],  # Whether to calculate the intercept
        'max_iter': 1000,  # Maximum number of iterations for the solver
    }
}

Feature Selection using Training Set

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import recall_score

metric = make_scorer(recall_score(average='macro'), greater_is_better=True)
feature_importances = {}

X_train = training_set.drop(columns=['category'])
Y_train = training_set['category']

for model_name, model in models.items():
    # Computing Feature importances using Recursive Feature Elimination (RFE)
    cv_results = RFECV(
        estimator=model,
        step=2,
        min_features_to_select=4,
        cv=StratifiedKFold(n_splits=5),
        n_jobs=-1,
        scoring=metric,
    )
    cv_results.fit(X_train, Y_train)

    # storing output important features 
    feature_importances[model_name] = {
        'important_features': cv_results.cv_results_
    }

feature_importances

Hyperparameter Tuning using Validation Set

In [None]:
output = {}

for model_name, model in models.items():
    score, best_model = train_model.fine_tune_model(
        k_cross=5,
        training_set=validation_set,
        target_variable="category",
        hyperparams=hyperparams[model_name],
        model=model,
        loss_function_or_scorer_metric=metric,
    )
    output[model_name] = {
        "best_model": best_model,
        "best_score": score
    }

Visualizing output and choosing best baseline model

In [None]:
output

Picking the best model, based on a given score from the HP output

In [None]:
chosen_model = sorted(output, lambda model: model['best_score'], reverse=True)[0]

Testing model using cross-validation on Testing set

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

X, Y = testing_set.drop(columns=['category']), testing_set['category']
cv = cross_validate(
    estimator=chosen_model,
    scoring=metric,
    X=testing_set,
    cv=StratifiedKFold(n_splits=5, shuffle=True)
)
print('test metric score: %s' % cv['test_score'])

Estimating model performance according to baseline metrics

In [None]:
print('meets expected metric: ', metrics.AVERAGED_WEIGHTED_RECALL <= cv['test_score'])

Saving baseline model

In [None]:
pickle.dump(chosen_model, open('../models/baseline_classifier.pkl', mode='wb'))