In [1]:
# https://scikit-learn.org/stable/modules/permutation_importance.html

In [2]:
# Misc
# Seed passed as random_state to train_test_split and to the random forest
# hyperparameters, so the split and the model use the same fixed seed.
RANDOM_STATE = 42

In [3]:
import os
import json
import joblib # https://stackoverflow.com/questions/56107259/how-to-save-a-trained-model-by-scikit-learn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
  )

# Data splitting

In [4]:
# Download cleaned_bmarket.csv from Google Drive into the working directory.
# The URL is quoted so the shell cannot interpret `?` as a glob character.
!gdown --fuzzy 'https://drive.google.com/file/d/17qgFjTIUgxsiURgGjVC6WlEFn6Wl-OeJ/view?usp=sharing'

Downloading...
From: https://drive.google.com/uc?id=17qgFjTIUgxsiURgGjVC6WlEFn6Wl-OeJ
To: /content/cleaned_bmarket.csv
  0% 0.00/2.65M [00:00<?, ?B/s]100% 2.65M/2.65M [00:00<00:00, 27.1MB/s]


In [5]:
# Load the CSV saved by the gdown cell (cleaned_bmarket.csv in the working directory).
df = pd.read_csv("./cleaned_bmarket.csv")

In [6]:
# Target is the 'Subscription Status' column; every other column is a predictor.
# Predictors are one-hot encoded with drop_first=True (one level per encoded
# column is dropped).
# TODO: remember to research LabelEncoder as an alternative encoding.
y = df['Subscription Status']
X = pd.get_dummies(df.drop(columns='Subscription Status'), drop_first=True)

In [7]:
# Sanity check: X and y should have the same number of rows.
print(X.shape, y.shape)

(40787, 26) (40787,)


In [8]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Misc

In [9]:
class ModelWrapper:
    """Bundle an estimator with its grid-search settings and result reporting.

    Parameters
    ----------
    title : str
        Name used for the output sub-directory, the pickle file name and the
        confusion-matrix plot title.
    model : callable
        Estimator class (or any factory); it is instantiated immediately as
        ``model(**hyperparameters)``.
    hyperparameters : dict
        Fixed keyword arguments for the estimator constructor.
    param_grid : dict
        Grid handed to ``GridSearchCV`` as ``param_grid``.
    cv : int or cross-validation splitter
        Handed to ``GridSearchCV`` as ``cv``.

    Attributes
    ----------
    best_params, best_model, best_epoch
        ``None`` until ``run_gridsearch`` has completed.
    """

    def __init__(self, title, model, hyperparameters, param_grid, cv):
        self.title = title
        self.model = model(**hyperparameters)
        self.param_grid = param_grid
        self.cv = cv

        # Populated by run_gridsearch().
        self.best_epoch = None
        self.best_params = None
        self.best_model = None

    @staticmethod
    def _resolve_best_epoch(estimator):
        """Return ``n_estimators`` if present, else ``n_iter_``, else ``None``."""
        if hasattr(estimator, 'n_estimators'):
            return estimator.n_estimators
        # Not every estimator exposes n_iter_; fall back to None instead of
        # raising AttributeError after a (potentially very long) search.
        return getattr(estimator, 'n_iter_', None)

    def run_gridsearch(self, X, y, scoring='f1', n_jobs=-1):
        """Run an exhaustive grid search and store the winning configuration.

        Parameters
        ----------
        X, y
            Training features and target, passed straight to ``GridSearchCV.fit``.
        scoring : str or callable, default 'f1'
            Metric used to rank candidates.
            TODO: remember to choose the metric with the class imbalance in mind.
        n_jobs : int, default -1
            Parallelism of the search itself (``GridSearchCV(n_jobs=...)``).

        Side effects
        ------------
        Sets ``best_params``, ``best_model`` and ``best_epoch``.
        """
        gs = GridSearchCV(
            estimator=self.model,
            param_grid=self.param_grid,
            cv=self.cv,
            scoring=scoring,
            n_jobs=n_jobs,
            verbose=2,
            return_train_score=True
        )

        gs.fit(X, y)

        self.best_params = gs.best_params_
        self.best_model = gs.best_estimator_
        self.best_epoch = self._resolve_best_epoch(gs.best_estimator_)

    def write_info_to_disk(self, X, y, folder_path):
        """Persist the best model, its parameters and its evaluation on (X, y).

        Files written under ``<folder_path>/<title>/``:

        * ``<title>.pkl``      - the best estimator, dumped with joblib
        * ``parameters.json``  - the best parameter combination
        * ``cmatrix.png``      - confusion-matrix heatmap
        * ``test_error.csv``   - metrics returned by ``measure_error``

        ``measure_error`` is a module-level function defined in another cell; it
        must exist in the namespace when this method is called.

        Raises
        ------
        RuntimeError
            If called before ``run_gridsearch`` (there is no model to save).
        """
        # Fail before touching the file system rather than pickling ``None``.
        if self.best_model is None:
            raise RuntimeError(
                f"No fitted model for '{self.title}': call run_gridsearch() first."
            )

        save_dir = os.path.join(folder_path, self.title)
        os.makedirs(save_dir, exist_ok=True)

        joblib.dump(self.best_model, os.path.join(save_dir, f"{self.title}.pkl"))

        with open(os.path.join(save_dir, "parameters.json"), 'w') as f:
            json.dump(self.best_params, f, indent=4)

        y_pred = self.best_model.predict(X)
        cm = confusion_matrix(y, y_pred)

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', ax=ax)
            ax.set_title(f'Confusion Matrix: {self.title}')
            # Two hard-coded tick labels: assumes a binary target -- TODO confirm.
            labels = ['False', 'True']
            ax.set_xticklabels(labels)
            ax.set_yticklabels(labels)
            ax.set_ylabel('Actual')
            ax.set_xlabel('Predicted')
            fig.savefig(os.path.join(save_dir, "cmatrix.png"))
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        # Single-column frame named 'test', one row per metric.
        test_error = measure_error(y, y_pred, 'test').to_frame()
        test_error.to_csv(os.path.join(save_dir, "test_error.csv"))

In [10]:
###################
# Hyperparameters #
###################

# Random Forest: {
# Fixed RandomForestClassifier constructor arguments (not part of the search).
rf_params = dict(
    n_jobs=1,
    oob_score=True,
    warm_start=False,
    random_state=RANDOM_STATE,
)

# Values explored by the grid search: 3 * 3 * 3 * 2 * 3 * 3 = 486 combinations.
rf_gs_params = dict(
    max_features=["sqrt", "log2", None],
    min_samples_split=[2, 5, 10],
    max_samples=[0.5, 0.75, None],
    class_weight=["balanced", None],
    max_depth=[10, 20, None],
    n_estimators=[500, 1000, 1500],
)
# }

In [11]:
def measure_error(y_true, y_pred, label):
    """Score one set of predictions and return the metrics as a named Series.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, forwarded unchanged to each scorer.
    label
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by 'accuracy', 'precision', 'recall' and 'f1', in that order.
        Every scorer is called with its default arguments.
    """
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    scores = {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}
    return pd.Series(scores, name=label)

# Ensemble models

### Random Forest

**Optimization Methods**


*   Grid Search
*   Adjusting decision threshold




In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Random forest wrapper: fixed constructor arguments come from rf_params and
# the searched values from rf_gs_params. Both dicts are shallow-copied so the
# wrapper does not hold the module-level objects themselves.
rf_ens = ModelWrapper(
    title="RandomForestClassifier",
    model=RandomForestClassifier,
    hyperparameters=dict(rf_params),
    param_grid=dict(rf_gs_params),
    cv=2,
)

In [14]:
# Wrappers to tune and evaluate; currently only the random forest.
clfs = [rf_ens]

In [15]:
# IMPORTANT: Remember to fix for parallel processing in kedro! ^^

# For each wrapper: run the grid search on the training split, then write the
# best model, its parameters and its test-split evaluation under /content/<title>/.
for model in clfs:
    model.run_gridsearch(X_train, y_train)
    model.write_info_to_disk(X_test, y_test, "/content")

Fitting 2 folds for each of 162 candidates, totalling 324 fits
