# Random Forest
## Predicting bankruptcy

In [None]:
import gzip
import json
import pickle

import ipywidgets as widgets
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from ipywidgets import interact
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from teaching_tools.widgets import ConfusionMatrixWidget

In [None]:
# Read the gzip-compressed JSON export and decode it into a Python object
with gzip.open("data/taiwan-bankruptcy-data.json.gz", "r") as f:
    taiwan_data = json.loads(f.read())

# Show which Python type the JSON document was decoded into
print(type(taiwan_data))

In [None]:
# Collect the top-level keys of the loaded data and print them,
# to see how the file is organised before digging into it
taiwan_data_keys = taiwan_data.keys()
print(taiwan_data_keys)

In [None]:
# Count the companies: the length of the list stored under "observations"
observations = taiwan_data["observations"]
n_companies = len(observations)
print(n_companies)

In [None]:
# Count the fields recorded per company, using the first observation as the sample.
# NOTE(review): this counts every key of the record; wrangle() uses "id" as the index
# and "bankrupt" is later used as the target, so the count presumably includes both - confirm.
first_observation = taiwan_data["observations"][0]
n_features = len(first_observation)
print(n_features)

In [None]:
# Wrangle function to load the compressed dataset into a DataFrame
def wrangle(filename):
    """Load a gzip-compressed JSON dataset into a DataFrame indexed by ``id``.

    Parameters
    ----------
    filename : str or path-like
        Path to a gzip-compressed JSON file whose top-level object holds the
        records under the ``"observations"`` key; each record needs an
        ``"id"`` field.

    Returns
    -------
    pandas.DataFrame
        One row per observation, with the ``"id"`` column used as the index.

    Raises
    ------
    KeyError
        If the ``"observations"`` key or the ``"id"`` column is missing.
    """
    # Open compressed file, load into dictionary
    with gzip.open(filename, "r") as f:
        data = json.load(f)

    # Call the classmethod on the class itself: ``pd.DataFrame().from_dict``
    # built an empty throwaway DataFrame just to reach the same constructor.
    df = pd.DataFrame.from_dict(data["observations"]).set_index("id")

    return df

In [None]:
# Build the working DataFrame from the compressed file, report its size and preview it
df = wrangle(filename="data/taiwan-bankruptcy-data.json.gz")
print("df shape:", df.shape)
df.head(n=5)

In [None]:
# Count the missing values in every column
missing_mask = df.isna()
nans_by_col = missing_mask.sum()
print("nans_by_col shape:", nans_by_col.shape)
nans_by_col.head(n=5)

In [None]:
# Plot the relative frequency of each target class to check whether they are balanced
class_shares = df["bankrupt"].value_counts(normalize=True)
class_shares.plot(
    kind="bar",
    xlabel="Bankrupt",
    ylabel="Frequency",
    title="Class Balance",
)
# As the plot shows, the "False" class (companies that are not bankrupt) is far larger than the "True" class

In [None]:
# Vertical split: pull out the target column, keep every other column as features
target = "bankrupt"
y = df[target]
X = df.drop(columns=[target])
print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Horizontal split: hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y; with a heavily imbalanced target,
# consider whether stratify=y is wanted - confirm before changing, it alters every score below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Rebalance the training split with a random over-sampler (seeded), leaving the test split untouched
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X=X_train, y=y_train)
print("X_train_over shape:", X_train_over.shape)
X_train_over.head(n=5)

In [None]:
# Create a random forest classifier (seeded for reproducibility) and fit it on the oversampled training data.
# NOTE(review): cross_val_score and GridSearchCV below are documented to fit clones of the estimator,
# so this fit is presumably not reused by those cells - confirm whether it is still needed.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_over, y_train_over)

In [None]:
# Estimate the forest's accuracy with 5-fold cross-validation, using all available cores.
# NOTE(review): the folds are drawn from the already-oversampled data, so copies of the same
# minority row can presumably land in both the training and validation folds and inflate
# these scores - consider resampling inside each fold instead.
cv_scores = cross_val_score(
    estimator=clf,
    X=X_train_over,
    y=y_train_over,
    cv=5,
    n_jobs=-1,
)
print(cv_scores)

In [None]:
# Hyperparameter grid for the forest: number of trees (25, 50, 75)
# and maximum tree depth (10, 20, 30, 40)
params = dict(
    n_estimators=range(25, 100, 25),
    max_depth=range(10, 50, 10),
)
params

In [None]:
# Wrap the classifier and the hyperparameter grid in a grid search:
# 5-fold cross-validation per combination, all cores, progress messages enabled
model = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
model

In [None]:
# Run the grid search on the oversampled training data; this fits the forest
# for every hyperparameter combination in the grid across the cross-validation folds
model.fit(X_train_over, y_train_over)

In [None]:
# Tabulate the grid-search results, best-ranked hyperparameter sets first, and show the top five
raw_results = pd.DataFrame(model.cv_results_)
cv_results = raw_results.sort_values(by="rank_test_score")
cv_results.head(n=5)

In [None]:
# Retrieve the hyperparameter set that the grid search selected as best
best_params = model.best_params_
print(best_params)

In [None]:
# Score the tuned model on the original (non-oversampled) training split
# and on the held-out test split
acc_train, acc_test = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print("Model Training Accuracy:", round(acc_train, 4))
print("Model Test Accuracy:", round(acc_test, 4))
# The author's original run reported a test accuracy of 97.64%

In [None]:
# Plot the confusion matrix of the tuned model on the test split.
# The trailing semicolon keeps the notebook from echoing the returned display object.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

In [None]:
# With classes this imbalanced, a similar accuracy can be reached by simply predicting
# "False" (no bankruptcy) for every company, so accuracy alone says little.
# Depending on how the model will be used, other scores matter more (mainly precision
# or recall). The report below lists them per class, so the user can judge the model by
# the score that fits the use case. (The author's original note adds that the model
# also beats that baseline.)
test_predictions = model.predict(X_test)
class_report = classification_report(y_true=y_test, y_pred=test_predictions)
print(class_report)

In [None]:
# Pair each feature with the importance assigned by the best forest,
# sort ascending, and plot the ten most important ones as horizontal bars
features = X_train_over.columns
importances = model.best_estimator_.feature_importances_
feat_imp = pd.Series(data=importances, index=features).sort_values(ascending=True)
top_ten = feat_imp.tail(n=10)
top_ten.plot(kind="barh")
plt.ylabel("Feature")
plt.xlabel("Gini Importance")
plt.title("Feature Importance")

In [None]:
# Finally, save the fitted grid-search model to disk with pickle
# so it can be reloaded later without retraining
with open("model-5-5.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
# To predict on another dataset similar to the one used here, the saved model can be
# applied through the project's helper function.
# NOTE(review): make_predictions is defined outside this notebook; presumably it loads
# the pickled model and returns a pandas object - confirm against my_predictor_assignment.
from my_predictor_assignment import make_predictions

# Generate predictions from the saved model file and the test-feature file
y_test_pred = make_predictions(
    model_filepath="model-5-5.pkl",
    data_filepath="data/taiwan-bankruptcy-data-test-features.json.gz",
)

print("predictions shape:", y_test_pred.shape)
y_test_pred.head()