In [1]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where the file exists at exactly this location.
df = pd.read_csv(filepath_or_buffer=r"C:\Users\HP\Desktop\archive\data.csv")


# Exploring and Wrangling the data

In [None]:
# Quick look at the data: the cell displays the (rows, columns) tuple.
# Optional: preview the first rows
# df.head()
df.shape

# Optional: count missing values per column
# df.isnull().sum()

# Optional: summary statistics for every column, numeric or not
# df.describe(include="all")

The dataset has 43,405 rows and 66 columns, covering five years' worth of data. We use only the data from the 5th year for our model.

In [None]:
# Keep only the rows recorded for year 5, then discard the "year" column,
# which holds a single value once the filter has been applied.
is_year_five = df["year"] == 5
df = df.loc[is_year_five].drop(columns="year")

It seems that `class` is our target: class 0 means 'not bankrupt' and class 1 means 'bankrupt', so we convert it to a boolean data type.


In [None]:
# Cast the target column to boolean (zero -> False, non-zero -> True),
# then preview the first rows to confirm the new dtype.
class_as_bool = df["class"].astype(bool)
df["class"] = class_as_bool
df.head()

In [None]:
# Label the row index "Company_ID" so the index column has a readable header.
df.index = df.index.rename("Company_ID")
df.head()

We need to check if there are any missing data in the dataset, so we create a Series where the index contains the name of the columns in df and the values are the number of NaNs in each column. We assign the result to nans_by_col. Neither the Series itself nor its index require a name.

In [None]:
# Count missing values per column: isnull() flags each NaN cell and sum()
# totals the flags column by column, giving a Series indexed by column name.
nans_by_col = df.isnull().sum()
print("nans_by_col shape:", nans_by_col.shape)
nans_by_col.head()

In [None]:
# Check whether the data is imbalanced: plot the relative frequency of each
# target value as a bar chart.
class_shares = df["class"].value_counts(normalize=True)
class_shares.plot.bar(
    xlabel="Bankrupt",
    ylabel="Frequency",
    title="Class Balance",
);

In [None]:
# Separate the target vector y (the "class" column) from the feature
# matrix X (every remaining column).
target = "class"
y = df[target]
X = df.drop(columns=target)
print("X shape:", X.shape)
print("y shape:", y.shape)

We divide our dataset into training and test sets using a randomized split. The test set is 20% of our data, and we set random_state to 42.

In [None]:
# Hold out 20% of the rows as a test set. random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

For resampling, we create a new feature matrix X_train_over and target vector y_train_over by performing random over-sampling on the training data. We set random_state to 42.

In [None]:
# Over-sample the training split only; the test split is not passed to the
# sampler. fit_resample returns the resampled features and target as a pair.
over_sampler = RandomOverSampler(random_state=42)
resampled = over_sampler.fit_resample(X_train, y_train)
X_train_over, y_train_over = resampled
print("X_train_over shape:", X_train_over.shape)
X_train_over.head()

We proceed to building the model, starting with a first iteration. We create a classifier `clf` that can be trained on (X_train_over, y_train_over), using an ensemble predictor.

In [None]:
# Pipeline: fill missing values with SimpleImputer (default settings), then
# fit a random forest. random_state is pinned to 42, matching the train/test
# split and the over-sampler, so re-running the notebook grows the same forest
# and reproduces the same scores.
clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42))

Remember while we're doing this that we only want to be looking at the positive class. Here, the positive class is the one where the companies really did go bankrupt

Next we perform cross-validation with our classifier, using the over-sampled training data. We want five folds, so we set cv to 5. We also want to speed up training, so we set n_jobs to -1. We use our CV scores to evaluate different classifiers, choosing the one that gives us the best scores.

In [None]:
# Five-fold cross-validation of the pipeline on the over-sampled training
# data, run in parallel on all available cores. No `scoring` is given, so the
# estimator's default scorer is used.
# NOTE(review): the folds are cut from data that was over-sampled beforehand,
# so duplicated minority rows can fall on both sides of a fold split and the
# scores are likely optimistic; consider resampling inside each fold instead.
cv_scores = cross_val_score(
    estimator=clf,
    X=X_train_over,
    y=y_train_over,
    cv=5,
    n_jobs=-1,
)
print(cv_scores)

We create a dictionary `params` with the range of hyperparameters that we want to evaluate for our classifier. We check the scikit-learn documentation for the predictor for ideas on which hyperparameters to tune.

The classifier we built is a pipeline with multiple steps, so we include the step name in the keys of our params dictionary.

In [None]:
# Hyperparameter grid for the search. Each key is "<pipeline step>__<parameter>".
n_estimators_grid = range(25, 100, 25)  # 25, 50, 75
max_depth_grid = range(10, 50, 10)  # 10, 20, 30, 40
params = {
    "simpleimputer__strategy": ["mean", "median"],
    "randomforestclassifier__n_estimators": n_estimators_grid,
    "randomforestclassifier__max_depth": max_depth_grid,
}
params

Next we create a GridSearchCV named model that includes our classifier and hyperparameter grid. We set cv to 5, n_jobs to -1, and verbose to 1.

In [None]:
# Exhaustive search over `params`, scoring every combination with five-fold
# cross-validation on all available cores; verbose=1 prints progress.
model = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the over-sampled training data.
model.fit(X=X_train_over, y=y_train_over)

We extract the cross-validation results from our model, and load them into a DataFrame named cv_results, to know which set of hyperparameters led to the best performance

In [None]:
# Tabulate the per-candidate search results and show the five best-ranked
# hyperparameter combinations (rank 1 is the best).
cv_results = pd.DataFrame(data=model.cv_results_)
cv_results.sort_values(by="rank_test_score").head(5)

In [None]:
# Display the hyperparameter combination selected by the grid search.
model.best_params_

In [None]:
# Score the fitted search object on the original (not over-sampled) training
# split and on the held-out test split, then report both rounded to 4 places.
acc_train, acc_test = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print("Model Training Accuracy:", round(acc_train, 4))
print("Model Test Accuracy:", round(acc_test, 4))

When dealing with imbalanced data, "good" accuracy scores alone don't tell us much about model performance. Instead of just focusing on what the model got right or wrong, we should examine how its predictions differ across the two classes. To do this, let's plot a confusion matrix that shows how our best model performs on the test set.

In [None]:
# Plot the confusion matrix of the model's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(
    estimator=model,
    X=X_test,
    y=y_test,
);

In [None]:
# Predict on the test split, then build the text classification report
# from the true and predicted labels.
test_predictions = model.predict(X_test)
class_report = classification_report(y_true=y_test, y_pred=test_predictions)
print(class_report)

Creating a horizontal bar chart with the 10 most important features for our model.

In [None]:
# Feature names come from the training columns.
# NOTE(review): assumes the imputer step kept every column in its original
# order, so the names line up with the importances; confirm.
features = X_train_over.columns

# Pull the fitted random forest out of the best pipeline found by the search.
forest = model.best_estimator_.named_steps["randomforestclassifier"]
importances = forest.feature_importances_

# Ascending sort puts the largest importances at the end of the Series.
feat_imp = pd.Series(importances, index=features).sort_values()

# Horizontal bar chart of the 10 largest importances.
ax = feat_imp.tail(10).plot.barh()
ax.set_xlabel("Gini Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance");

In [None]:
# Save the fitted grid-search object to disk as a pickle file so it can be
# reloaded later for predictions. `pickle` is already imported in the
# notebook's first cell, so it is not re-imported here.
with open("model_poland.pkl", "wb") as f:
    pickle.dump(model, f)

In [3]:
# make_predictions comes from a project-local module that is not shown in this
# notebook.
# NOTE(review): presumably it loads the CSV, applies the same wrangling as
# above, and predicts with the pickled model; verify against my_predictor_poland.
from my_predictor_poland import make_predictions
# Generate predictions from the raw CSV using the model saved to disk
y_test_pred = make_predictions(
    data_filepath=r"C:\Users\HP\Desktop\archive\data.csv",  # raw string keeps the backslashes literal
    model_filepath="model_poland.pkl"
)
# The result exposes .shape and .head(); show its size and first entries
print("predictions shape:", y_test_pred.shape)
y_test_pred.head()

predictions shape: (5910,)


Company_ID
37495    False
37496    False
37497    False
37498    False
37499    False
Name: class, dtype: bool

I hope the analysis can help any data science enthusiast