In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import sklearn
from dmgpred.train import get_preprocessor
from dmgpred.utils.loading import load_data
from patsy import build_design_matrices, dmatrices
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from statsmodels.discrete.discrete_model import MNLogit

# Have scikit-learn transformers return pandas DataFrames instead of arrays;
# later cells call DataFrame methods (.info(), .columns) on transform() output.
sklearn.set_config(transform_output="pandas")

In [None]:
# Load the dataset (processed=True) from the project data directory and pull
# out the labelled feature table and its targets.
data = load_data(data_dir="../data/", processed=True)
X, y = data["X_train"], data["y_train"]

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Build the preprocessing pipeline for these features and fit it on the
# training split only, so nothing is learned from the hold-out rows.
preprocessor = get_preprocessor(X_train)
preprocessor.fit(X_train, y_train)
preprocessor  # display the fitted preprocessor

In [None]:
# The final pipeline step exposes per-feature scores (`scores_`) -- presumably
# a univariate feature selector; TODO confirm against get_preprocessor.
selector = preprocessor.steps[-1][1]

# Horizontal bar chart of the scores, sorted ascending.
feature_scores = pd.Series(selector.scores_, index=selector.feature_names_in_)
feature_scores.sort_values().plot.barh(figsize=(10, 5))

In [None]:
# Apply the fitted preprocessor to the training split and inspect the
# resulting columns and dtypes.
X_train_ = preprocessor.transform(X_train)
X_train_.info()

## Simple Multinomial Logistic Regression Model


In [None]:
# Additive main-effects formula over every preprocessed feature; patsy adds the
# intercept column automatically.
formula = "damage_grade ~ " + " + ".join(X_train_.columns)

# Design matrices for the training split. return_type="dataframe" keeps the
# pandas index and attaches a `design_info` attribute to each matrix.
endog_train, exog_train = dmatrices(
    formula,
    data=pd.concat([y_train, X_train_], axis=1),
    return_type="dataframe",
)

# Build the hold-out matrices from the *training* design info rather than
# calling dmatrices again. A fresh dmatrices call re-derives the coding of any
# categorical/boolean column from the test data alone, which can yield columns
# that differ from the ones the model was fitted on.
X_test_ = preprocessor.transform(X_test)
endog_test, exog_test = build_design_matrices(
    [endog_train.design_info, exog_train.design_info],
    pd.concat([y_test, X_test_], axis=1),
    return_type="dataframe",
)

In [None]:
# Inspect the training design matrix built from the formula.
exog_train

In [None]:
# Multinomial logit of damage_grade on the training design matrix, fitted with
# the default optimiser settings.
mnlogit_model = MNLogit(endog_train, exog_train)
res = mnlogit_model.fit()

In [None]:
# Coefficient table and fit statistics for the fitted model.
res.summary()

In [None]:
# Predicted class probabilities: one row per hold-out sample, one column per
# class. The columns follow the order of the sorted distinct training labels,
# but they are not guaranteed to be *named* after them (they may simply be
# 0..K-1). Map the arg-max position back to an actual damage_grade value so the
# predictions live in the same label space as the true targets.
class_probs = res.predict(exog_test)
class_labels = sorted(endog_train["damage_grade"].unique())
y_pred = pd.Series(
    [class_labels[pos] for pos in class_probs.to_numpy().argmax(axis=1)],
    index=class_probs.index,
)
y_pred

In [None]:
# Matthews correlation coefficient between true and predicted damage grades on
# the hold-out split.
matthews_corrcoef(endog_test["damage_grade"], y_pred)

In [None]:
# Confusion matrix of true vs. predicted damage grades on the hold-out split.
ConfusionMatrixDisplay.from_predictions(endog_test["damage_grade"], y_pred)

In [None]:
# Per-class precision, recall and F1 on the hold-out split.
# classification_report returns a plain string, hence the explicit print().
print(classification_report(endog_test["damage_grade"], y_pred))