In [2]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from math import exp

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [3]:
train_data = pd.read_csv("train_data.csv")
y_train = train_data["label"]
train_data.drop(["label"], axis=1, inplace=True)

test_data = pd.read_csv("test_data.csv")
y_test = test_data["label"]
test_data.drop(["label"], axis=1, inplace=True)

print(train_data.shape, y_train.shape, test_data.shape, y_test.shape)

(822, 21) (822,) (178, 21) (178,)


In [4]:
one_hot_cols = ["insured_relationship", "authorities_contacted"]
scaler_cols = list(set(train_data.columns) - set(one_hot_cols))

ct = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(), one_hot_cols),
        ("scaler", StandardScaler(), scaler_cols)
    ]
)

X_train = ct.fit_transform(train_data)
feature_names = ct.get_feature_names_out(train_data.columns)
X_test = ct.transform(test_data)

In [5]:
def run_metrics(model):
    train_preds = model.predict(X_train)
    print("Train metrics")
    for metric in [accuracy_score, roc_auc_score, f1_score]:
        print(round(metric(y_train, train_preds), 3))

    test_preds = model.predict(X_test)
    print("\nTest metrics")
    for metric in [accuracy_score, roc_auc_score, f1_score]:
        print(round(metric(y_test, test_preds), 3))

In [7]:
#class_weights = compute_class_weight(class_weight="balanced", classes=[0, 1], y=y_train)
lr = LogisticRegression(penalty="l2", class_weight="balanced", C=100, solver="liblinear")
lr.fit(X_train, y_train)
importances = [(feature, exp(weight)) for feature, weight in zip(feature_names, lr.coef_[0])]
sorted(importances, key=lambda y: -y[1])[:10]

[('one_hot__authorities_contacted_Other', 1.606952331361755),
 ('scaler__deaths_per_100k', 1.4526666153813133),
 ('one_hot__insured_relationship_other-relative', 1.3386587190761983),
 ('one_hot__authorities_contacted_Ambulance', 1.3352140845475922),
 ('scaler__rank_incident_severity', 1.2458386670796933),
 ('scaler__non_violent_rate', 1.2426260147123753),
 ('one_hot__authorities_contacted_Fire', 1.2084795622922733),
 ('one_hot__insured_relationship_not-in-family', 1.1841325847608917),
 ('scaler__umbrella_limit', 1.161651257600443),
 ('one_hot__insured_relationship_wife', 1.1397528257211884)]

In [8]:
run_metrics(lr)

Train metrics
0.619
0.636
0.463

Test metrics
0.539
0.574
0.414


In [10]:
# Consider undoing one-hot encoding for decision tree
dt = DecisionTreeClassifier(class_weight="balanced", max_depth=3)
dt.fit(X_train, y_train)
importances = {feature: importance for feature, importance in zip(feature_names, dt.feature_importances_)}
sorted(list([x for x in importances.items() if x[1] > 0]), key=lambda y: -y[1])

[('scaler__rank_incident_severity', 0.8692569392767289),
 ('scaler__months_as_customer', 0.05572584401480137),
 ('scaler__violent_rate', 0.03394967631705321),
 ('scaler__avg_car_price', 0.022083386309404696),
 ('scaler__claim_capital_percent', 0.01898415408201161)]

In [121]:
# from sklearn.feature_selection import RFE, SequentialFeatureSelector
# selector = SequentialFeatureSelector(lr)
# selector = RFE(lr, n_features_to_select=10, step=1)
# selector.fit(X_train, y_train)
# print(selector.support_)

NameError: name 'dt' is not defined