In [12]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
#from imblearn.over_sampling import SMOTE
from math import exp

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix

## Feature selection by correlation

In [2]:
df = pd.read_csv("train_data.csv").rename({"deaths_per_100mil_vehicle_miles": "deaths_per_100mil_miles"}, axis=1)
df.corr(numeric_only=True)["label"].sort_values(key=lambda x: abs(x), ascending=False)

label                           1.000000
rank_incident_severity          0.119142
non_violent_rate                0.096296
number_of_vehicles_involved     0.078499
violent_rate                    0.072924
in_state                        0.070108
deaths_per_100k                 0.055592
rank_insured_education_level    0.050458
policy_amt                     -0.049461
umbrella_limit                  0.049313
csl_amt                        -0.048822
incident_hour_of_the_day        0.040186
deaths_per_100mil_miles         0.036632
witnesses                       0.035485
bodily_injuries                 0.028191
capital_gains                  -0.024975
avg_car_price                   0.013819
capital_loss                   -0.013543
claim_capital_percent           0.003998
months_as_customer             -0.003483
Name: label, dtype: float64

In [3]:
# For my thid party feature, see if they are self correlated

x = df.corr(numeric_only=True)
columns_for_corr = ["policy_amt", "csl_amt", "violent_rate", "non_violent_rate", "deaths_per_100k", "deaths_per_100mil_miles", "label"]
x.where((x > 0.5) & (x < 1)).loc[columns_for_corr, columns_for_corr].round(2)

Unnamed: 0,policy_amt,csl_amt,violent_rate,non_violent_rate,deaths_per_100k,deaths_per_100mil_miles,label
policy_amt,,0.99,,,,,
csl_amt,0.99,,,,,,
violent_rate,,,,0.85,0.7,0.72,
non_violent_rate,,,0.85,,0.83,0.73,
deaths_per_100k,,,0.7,0.83,,0.96,
deaths_per_100mil_miles,,,0.72,0.73,0.96,,
label,,,,,,,


In [4]:
# Because many are, only select the ones more correlated in abs to the label
# ex. policy_amt and csl_amt are highly correlated and policy_amt is higher corr to take it

x.loc[:, ["label"]].sort_values("label", ascending=False, key=lambda x: abs(x))

Unnamed: 0,label
label,1.0
rank_incident_severity,0.119142
non_violent_rate,0.096296
number_of_vehicles_involved,0.078499
violent_rate,0.072924
in_state,0.070108
deaths_per_100k,0.055592
rank_insured_education_level,0.050458
policy_amt,-0.049461
umbrella_limit,0.049313


In [3]:
train_data = pd.read_csv("train_data.csv")
y_train = train_data["label"]
train_data.drop(["label"], axis=1, inplace=True)

test_data = pd.read_csv("test_data.csv")
y_test = test_data["label"]
test_data.drop(["label"], axis=1, inplace=True)

drop_by_corr = False
if drop_by_corr:
    for df in [train_data, test_data]:
        df.drop(["csl_amt", "deaths_per_100k", "violent_rate"], axis=1, inplace=True)

print(train_data.shape, y_train.shape, test_data.shape, y_test.shape)

(822, 21) (822,) (178, 21) (178,)


## Runs without dropping by corr

In [4]:
one_hot_cols = ["insured_relationship", "authorities_contacted"]
scaler_cols = list(set(train_data.columns) - set(one_hot_cols))

ct = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(), one_hot_cols),
        ("scaler", StandardScaler(), scaler_cols)
    ]
)

X_train = ct.fit_transform(train_data)
feature_names = ct.get_feature_names_out(train_data.columns)
X_test = ct.transform(test_data)

In [5]:
def run_metrics(model, train=X_train, test=X_test):
    train_preds = model.predict(train)
    print("Train metrics")
    for name, metric in zip(['accuracy', 'auroc', 'f1'], [accuracy_score, roc_auc_score, f1_score]):
        print(name, round(metric(y_train, train_preds), 3))

    test_preds = model.predict(test)
    print("\nTest metrics")
    for name, metric in zip(['accuracy', 'auroc', 'f1'], [accuracy_score, roc_auc_score, f1_score]):
        print(name, round(metric(y_test, test_preds), 3))

In [37]:
#class_weights = compute_class_weight(class_weight="balanced", classes=[0, 1], y=y_train)
lr = LogisticRegression(penalty="l2", class_weight="balanced", C=1, solver="liblinear")
lr.fit(X_train, y_train)
importances = [(feature, round(exp(weight), 3)) for feature, weight in zip(feature_names, lr.coef_[0])]
#sorted(importances, key=lambda y: -y[1])[:10]

run_metrics(lr)

Train metrics
accuracy 0.617
auroc 0.631
f1 0.458

Test metrics
accuracy 0.545
auroc 0.571
f1 0.409


In [39]:
# Consider undoing one-hot encoding for decision tree
dt = DecisionTreeClassifier(max_depth=5, class_weight="balanced", random_state=653)
dt.fit(X_train, y_train)
importances = {feature: importance for feature, importance in zip(feature_names, dt.feature_importances_)}
sorted_importances = sorted(list([x for x in importances.items() if x[1] > 0]), key=lambda y: -y[1])
#sorted_importances
run_metrics(dt)

Train metrics
accuracy 0.837
auroc 0.825
f1 0.707

Test metrics
accuracy 0.747
auroc 0.676
f1 0.516


### Retry using importance and fewer features

In [6]:
# Get down features
#scaler_important_features = [feature[0].split("__")[1] for feature in sorted_importances if not feature[0].startswith("one_hot")][:7]
scaler_important_features = ["rank_incident_severity", "months_as_customer", "incident_hour_of_the_day", "witnesses", "violent_rate"]
cat_important_features = ["authorities_contacted", "insured_relationship"] # they appear in the list
train_data_importance = train_data[scaler_important_features + cat_important_features]
train_data_importance.columns

Index(['rank_incident_severity', 'months_as_customer',
       'incident_hour_of_the_day', 'witnesses', 'violent_rate',
       'authorities_contacted', 'insured_relationship'],
      dtype='object')

In [7]:
ct_importance = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(), cat_important_features), # don't need to encode for decision tree
        ("scaler", StandardScaler(), scaler_important_features)
    ]
)

X_train_importance = ct_importance.fit_transform(train_data_importance)
feature_names_importance = ct_importance.get_feature_names_out(train_data_importance.columns)

test_data_importance = test_data[scaler_important_features + cat_important_features]
X_test_importance = ct_importance.transform(test_data_importance)

X_test_importance.shape, X_test_importance.shape

((178, 16), (178, 16))

In [48]:
# import numpy as np
# np.save("importance_train_data.npy", X_train_importance)
# np.save("importance_test_data.npy", X_test_importance)

In [53]:
# Shows a benefit for logistic regression, train and test are close

lr_importance = LogisticRegression(penalty="l1", class_weight="balanced", C=0.1, solver="liblinear")
lr_importance.fit(X_train_importance, y_train)
run_metrics(lr_importance, X_train_importance, X_test_importance)

Train metrics
accuracy 0.586
auroc 0.616
f1 0.444

Test metrics
accuracy 0.573
auroc 0.582
f1 0.415


In [9]:
# Still better

dt_importance = DecisionTreeClassifier(max_depth=2, class_weight="balanced", random_state=653)
dt_importance.fit(X_train_importance, y_train)
run_metrics(dt_importance, X_train_importance, X_test_importance)

Train metrics
accuracy 0.81
auroc 0.769
f1 0.641

Test metrics
accuracy 0.815
auroc 0.751
f1 0.629


In [54]:
# from sklearn.feature_selection import RFE, SequentialFeatureSelector
# selector = SequentialFeatureSelector(lr)
# selector = RFE(lr, n_features_to_select=10, step=1)
# selector.fit(X_train, y_train)
# print(selector.support_)

## Random Forest

In [None]:
# rf = RandomForestClassifier(bootstrap=True, class_weight="balanced", random_state=653)
# distributions=dict(max_depth=[2, 3, 4, 5], max_features=["sqrt", 5, None]) 
# clf = GridSearchCV(rf, distributions, scoring=f1_score, verbose=0)
# search = clf.fit(X_train, y_train)
# search.best_params_

In [10]:
rf = RandomForestClassifier(max_depth=4, max_features=5, bootstrap=True, class_weight="balanced", random_state=653)
rf.fit(X_train_importance, y_train)
run_metrics(rf, X_train_importance, X_test_importance)

Train metrics
accuracy 0.82
auroc 0.792
f1 0.668

Test metrics
accuracy 0.815
auroc 0.758
f1 0.637



In [110]:
# 4 and 5 got the best score

for depth in [2, 3, 4, 5]:
    for num_f in ["sqrt", 5, None]:
         print(depth, num_f)
         print("---------")
         rf = RandomForestClassifier(max_depth=depth, max_features=num_f, bootstrap=True, class_weight="balanced", random_state=653)
         rf.fit(X_train_importance, y_train)
         run_metrics(rf, X_train_importance, X_test_importance)
         print()

2 sqrt
---------
Train metrics
accuracy 0.617
auroc 0.691
f1 0.518

Test metrics
accuracy 0.618
auroc 0.663
f1 0.5

2 5
---------
Train metrics
accuracy 0.605
auroc 0.683
f1 0.51

Test metrics
accuracy 0.618
auroc 0.663
f1 0.5

2 None
---------
Train metrics
accuracy 0.81
auroc 0.769
f1 0.641

Test metrics
accuracy 0.815
auroc 0.751
f1 0.629

3 sqrt
---------
Train metrics
accuracy 0.663
auroc 0.718
f1 0.547

Test metrics
accuracy 0.629
auroc 0.664
f1 0.5

3 5
---------
Train metrics
accuracy 0.707
auroc 0.741
f1 0.575

Test metrics
accuracy 0.691
auroc 0.698
f1 0.538

3 None
---------
Train metrics
accuracy 0.814
auroc 0.776
f1 0.65

Test metrics
accuracy 0.809
auroc 0.747
f1 0.622

4 sqrt
---------
Train metrics
accuracy 0.741
auroc 0.761
f1 0.603

Test metrics
accuracy 0.73
auroc 0.717
f1 0.564

4 5
---------
Train metrics
accuracy 0.82
auroc 0.792
f1 0.668

Test metrics
accuracy 0.815
auroc 0.758
f1 0.637

4 None
---------
Train metrics
accuracy 0.822
auroc 0.787
f1 0.665

Test met

In [19]:
y_pred = rf.predict(X_test_importance)
confusion_matrix(y_test, y_pred, labels=rf.classes_, normalize='true')

array([[0.87218045, 0.12781955],
       [0.35555556, 0.64444444]])

## Runs with dropping by corr

In [None]:
train_data = pd.read_csv("train_data.csv")
y_train = train_data["label"]
train_data.drop(["label"], axis=1, inplace=True)

test_data = pd.read_csv("test_data.csv")
y_test = test_data["label"]
test_data.drop(["label"], axis=1, inplace=True)

drop_by_corr = True
if drop_by_corr:
    for df in [train_data, test_data]:
        df.drop(["csl_amt", "deaths_per_100k", "violent_rate"], axis=1, inplace=True)

print(train_data.shape, y_train.shape, test_data.shape, y_test.shape)

In [56]:
one_hot_cols = ["insured_relationship", "authorities_contacted"]
scaler_cols = list(set(train_data.columns) - set(one_hot_cols))

ct = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(), one_hot_cols),
        ("scaler", StandardScaler(), scaler_cols)
    ]
)

X_train = ct.fit_transform(train_data)
feature_names = ct.get_feature_names_out(train_data.columns)
X_test = ct.transform(test_data)

In [71]:
#class_weights = compute_class_weight(class_weight="balanced", classes=[0, 1], y=y_train)
lr = LogisticRegression(penalty="l1", class_weight="balanced", C=10, solver="liblinear")
lr.fit(X_train, y_train)
importances = [(feature, round(exp(weight), 3)) for feature, weight in zip(feature_names, lr.coef_[0])]
sorted(importances, key=lambda y: -y[1])[:10]

[('scaler__non_violent_rate', 1.469),
 ('one_hot__authorities_contacted_Other', 1.364),
 ('one_hot__insured_relationship_other-relative', 1.288),
 ('scaler__rank_incident_severity', 1.245),
 ('scaler__umbrella_limit', 1.162),
 ('one_hot__authorities_contacted_Ambulance', 1.144),
 ('one_hot__insured_relationship_not-in-family', 1.141),
 ('scaler__rank_insured_education_level', 1.106),
 ('scaler__in_state', 1.105),
 ('scaler__witnesses', 1.103)]

In [72]:
run_metrics(lr)

Train metrics
accuracy 0.616
auroc 0.63
f1 0.457

Test metrics
accuracy 0.556
auroc 0.585
f1 0.423


In [65]:
# Consider undoing one-hot encoding for decision tree
dt = DecisionTreeClassifier(class_weight="balanced", max_depth=5)
dt.fit(X_train, y_train)
importances = {feature: importance for feature, importance in zip(feature_names, dt.feature_importances_)}
sorted(list([x for x in importances.items() if x[1] > 0]), key=lambda y: -y[1])

[('scaler__rank_incident_severity', 0.6340786820967172),
 ('scaler__witnesses', 0.0563592792156414),
 ('scaler__avg_car_price', 0.04995636508754498),
 ('scaler__capital_gains', 0.04614315130601874),
 ('scaler__deaths_per_100mil_vehicle_miles', 0.045277065273033636),
 ('scaler__umbrella_limit', 0.038127125255876505),
 ('scaler__claim_capital_percent', 0.03627865915872157),
 ('scaler__months_as_customer', 0.031300575903479004),
 ('scaler__incident_hour_of_the_day', 0.02867029332903156),
 ('scaler__non_violent_rate', 0.022622122801992783),
 ('scaler__rank_insured_education_level', 0.011186680571942739)]

In [66]:
run_metrics(dt)

Train metrics
accuracy 0.82
auroc 0.821
f1 0.692

Test metrics
accuracy 0.747
auroc 0.684
f1 0.526


## Imbalanced learning