In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.ensemble import GradientBoostingClassifier

columns = ['ticket_id', 'city', 'state', 'violation_code', 'violation_description', 
          'disposition', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount', 'clean_up_cost',
          'judgment_amount', 'payment_amount', 'balance_due', 'payment_date', 'payment_status',
          'collection_status', 'grafitti_status', 'compliance_detail', 'compliance']
dtypes = ['int', 'object', 'object', 'object', 'object', 'object', 'float64', 'float64', 'float64',
         'float64', 'float64', 'float64', 'float64', 'float64', 'object', 'object', 'object',
         'object', 'object', 'float64']



df_train = pd.read_csv('./train.csv', index_col ='ticket_id', usecols=columns, dtype=dict(zip(columns, dtypes)))
df_train = df_train[~np.isnan(df_train['compliance'])]


target = 'compliance'
features = ['admin_fee', 'state_fee', 'late_fee', 'discount_amount',
            'clean_up_cost', 'judgment_amount', 'state']

df_train = df_train[features+[target]]
df_train = pd.get_dummies(df_train, columns=['state'])

y = df_train[target]
X = df_train.drop(target, axis='columns')

X_train, X_test, y_train, y_test = train_test_split(X, y)

gb = GradientBoostingClassifier()

param = {
    'learning_rate' : np.logspace(-3, 1),
    'loss' : ['deviance', 'exponential'],
    'n_estimators' : list(map(int, np.geomspace(1, 100, 5)))
}

gs = GridSearchCV(gb, param_grid=param, scoring='roc_auc', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
gs.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 27.8min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 35.5min finished


GradientBoostingClassifier(learning_rate=0.49417133613238334, n_estimators=31)

In [3]:
gs.best_score_

0.7608303881966041

In [5]:
from sklearn.metrics import roc_auc_score
gb = GradientBoostingClassifier(learning_rate=0.49417133613238334, n_estimators=31).fit(X_train, y_train)
roc_auc_score(y_test, gb.predict(X_test))

0.5565947172819978