## Gradient Booster Classifier with Grid Search

This assignment is based on a data challenge from the Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)). All data for this assignment has been provided through the [Detroit Open Data Portal](https://data.detroitmi.gov/), and was collated into two data files for use in training and validating the model: `train.csv` and `test.csv`.

This notebook demonstrates using `GradientBoostingClassifier` and `GridSearchCV` from `sklearn`. The grid search is used to find appropriate hyperparameters for the final model.

In [1]:
import pandas as pd
import numpy as np

# Columns present only in train.csv; they are dropped so that the training
# features match what is available in test.csv.
_TRAIN_ONLY_COLUMNS = [
    'payment_amount', 'payment_date', 'payment_status', 'balance_due',
    'collection_status', 'compliance_detail',
]

# Columns removed from both data sets before modelling.
_UNUSED_FEATURE_COLUMNS = [
    # Judged intuitively too variable to carry information.
    'ticket_id', 'inspector_name', 'violator_name', 'violation_street_number',
    'violation_street_name', 'violation_zip_code', 'mailing_address_str_number',
    'mailing_address_str_name', 'city', 'state', 'zip_code', 'non_us_str_code',
    'country',
    # Too much variation, or mostly null.
    'violation_code', 'violation_description', 'grafitti_status',
    # Judged intuitively collinear with judgment_amount.
    'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
    'clean_up_cost',
]


def _duration_in_days(issued, hearing):
    """Return the calendar-day gap between two date columns, floored at zero.

    Both inputs are parsed with ``pd.to_datetime`` and truncated to midnight,
    so the difference counts calendar days while staying in the native
    datetime64 / timedelta64 dtypes (the ``.dt.days`` accessor needs those).

    NOTE(review): the CSV files are not visible from this code. No explicit
    ``format`` is passed because a strict ``'%Y-%m-%d'`` format is rejected by
    recent pandas releases when the values also carry a time of day.

    Args:
        issued: Series of ticket-issue dates (strings or datetimes).
        hearing: Series of hearing dates (strings or datetimes), aligned
            with ``issued``.

    Returns:
        Series of day counts in which negative and missing gaps are 0.
    """
    issued_day = pd.to_datetime(issued).dt.normalize()
    hearing_day = pd.to_datetime(hearing).dt.normalize()
    days = (hearing_day - issued_day).dt.days
    # A missing date yields NaN, which fails the ``> 0`` test and becomes 0.
    return days.where(days > 0, other=0)


def _prepare_features(frame):
    """Apply the feature preparation shared by the train and test frames.

    Drops the unused columns, replaces the two date columns with a single
    ``duration`` column and shortens two column names.  The input frame is
    not modified.

    Args:
        frame: DataFrame holding every column in ``_UNUSED_FEATURE_COLUMNS``
            plus ``ticket_issued_date`` and ``hearing_date``.

    Returns:
        A new DataFrame with the prepared features.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    features = frame.drop(columns=_UNUSED_FEATURE_COLUMNS)
    features['duration'] = _duration_in_days(features['ticket_issued_date'],
                                             features['hearing_date'])
    features = features.drop(columns=['ticket_issued_date', 'hearing_date'])
    return features.rename(columns={'agency_name': 'agency',
                                    'judgment_amount': 'judgment'})


def _align_test_columns(test_frame, train_columns, cat_columns, sep="__"):
    """Give the one-hot encoded test frame exactly the training columns.

    Dummy columns that exist only in the test frame are dropped, dummy
    columns that exist only in training are added filled with 0, and the
    result is put in the training column order.  The input frame is not
    modified.

    Args:
        test_frame: One-hot encoded test DataFrame.
        train_columns: Column labels of the one-hot encoded training frame.
        cat_columns: Names of the categorical columns that were encoded.
        sep: Separator placed between the column name and the category.

    Returns:
        A new DataFrame whose columns equal ``train_columns``.

    Raises:
        KeyError: if a non-dummy training column is missing from the test
            frame.
    """
    train_columns = list(train_columns)

    def is_dummy(col):
        return sep in col and col.split(sep)[0] in cat_columns

    train_dummies = [col for col in train_columns if is_dummy(col)]
    known_dummies = set(train_dummies)

    unseen = [col for col in test_frame.columns
              if is_dummy(col) and col not in known_dummies]
    aligned = test_frame.drop(columns=unseen)

    for col in train_dummies:
        if col not in aligned.columns:
            aligned[col] = 0

    return aligned[train_columns]


def gradient_predictor(train_path='train.csv', test_path='test.csv'):
    """Grid-search a gradient boosting classifier and score the test set.

    Reads the training and test CSV files, prepares the features, runs a
    5-fold ``GridSearchCV`` over ``learning_rate`` and ``max_depth`` scored by
    ROC AUC, cross-validates the best estimator and prints the results.

    Args:
        train_path: Path of the training CSV (must contain ``compliance``).
        test_path: Path of the test CSV.

    Returns:
        A fixed message string.  The per-ticket probabilities are built in
        ``bm`` but, as in the notebook version, are not returned.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import cross_validate, GridSearchCV

    # Read these columns as strings in both files so the two frames are
    # parsed the same way (they are dropped before modelling).
    csv_dtypes = {'zip_code': object, 'non_us_str_code': object,
                  'grafitti_status': object}
    X_train = pd.read_csv(train_path, encoding='ISO-8859-1', dtype=csv_dtypes)
    X_test = pd.read_csv(test_path, encoding='ISO-8859-1', dtype=csv_dtypes)

    # Rows without a compliance label cannot be used for training.
    X_train = X_train[X_train['compliance'].notnull()]
    X_train = X_train.drop(columns=_TRAIN_ONLY_COLUMNS)

    # Create the target variable, then prepare both feature frames alike.
    y_train = X_train['compliance']
    X_train = _prepare_features(X_train.drop(columns=['compliance']))

    test_ticket_id = X_test['ticket_id']
    X_test = _prepare_features(X_test)

    # Create categorical variables
    cat_columns = ['agency', 'disposition']
    df_processed = pd.get_dummies(X_train, prefix_sep="__",
                                  columns=cat_columns)
    df_test_processed = pd.get_dummies(X_test, prefix_sep="__",
                                       columns=cat_columns)
    df_test_processed = _align_test_columns(df_test_processed,
                                            df_processed.columns, cat_columns)

    # Validation step to arrive at the right parameters, left here to
    # document the model choice.
    params = {'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
              'max_depth': [2, 3, 4, 5, 6]}
    gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(random_state=0),
                            param_grid=params, scoring='roc_auc', cv=5)

    # Fit the algorithm on the data
    gsearch3.fit(df_processed, y_train)

    print("The best parameters for this model will be: learning rate",  gsearch3.best_params_['learning_rate'],
          "depth", gsearch3.best_params_['max_depth'])
    print("The AUC for this model was {:.2f}".format(gsearch3.best_score_))

    # Rather than a train/test split, do a 5-fold cross validation to
    # evaluate the model.  The objective set by the assignment was to beat 0.75.
    cv_score = cross_validate(gsearch3.best_estimator_, df_processed,
                              y_train, cv=5, scoring='roc_auc')
    print("AUC Cross validated best {:.2f} and worse {:.2f} scores".format(max(cv_score['test_score']), min(cv_score['test_score'])))

    # Probability of the positive class, indexed by ticket id.  The message
    # below indicates the graded version returned this series; here it is
    # built but intentionally not returned.
    y_hat = gsearch3.predict_proba(df_test_processed)[:, 1]
    bm = pd.Series(data=y_hat, index=test_ticket_id)
    return "The variable bm was returned for automated grading"


In [2]:
# Run the whole pipeline; it prints the grid-search and cross-validation AUC summaries.
gradient_predictor()

The best parameters for this model will be: learning rate 0.15 depth 4
The AUC for this model was 0.79
AUC Cross validated best 0.84 and worse 0.76 scores


'The variable bm was returned for automated grading'

The grading output was:  
`Your AUC of 0.800840003561 was awarded`