In [1]:
%matplotlib inline

In [61]:
# load libraries & presets
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)

sns.set_theme(style="darkgrid")
sns.set(rc={'figure.figsize':(10, 6)})


In [3]:
# load datasets
train_data = pd.read_csv("train.csv", sep=",", encoding='cp1252')
test_data = pd.read_csv("test.csv", sep=",", encoding='cp1252')
latlong_data = pd.read_csv("latlons.csv", sep=",", encoding='cp1252')
addresses_data = pd.read_csv("addresses.csv", sep=",", encoding='cp1252')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# clean input data
def clean_data(df):
    
    # set index
    df = df.set_index('ticket_id')
    
    # fix misformatted columns (floats to str)
    df['violation_street_number'] = df['violation_street_number'].astype('str')
    df['mailing_address_str_number'] = df['mailing_address_str_number'].astype('str')
    
    # remove columns not available in the test data set
    leaked_cols = df.columns.difference(test_data.columns)
    
    if not leaked_cols.empty:
        leaked_cols = leaked_cols.to_list()
        leaked_cols.remove('compliance')
        df = df.drop(leaked_cols, axis=1)
    
    # remove 'not responsible' rows
    if 'compliance' in df.columns:
        df['compliance'] = df['compliance'].astype('str')
        df = df[df["compliance"] != "nan"]
        
    
    # remove single-level variables (no information)
    df = df.drop(['violation_zip_code', 'grafitti_status', 'clean_up_cost', 'state_fee', 'admin_fee',
                  'non_us_str_code'], axis=1)
    
    # clean & consolidate street address data
    
    # convert address floats to strings
    df['violation_street_number'] = df['violation_street_number'].astype('str')
    df['mailing_address_str_number'] = df['mailing_address_str_number'].astype('str')
    
    # create consolidated columns
    df['mailing_address'] = df['mailing_address_str_number'] + ' ' + df['mailing_address_str_name']
    df['mailing_address'] = df['mailing_address'].str.translate(str.maketrans('', '', string.punctuation))
    df['violation_address'] = df['violation_street_number'] + ' ' + df['violation_street_name']  
    
    # remove old versions
    df = df.drop(['mailing_address_str_number', 'mailing_address_str_name', 'violation_street_number', 'violation_street_name'], axis=1)
    
    # add geolocation data
    geo_address_data = pd.merge(addresses_data, latlong_data, how="left", on="address").set_index("ticket_id")
    df = pd.merge(df, geo_address_data, left_index=True, right_index=True)
    
    # function to reassign dtypes
    def set_dtypes(vars_list, df, dtype):
        for var in vars_list:
            df[var] = df[var].astype(dtype)
        return(df)

    # set datetime variables 
    date_vars = ['ticket_issued_date','hearing_date']
    df = set_dtypes(date_vars, df, 'datetime64')
    
    # get time interval between hearing and issued dates
    df["days_until_hearing"] = (df["hearing_date"] - df["ticket_issued_date"]).dt.days.astype('float64')

    # change all strings to lowercase
    for col in df: 
        if (df[col].dtype.name == 'object'):
            df[col] = df[col].str.lower()
            df[col] = df[col].str.replace("\.0","")
    
    # set category variables
    cat_vars = ['agency_name', 'disposition']
    df = set_dtypes(cat_vars, df, 'category')
    
    # return cleaned data
    return df

# test with both train and test data ... 

train_cleaned = clean_data(train_data)
test_cleaned = clean_data(test_data)

  df[col] = df[col].str.replace("\.0","")


In [5]:
# transform data (fit_transform on training, transform on validation/test)
def transform_data(train_cleaned, test_cleaned): # add test df after
    
    ### IMPUTE MISSING VALUES

    # impute missing values
    df_num = train_cleaned.select_dtypes(np.float64)
    df_cat = train_cleaned.select_dtypes(['category', 'object', 'datetime64']).drop(["compliance"], axis=1)
    print("num_df", df_num.shape)

    print("cat_df", df_cat.shape)

    # train imputer and save fit-transformed data 
    imp_cat = SimpleImputer(strategy="most_frequent") 
    cat_imp = imp_cat.fit_transform(df_cat) # array
    cat_imp_df = pd.DataFrame(cat_imp, columns=df_cat.columns, index=df_cat.index) # df form
        
    # num (use median)
    imp_num = SimpleImputer(strategy="median") 
    num_imp = imp_num.fit_transform(df_num) # array
    num_imp_df = pd.DataFrame(num_imp, columns=df_num.columns, index=df_num.index) # df form

    ### ENCODE CATEGORICAL DATA - SET THOSE TO INCLUDE HERE ...
    
    # choose features to include and set as category ...
    category_features = ["agency_name", "disposition"]
    for feature in category_features: 
        cat_imp_df[feature] = cat_imp_df[feature].astype("category")  
    cat_df = cat_imp_df[category_features] # vars that will be included in the analysis
    
    #print(cat_df.shape)

    # train encoder and save fit-transformed data
    encoder = OneHotEncoder(handle_unknown="ignore")
    feature_arr = encoder.fit_transform(cat_df)
    feature_names = encoder.get_feature_names(category_features)
    df_cat_ohe =  pd.DataFrame(feature_arr.toarray(), columns=feature_names)

    #print(df_cat_ohe.shape)

    ### SCALE NUMERIC VALUES - USE MIN-MAX AS DEFAULT
    
    # train scaler and save fit-transformed data
    scaler = MinMaxScaler()
    scaled_arr = scaler.fit_transform(num_imp_df)
    num_scaled_df = pd.DataFrame(scaled_arr, columns=df_num.columns, index=df_num.index) # df form
    
    #print(num_scaled_df.shape)
    train_tr = pd.merge(num_scaled_df, df_cat_ohe, left_index=True, right_index=True) #.fillna(0)
    # add compliance back in
    train_tr = pd.merge(train_tr, train_cleaned["compliance"], left_index=True, right_index=True)
    
    
    # After fitting all transformers: transform test data using trained imputers & transformers ...
    
    # impute missing values
    test_num = test_cleaned.select_dtypes(np.float64)
    test_cat = test_cleaned.select_dtypes(['category', 'object', 'datetime64'])
    print("test_num", test_num.shape)
    print("test_cat", test_cat.shape)

    # transform test data 
    cat_imp_test = imp_cat.transform(test_cat) # array
    cat_imp_test = pd.DataFrame(cat_imp_test, columns=test_cat.columns, index=test_cat.index) # df form
        
    # num (use median)
    num_imp_test = imp_num.transform(test_num) # array
    num_imp_test = pd.DataFrame(num_imp_test, columns=test_num.columns, index=test_num.index) # df form

    ### ENCODE CATEGORICAL DATA - SET THOSE TO INCLUDE HERE ...
    
    for feature in category_features: 
        cat_imp_test[feature] = cat_imp_test[feature].astype("category")  
    cat_test = cat_imp_test[category_features] # vars that will be included in the analysis
    
    print(cat_test.shape)

    # transform test data
    feature_arr = encoder.transform(cat_test)
    feature_names = encoder.get_feature_names(category_features)
    test_cat_ohe =  pd.DataFrame(feature_arr.toarray(), columns=feature_names, index=test_num.index)

    print(test_cat_ohe.shape)

    ### SCALE NUMERIC VALUES - USE MIN-MAX AS DEFAULT
    
    # transform test data
    scaled_arr = scaler.transform(num_imp_test)
    num_scaled_test = pd.DataFrame(scaled_arr, columns=test_num.columns, index=test_num.index) # df form
    
    print(num_scaled_test.shape)
        
    test_tr = pd.merge(num_scaled_test, test_cat_ohe, left_index=True, right_index=True) #.fillna(0)
    
    return train_tr, test_tr


train_tr, test_tr = transform_data(train_cleaned, test_cleaned)


num_df (159880, 7)
cat_df (159880, 15)
test_num (61001, 7)
test_cat (61001, 15)
(61001, 2)
(61001, 9)
(61001, 7)


In [14]:
# separate X, y from training data

def separate_X_from_y(train_tr, target_var):
    
    y = train_tr[target_var]
    X = train_tr.drop(target_var, axis=1)
    
    return X,y

X, y = separate_X_from_y(train_tr, target_var)

In [15]:
# training_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit 3 models with defaults

#### Model 1: logistic regression

#### Model 2: random forest
Random forest feature importance shows that the top 5 most critical features include lat, lon, and days_until_hearing, late_fee, discount_amount. Categorical feature ("disposition" and "agency_name") have no importance, so they should be omitted. See below.  

#### Model 3: gradient boosted

In [75]:
# isolate most important features
most_important_features = ["lat", "lon", "days_until_hearing", "late_fee", "discount_amount"]

X_train = X_train[most_important_features]
X_test = X_test[most_important_features]


In [62]:
# logistic regression
from sklearn.linear_model import LogisticRegression

grid_values = {'C': [0.1, 1, 10]}

# metric to optimize over grid parameters: AUC
model_1 = GridSearchCV(LogisticRegression(), param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)

# print('Cross-validation (AUC)', cross_val_score(model_1, X_train, y_train, cv=5, scoring = 'roc_auc'))

In [67]:
# random forest
from sklearn.ensemble import RandomForestClassifier

grid_values = {'min_samples_split': [2, 4, 10], # default is 2
               'max_features':[2, 3, 4]} # default is none

model_2 = GridSearchCV(RandomForestClassifier(), param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)

# print('Cross-validation (AUC)', cross_val_score(model_2, X_train, y_train, cv=5, scoring = 'roc_auc'))

In [91]:
best_model = RandomForestClassifier(max_features=3, min_samples_split=10).fit(X_train, y_train)

In [70]:
# gradient boosted classifier
from sklearn.ensemble import GradientBoostingClassifier

grid_values = {'learning_rate': [0.005, 0.05, 0.1],
               'n_estimators':[100, 500, 100]}

model_3 = GridSearchCV(GradientBoostingClassifier(), param_grid = grid_values, scoring = 'roc_auc').fit(X_train, y_train)

# print('Cross-validation (AUC)', cross_val_score(model_3, X_train, y_train, cv=5, scoring = 'roc_auc'))

In [96]:
# score the model against validation data
def score_model(model, X_test, y_test):
    # print prediction results
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))

    # calculate AUC score (need over 0.7)
    
    roc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    print("Test AUC_score: ", roc)
    if 'model.best_params_' in locals():

        print('Grid best parameter (max. AUC): ', model.best_params_)
        print('Grid best score (AUC): ', model.best_score_)


In [51]:
# check feature importance
def check_feature_importance(X_train, rdf_model):
    for name, score in zip(X_train.columns, rdf_model.feature_importances_):
        print(name, score)
#check_feature_importance(X_train, model_2)

lat 0.30052794310635933
lon 0.29869786595899633
days_until_hearing 0.16990681552176354
late_fee 0.1324604003878745
discount_amount 0.09840697502500634


In [64]:
score_model(model_1, X_test, y_test)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     19813
           1       0.93      0.11      0.20      1676

    accuracy                           0.93     21489
   macro avg       0.93      0.55      0.58     21489
weighted avg       0.93      0.93      0.90     21489

Test AUC_score:  0.7545247045556143
Grid best parameter (max. AUC):  {'C': 10}
Grid best score (AUC):  0.7646057017190658


In [71]:
score_model(model_2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97     19813
           1       0.72      0.27      0.40      1676

    accuracy                           0.94     21489
   macro avg       0.83      0.63      0.68     21489
weighted avg       0.92      0.94      0.92     21489

Test AUC_score:  0.8127172686335614
Grid best parameter (max. AUC):  {'max_features': 3, 'min_samples_split': 10}
Grid best score (AUC):  0.8074591435616666


In [97]:
score_model(best_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97     19813
           1       0.72      0.28      0.40      1676

    accuracy                           0.94     21489
   macro avg       0.83      0.63      0.68     21489
weighted avg       0.92      0.94      0.92     21489

Test AUC_score:  0.8105738234834605


In [72]:
score_model(model_3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     19813
           1       0.82      0.19      0.30      1676

    accuracy                           0.93     21489
   macro avg       0.88      0.59      0.63     21489
weighted avg       0.93      0.93      0.91     21489

Test AUC_score:  0.7953095030419868
Grid best parameter (max. AUC):  {'learning_rate': 0.1, 'n_estimators': 500}
Grid best score (AUC):  0.8017505902569967


In [85]:
# generate the final answer with the test data ...
# create FINAL test set for post-model selection answer
final_X = test_tr[most_important_features]

best_model = model_2 # so far

In [98]:
final_X["compliance"] = [i[1] for i in best_model.predict_proba(final_X)]
answer = final_X["compliance"].astype('float32')

In [99]:
answer

ticket_id
284932    0.015095
285362    0.110369
285361    0.005179
285338    0.198878
285346    0.052675
            ...   
376496    0.046596
376497    0.046596
376499    0.018380
376500    0.018380
369851    0.327579
Name: compliance, Length: 61001, dtype: float32