# Understanding and Predicting Property Maintenance Fines

<b>Task</b> Understanding the factors which influence whether or not a city resident complies with a blight ticket. Develop a model which predicts which tickets will be paid on time.

<b>Model</b> Gradient Boosted Decision Tree

<b>Data</b>  [Detroit Open Data Portal](https://data.detroitmi.gov/). 

* [Building Permits](https://data.detroitmi.gov/Property-Parcels/Building-Permits/xw2a-a7tf)
* [Trades Permits](https://data.detroitmi.gov/Property-Parcels/Trades-Permits/635b-dsgv)
* [Improve Detroit: Submitted Issues](https://data.detroitmi.gov/Government/Improve-Detroit-Submitted-Issues/fwz3-w3yn)
* [DPD: Citizen Complaints](https://data.detroitmi.gov/Public-Safety/DPD-Citizen-Complaints-2016/kahe-efs3)
* [Parcel Map](https://data.detroitmi.gov/Property-Parcels/Parcel-Map/fxkw-udwf)


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from matplotlib.colors import ListedColormap
from sklearn.model_selection import cross_val_score


In [17]:
def readTrainCSV():
    #no of rows 159880
    cols = ['ticket_id', 'agency_name', 'inspector_name', 'violator_name',
       'mailing_address_str_number', 'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'discount_amount', 'clean_up_cost', 'judgment_amount',
        'grafitti_status', 'compliance']
    
    train_data = pd.read_csv("train.csv", encoding = 'ISO-8859-1', 
                             usecols=cols, index_col=0, low_memory=False)
    
    train_data = train_data.dropna(subset=['compliance'])
    
    return train_data

#readTrainCSV()

In [13]:
def readAddCSV():
    # No NaN addresses
    address_data = pd.read_csv("addresses.csv", index_col=0)
    
    return address_data

#readAddCSV()

In [19]:
def readLatLonCSV():
    # no of rows 121769
    latlon_data = pd.read_csv("latlons.csv")
    latlon_data = latlon_data.fillna(0)
    
    return latlon_data

#readLatLonCSV()

In [21]:
def readValidationTestCSV():
    # no of rows 61001
    val_data = pd.read_csv("test.csv", index_col=0)
    
    return val_data

#readValidationTestCSV()

In [23]:
def getRawTrain():
    latlon_data =  readLatLonCSV()
    address_data = readAddCSV()
    train_data =  readTrainCSV()
    
    train_df = pd.merge(train_data,
                        pd.merge(address_data, latlon_data, how='inner', left_on='address', right_on='address'),
                        how="left",left_index=True, right_index=True)
    
    
    return train_df

#getRawTrain()

In [25]:
def getRawValidation():
    latlon_data =  readLatLonCSV()
    address_data = readAddCSV()
    val_data =  readValidationTestCSV()
    
    val_df = pd.merge(val_data,
                        pd.merge(address_data, latlon_data, how='inner', left_on='address', right_on='address'),
                        how="left",left_index=True, right_index=True)
    
    
    return val_df

#getRawValidation()

In [31]:
def getTrainData():
    feature_cols = ['discount_amount','judgment_amount', 'lat', 'lon']
    
    train_df = getRawTrain()
    
    y_blight = train_df.compliance
    X_blight = train_df[feature_cols] 
    #X_blight.mailing_address_str_number = X_blight['mailing_address_str_number'].fillna(0)
    
    return X_blight, y_blight, feature_cols 

#X_blight, y_blight, feature_cols = getTrainData()


In [37]:
def getValidationData():
    feature_cols = ['discount_amount', 'judgment_amount', 'lat', 'lon']
    
    val_df = getRawValidation()

    X_val = val_df[feature_cols] 
    X_val.lat = X_val.lat.fillna(0)
    X_val.lon = X_val.lon.fillna(0)
    
    val_index = X_val.index
    
    return X_val, val_index

X_val, val_index=getValidationData()

In [41]:
def blight_model():
    X_blight, y_blight, features = getTrainData()
    X_val, val_index = getValidationData()
    X_train, X_test, y_train, y_test = train_test_split(X_blight, y_blight, random_state=100)
    
    learning_rate = [0.01, 0.05, 0.1]
    param_grid = dict(learning_rate=learning_rate)                                         
    clf = GradientBoostingClassifier(random_state=100)
    #print ("Feature Importances")
    #print (list(zip(features, clf.feature_importances_)))
    grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='roc_auc')
    grid_result = grid_search.fit(X_train, y_train)    
                                                        
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    #clf = GradientBoostingClassifier(n_estimators=400, learning_rate=0.1, random_state=0).fit(X_train, y_train)
    
    y_test_predict = grid_result.predict(X_test)
    y_train_predict = grid_result.predict(X_train)
    
    print ("Training data accuracy: {:.2f}".format(grid_result.score(X_train, y_train)))
    print ("Test data accuracy: {:.2f}".format(grid_result.score(X_test, y_test)))
    

    
    print ("AUC score test data: {:.2f}".format(roc_auc_score(y_test, y_test_predict)))
    print ("AUC score train data: {:.2f}".format(roc_auc_score(y_train, y_train_predict)))
    
    probas = grid_result.predict_proba(X_val)[:,1]
    
    output_probas = pd.Series(data=probas, index=val_index)
    
    return output_probas
   
blight_model()

Best: 0.754097 using {'learning_rate': 0.1}
0.721307 (0.006391) with: {'learning_rate': 0.01}
0.749728 (0.005001) with: {'learning_rate': 0.05}
0.754097 (0.006891) with: {'learning_rate': 0.1}
Training data accuracy: 0.76
Test data accuracy: 0.76
AUC score test data: 0.56
AUC score train data: 0.56


ticket_id
284932    0.063088
285362    0.033397
285361    0.078664
285338    0.063088
285346    0.075961
285345    0.063088
285347    0.062150
285342    0.313847
285530    0.031142
284989    0.037376
285344    0.060390
285343    0.032464
285340    0.032235
285341    0.057015
285349    0.078664
285348    0.065366
284991    0.035663
285532    0.036335
285406    0.039537
285001    0.039537
285006    0.031861
285405    0.033397
285337    0.035871
285496    0.062150
285497    0.065366
285378    0.032235
285589    0.036080
285585    0.063088
285501    0.076475
285581    0.032235
            ...   
376367    0.055206
376366    0.061046
376362    0.061046
376363    0.104807
376365    0.055206
376364    0.061046
376228    0.061046
376265    0.061046
376286    0.474135
376320    0.061046
376314    0.061046
376327    0.474135
376385    0.474135
376435    0.192936
376370    0.474135
376434    0.059118
376459    0.289654
376478    0.024602
376473    0.061046
376484    0.051256
376482    0.053384
37