 [Detroit Open Data Portal](https://data.detroitmi.gov/)

    ticket_id - unique identifier for tickets
    agency_name - Agency that issued the ticket
    inspector_name - Name of inspector that issued the ticket
    violator_name - Name of the person/organization that the ticket was issued to
    violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred
    mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator
    ticket_issued_date - Date and time the ticket was issued
    hearing_date - Date and time the violator's hearing was scheduled
    violation_code, violation_description - Type of violation
    disposition - Judgment and judgment type
    fine_amount - Violation fine amount, excluding fees
    admin_fee - $20 fee assigned to responsible judgments
    state_fee - $10 fee assigned to responsible judgments
    late_fee - 10% fee assigned to responsible judgments
    discount_amount - discount applied, if any
    clean_up_cost - DPW clean-up or graffiti removal cost
    judgment_amount - Sum of all fines and fees
    grafitti_status - Flag for graffiti violations
    
train.csv only

    payment_amount - Amount paid, if any
    payment_date - Date payment was made, if it was received
    payment_status - Current payment status as of Feb 1 2017
    balance_due - Fines and fees still owed
    collection_status - Flag for payments in collections
    compliance [target variable for prediction] 
     Null = Not responsible
     0 = Responsible, non-compliant
     1 = Responsible, compliant
    compliance_detail - More information on why each ticket was marked compliant or non-compliant



In [1]:
import pandas as pd
import numpy as np
#https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
#https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe
#https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig


def blight_model() -> pd.Series:
    """Train a random forest on blight tickets and score the test tickets.

    Reads ``train.csv`` and ``test.csv`` from the current working directory,
    fits a ``RandomForestClassifier`` on the training tickets that have a
    non-null ``compliance`` label, and predicts on every row of the test file.

    Features used: ``fine_amount``, ``discount_amount``, ``judgment_amount``
    and a one-hot encoding of ``violation_code`` (after stripping trailing
    sub-section markers such as ``(a)``).

    Returns:
        A float Series indexed by the test set's ``ticket_id`` holding the
        second column of ``predict_proba`` -- the probability of class 1
        (compliant) when both labels 0 and 1 occur in the training data.
        The Series is named ``1``.
    """
    from sklearn.ensemble import RandomForestClassifier

    feature_columns = ['violation_code', 'fine_amount', 'discount_amount',
                       'judgment_amount']
    # Trailing characters removed from violation codes so that variants of the
    # same ordinance (e.g. "...(a)", "...(b)") collapse into one category.
    code_suffix_chars = ')dD(aAbBcC'

    train_df = pd.read_csv('train.csv', engine='python')
    test_df = pd.read_csv('test.csv', engine='python')

    # Rows with a null target cannot be used for training.
    train_df = train_df.dropna(subset=['compliance'])
    y_train = train_df['compliance']

    # Set up train info.
    X_train = train_df[feature_columns].copy()
    # .str.rstrip leaves missing codes as NaN instead of raising.
    X_train['violation_code'] = X_train['violation_code'].str.rstrip(code_suffix_chars)
    X_train = pd.get_dummies(X_train, prefix=['violation_code'],
                             columns=['violation_code'], drop_first=True)

    # Set up test info.
    X_test = test_df[feature_columns].copy()
    X_test['violation_code'] = X_test['violation_code'].str.rstrip(code_suffix_chars)
    # No drop_first here: the baseline category is decided by the training
    # data alone, and the reindex below discards whatever train did not keep.
    X_test = pd.get_dummies(X_test, prefix=['violation_code'],
                            columns=['violation_code'])

    # Align the test matrix to the training columns. Codes seen only in test
    # are dropped; codes seen only in train become all-zero columns. The
    # training features therefore never depend on the contents of the test set.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Fixed seed so repeated runs produce the same probabilities.
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    probabilities = pd.DataFrame(clf.predict_proba(X_test),
                                 index=test_df['ticket_id'])

    # Select column 1 directly: unlike squeeze(), this stays a Series even
    # when the test set contains a single row.
    return probabilities[1]