In [5]:
import pandas as pd
import numpy as np


### MAKE SURE TO SWITCH DATA SET URLS BEFORE SUBMITTING 

def blight_model():
    # load libraries & presets
    import warnings
    warnings.filterwarnings('ignore')
    
    # Your code here
    import string

    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import MinMaxScaler

    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import GridSearchCV

    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_auc_score
    
    #load datasets
    train_data = pd.read_csv("train.csv", sep=",", encoding='cp1252')
    test_data = pd.read_csv("test.csv", sep=",", encoding='cp1252')
    latlong_data = pd.read_csv("latlons.csv", sep=",", encoding='cp1252')
    addresses_data = pd.read_csv("addresses.csv", sep=",", encoding='cp1252')
    
    # clean input data
    def clean_data(df):

        # set index
        df = df.set_index('ticket_id')

        # fix misformatted columns (floats to str)
        df['violation_street_number'] = df['violation_street_number'].astype('str')
        df['mailing_address_str_number'] = df['mailing_address_str_number'].astype('str')

        # remove columns not available in the test data set
        leaked_cols = df.columns.difference(test_data.columns)

        if len(leaked_cols) > 0:
            leaked_cols = leaked_cols.tolist()
            leaked_cols.remove('compliance')
            df = df.drop(leaked_cols, axis=1)

        # remove 'not responsible' rows
        if 'compliance' in df.columns:
            df['compliance'] = df['compliance'].astype('str')
            df = df[df["compliance"] != "nan"]


        # remove single-level variables (no information)
        df = df.drop(['violation_zip_code', 'grafitti_status', 'clean_up_cost', 'state_fee', 'admin_fee',
                      'non_us_str_code'], axis=1)

        # clean & consolidate street address data

        # convert address floats to strings
        df['violation_street_number'] = df['violation_street_number'].astype('str')
        df['mailing_address_str_number'] = df['mailing_address_str_number'].astype('str')

        # create consolidated columns
        df['mailing_address'] = df['mailing_address_str_number'] + ' ' + df['mailing_address_str_name']
        df['mailing_address'] = df['mailing_address'].str.translate(str.maketrans('', '', string.punctuation))
        df['violation_address'] = df['violation_street_number'] + ' ' + df['violation_street_name']  

        # remove old versions
        df = df.drop(['mailing_address_str_number', 'mailing_address_str_name', 'violation_street_number', 'violation_street_name'], axis=1)

        # add geolocation data
        geo_address_data = pd.merge(addresses_data, latlong_data, how="left", on="address").set_index("ticket_id")
        df = pd.merge(df, geo_address_data, left_index=True, right_index=True)

        # function to reassign dtypes
        def set_dtypes(vars_list, df, dtype):
            for var in vars_list:
                df[var] = df[var].astype(dtype)
            return(df)

        # set datetime variables 
        date_vars = ['ticket_issued_date','hearing_date']
        df = set_dtypes(date_vars, df, 'datetime64')

        # get time interval between hearing and issued dates
        df["days_until_hearing"] = (df["hearing_date"] - df["ticket_issued_date"]).dt.days.astype('float64')

        # change all strings to lowercase
        for col in df: 
            if (df[col].dtype.name == 'object'):
                df[col] = df[col].str.lower()
                df[col] = df[col].str.replace("\.0","")

        # set category variables
        cat_vars = ['agency_name', 'disposition']
        df = set_dtypes(cat_vars, df, 'category')

        # return cleaned data
        return df

    # transform data (fit_transform on training, transform on validation/test)
    def transform_data(train_cleaned, test_cleaned): # add test df after

        ### IMPUTE MISSING VALUES

        # impute missing values
        df_num = train_cleaned.select_dtypes(np.float64)
        df_cat = train_cleaned.select_dtypes(['category', 'object', 'datetime64']).drop(["compliance"], axis=1)
        #print("num_df", df_num.shape)

        #print("cat_df", df_cat.shape)

        # train imputer and save fit-transformed data 
        imp_cat = SimpleImputer(strategy="most_frequent") 
        cat_imp = imp_cat.fit_transform(df_cat) # array
        cat_imp_df = pd.DataFrame(cat_imp, columns=df_cat.columns, index=df_cat.index) # df form

        # num (use median)
        imp_num = SimpleImputer(strategy="median") 
        num_imp = imp_num.fit_transform(df_num) # array
        num_imp_df = pd.DataFrame(num_imp, columns=df_num.columns, index=df_num.index) # df form

        ### ENCODE CATEGORICAL DATA - SET THOSE TO INCLUDE HERE ...

        # choose features to include and set as category ...
        category_features = ["agency_name", "disposition"]
        for feature in category_features: 
            cat_imp_df[feature] = cat_imp_df[feature].astype("category")  
        cat_df = cat_imp_df[category_features] # vars that will be included in the analysis

        #print(cat_df.shape)

        # train encoder and save fit-transformed data
        encoder = OneHotEncoder(handle_unknown="ignore")
        feature_arr = encoder.fit_transform(cat_df)
        feature_names = encoder.get_feature_names(category_features)
        df_cat_ohe =  pd.DataFrame(feature_arr.toarray(), columns=feature_names)

        #print(df_cat_ohe.shape)

        ### SCALE NUMERIC VALUES - USE MIN-MAX AS DEFAULT

        # train scaler and save fit-transformed data
        scaler = MinMaxScaler()
        scaled_arr = scaler.fit_transform(num_imp_df)
        num_scaled_df = pd.DataFrame(scaled_arr, columns=df_num.columns, index=df_num.index) # df form

        #print(num_scaled_df.shape)
        train_tr = pd.merge(num_scaled_df, df_cat_ohe, left_index=True, right_index=True) #.fillna(0)
        # add compliance back in
        train_tr = pd.merge(train_tr, train_cleaned["compliance"], left_index=True, right_index=True)


        # After fitting all transformers: transform test data using trained imputers & transformers ...

        # impute missing values
        test_num = test_cleaned.select_dtypes(np.float64)
        test_cat = test_cleaned.select_dtypes(['category', 'object', 'datetime64'])
        #print("test_num", test_num.shape)
        #print("test_cat", test_cat.shape)

        # transform test data 
        cat_imp_test = imp_cat.transform(test_cat) # array
        cat_imp_test = pd.DataFrame(cat_imp_test, columns=test_cat.columns, index=test_cat.index) # df form

        # num (use median)
        num_imp_test = imp_num.transform(test_num) # array
        num_imp_test = pd.DataFrame(num_imp_test, columns=test_num.columns, index=test_num.index) # df form

        ### ENCODE CATEGORICAL DATA - SET THOSE TO INCLUDE HERE ...

        for feature in category_features: 
            cat_imp_test[feature] = cat_imp_test[feature].astype("category")  
        cat_test = cat_imp_test[category_features] # vars that will be included in the analysis

        #print(cat_test.shape)

        # transform test data
        feature_arr = encoder.transform(cat_test)
        feature_names = encoder.get_feature_names(category_features)
        test_cat_ohe =  pd.DataFrame(feature_arr.toarray(), columns=feature_names, index=test_num.index)

        #print(test_cat_ohe.shape)

        ### SCALE NUMERIC VALUES - USE MIN-MAX AS DEFAULT

        # transform test data
        scaled_arr = scaler.transform(num_imp_test)
        num_scaled_test = pd.DataFrame(scaled_arr, columns=test_num.columns, index=test_num.index) # df form

        #print(num_scaled_test.shape)

        test_tr = pd.merge(num_scaled_test, test_cat_ohe, left_index=True, right_index=True) #.fillna(0)

        return train_tr, test_tr

    def separate_X_from_y(train_tr, target_var):

        y = train_tr[target_var]
        X = train_tr.drop(target_var, axis=1)

        return X,y
    
    # test with both train and test data ... 
    train_cleaned = clean_data(train_data)
    test_cleaned = clean_data(test_data)
    train_tr, test_tr = transform_data(train_cleaned, test_cleaned)
    
    # separate X, y from training data
    X, y = separate_X_from_y(train_tr, "compliance")

    # training_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    # isolate most important features
    most_important_features = ["lat", "lon", "days_until_hearing", "late_fee", "discount_amount"]
    X_train = X_train[most_important_features]
    X_test = X_test[most_important_features]
    
    # run optimal model based on GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    best_model = RandomForestClassifier(max_features=3, min_samples_split=10).fit(X_train, y_train)
    
    # get the final set of predictions based on the test set
    final_X = test_tr[most_important_features]
    final_X["compliance"] = [i[1] for i in best_model.predict_proba(final_X)]
    answer = final_X["compliance"].astype('float32')
    
    return answer # Your answer here

In [6]:
blight_model()

ticket_id
284932    0.026361
285362    0.139672
285361    0.012650
285338    0.174310
285346    0.053102
            ...   
376496    0.007123
376497    0.007123
376499    0.012500
376500    0.012500
369851    0.294283
Name: compliance, Length: 61001, dtype: float32