In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Installs a process-wide "ignore" filter with no category or module restriction,
# so every warning raised after this point is silenced for the rest of the session
# (library deprecation notices and input-validation warnings included).
# NOTE(review): consider narrowing this to the specific warning categories that
# are known to be noise, so that genuine problems still surface.
warnings.filterwarnings("ignore")

# Packing everything into one function

In [3]:
def blight_model(train_path = "data/train.csv", test_path = "data/test.csv"):
    """Fit a random forest on the labelled tickets and score the test tickets.

    Both CSV files are read with their first column as the index. They are
    stacked so that the violation-code encoding is learned from every code that
    appears in either file, reduced to four features (encoded violation code,
    judgment amount, hearing month, ticket-issued month) and split back.
    Training rows containing any missing value are dropped, which removes the
    tickets whose ``compliance`` label is null.

    Parameters
    ----------
    train_path : str or path-like, default "data/train.csv"
        CSV with the labelled tickets. Must contain the columns
        ``ticket_issued_date``, ``hearing_date``, ``violation_code``,
        ``judgment_amount`` and ``compliance``.
    test_path : str or path-like, default "data/test.csv"
        CSV with the tickets to score; same feature columns. A ``compliance``
        column is not expected there.

    Returns
    -------
    pandas.Series
        Named ``"compliance"``, indexed by the test file's index (axis named
        ``"ticket_id"``). Each value is the second column of
        ``predict_proba``, i.e. the predicted probability of the larger class
        label (presumably ``compliance == 1`` - the labels are assumed to be
        0/1).

    Raises
    ------
    KeyError
        If one of the required columns is missing from the stacked data.
    """
    # Kept local so the function is self-contained and can be run on its own.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    seed = 42
    to_keep = ["ticket_issued_date", "hearing_date", "violation_code", "judgment_amount", "compliance"]

    #################### DATA PROCESSING ####################
    train = pd.read_csv(train_path, index_col = 0)
    test = pd.read_csv(test_path, index_col = 0)

    # The frames are stacked train-first, so the row count is enough to split
    # them back by position later. This stays correct even if an id happened to
    # appear in both files, which a label-based lookup would not.
    n_train = len(train)
    full = pd.concat([train, test], axis = 0).loc[:, to_keep]

    # Only the month of each date is used as a feature. A missing hearing date
    # is encoded as month 0 so those rows survive the dropna() below.
    hearing_month = pd.to_datetime(full["hearing_date"]).dt.month.fillna(0)
    ticket_issued_month = pd.to_datetime(full["ticket_issued_date"]).dt.month
    full = (
        full
        .assign(hearing_month = hearing_month, ticket_issued_month = ticket_issued_month)
        .drop(columns = ["hearing_date", "ticket_issued_date"])
    )

    # Integer-encode the violation codes over train + test together so that both
    # parts share one mapping.
    full["violation_code"] = LabelEncoder().fit_transform(full["violation_code"])

    # Labelled rows: any row with a missing value is dropped, which removes the
    # tickets whose compliance is null.
    labelled = full.iloc[:n_train].dropna()
    # Rows to score: compliance is what is being predicted, so it is removed.
    to_score = full.iloc[n_train:].drop(columns = "compliance")

    #################### MODEL SELECTION AND TRAINING ####################
    scaler = MinMaxScaler()
    X = scaler.fit_transform(labelled.drop(columns = "compliance"))
    y = labelled["compliance"]

    # The hold-out part of the split is not evaluated here; the split is kept so
    # the model is fitted on the same subset of rows as before.
    X_train, _, y_train, _ = train_test_split(X, y, random_state = seed)

    model = RandomForestClassifier(n_jobs = -1, random_state = seed)
    model.fit(X_train, y_train)

    # The model was fitted on scaled features, so the rows to score must go
    # through the very same fitted scaler; feeding raw values would place every
    # feature outside the range the trees were split on.
    X_score = scaler.transform(to_score)

    # predict_proba returns one column per class; the second column is kept.
    proba = model.predict_proba(X_score)

    return (
        pd.Series(proba[:, 1], index = to_score.index, name = "compliance")
        .rename_axis("ticket_id")
    )

# Sanity check: runs the whole pipeline (load, train, predict) and displays the
# shape of the returned Series, i.e. how many test tickets received a prediction.
blight_model().shape

(61001,)