In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# Reading preprocessed data
incidents = pd.read_csv("../sf-police-calls-for-service-and-incidents/label_encoded_xy.csv")
print(incidents.head(10))

   Category  DayOfWeek  Time  PdDistrict  Resolution           X          Y  \
0        36          5   630           7          11 -122.411912  37.775207   
1        20          5   254           7          11 -122.419258  37.775146   
2        21          5   121           3           0 -122.417813  37.757101   
3        21          5   121           3           0 -122.417813  37.757101   
4        21          5   121           3           0 -122.417813  37.757101   
5        21          5    87           3          11 -122.415617  37.756414   
6        25          5    85           7           0 -122.410042  37.781954   
7         7          5    85           7           0 -122.410042  37.781954   
8        17          5    19           5           0 -122.447761  37.769846   
9        37          5    19           5           0 -122.447761  37.769846   

   Day  Month  
0   15      5  
1   15      5  
2   15      5  
3   15      5  
4   15      5  
5   15      5  
6   15      5  
7 

In [4]:
# There's no point in using resolution,
# since we can't have it when predicting
incidents.pop('Resolution')

# Separating target
y = incidents.pop('Category')
y.head(10)

0    36
1    20
2    21
3    21
4    21
5    21
6    25
7     7
8    17
9    37
Name: Category, dtype: int64

In [5]:
# Splitting into training and testing data
X_train, X_test, y_train, y_test = train_test_split(incidents, y, test_size=0.2, random_state=42)

Training and testing data must be normalized and standardized separately to avoid peeking.

In [10]:
# Normalize
#normalizer = Normalizer()
#X_train = normalizer.fit_transform(X_train)
#X_test = normalizer.fit_transform(X_test)

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Gradient boosting with XGBoost

In [6]:
# Randomly assign parameters within typical range
params = {
    'booster': 'gbtree',
    'verbosity': 0,
    'eta': np.random.uniform(.05,.2),
    'min_child_weight': np.random.randint(3,7),
    'max_depth': np.random.randint(3,10),
    'subsample': np.random.uniform(.5,1),
    'colsample_bytree': np.random.uniform(.5,1),
    'objective': 'multi:softprob',
    'eval_metric': 'rmse'
}

xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8863841517833149, eta=0.15507709650135765,
       eval_metric='rmse', gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=6, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9208624213752619, verbosity=0)

In [7]:
y_pred = xgb.predict(X_test)

In [8]:
accuracy_score(y_test, y_pred)

0.3402715544971276

Out of sample accuracy of 34%.

In [9]:
pred_prob = xgb.predict_proba(X_test)

In [10]:
log_loss(y_test.values, pred_prob)

2.171404348096407

Out of sample logarithmic loss of 2.17.

### Random Forest

In [None]:
# Tried with Grid search cross validation, got memory leak

#from sklearn.model_selection import GridSearchCV
# Set the parameters by cross-validation
#tuned_parameters = [{'n_estimators': [200], 'min_samples_leaf': [1, 10],
                     #'random_state': [47], 'n_jobs': [-1]}]
#clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='neg_log_loss')

In [11]:
# The more estimaters, the more accurate the result,
# the more min_samples_leaf, the more robust to noise.
clf = RandomForestClassifier(n_estimators=200, min_samples_leaf=50,
                            random_state=31, n_jobs = -1)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=31, verbose=0, warm_start=False)

In [7]:
accuracy_score(y_test, clf.predict(X_test))

0.2924775115405018

Slightly less accuraty than boosting, 29%.

In [8]:
pred_prob_rf = clf.predict_proba(X_test)

In [9]:
from sklearn.metrics import log_loss
log_loss(y_test.values, pred_prob_rf)

2.3663331080633467

More loss than boosting, 2.37.