In [27]:
import csv
import gzip
import pandas as pd
import numpy as np
###
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler

In [2]:
def get_data(fn):
    """Read the CSV at ``fn`` into a pandas DataFrame.

    The first row of the file (``header=0``) supplies the column names.
    ``fn`` is handed to ``pd.read_csv`` unchanged.
    """
    frame = pd.read_csv(fn, header=0)
    return frame

In [3]:
def timing(time):
    """Convert a colon-separated clock string into minutes.

    Only the first two fields are used (hours, then minutes); any further
    field, such as seconds, is ignored.
    """
    parts = time.split(':')
    hours, minutes = int(parts[0]), int(parts[1])
    return hours * 60 + minutes

In [4]:
# Reference value lists for the categorical columns.
# `days` and `districts` are not referenced by any other cell shown in this notebook.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
districts = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
# Class names in alphabetical order. len(labels) is used as the network's output
# width and the list is written as the header of the submission CSV, so the order
# must match the integer class codes -- presumably the sorted category order that
# pandas assigns; verify if the category set ever changes.
labels = 'ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS'.split(',')

In [5]:
# Load the training set; the path is relative to the notebook's working directory.
raw_train = get_data('data/train.csv')

In [6]:
# Per-row list of the upper-case words longer than two characters found in the
# address. This column is dropped again when the feature matrix X is built.
raw_train['Street'] = [
    [w for w in row.split() if w.isupper() and len(w) > 2]
    for row in raw_train['Address']
]

# Integer-encode the categorical columns. The pd.Categorical constructor replaces
# Categorical.from_array, which was deprecated and later removed from pandas; the
# resulting codes are the same (categories are inferred and sorted from the values).
raw_train['Days'] = pd.Categorical(raw_train['DayOfWeek']).codes
raw_train['Districts'] = pd.Categorical(raw_train['PdDistrict']).codes
raw_train['Labels'] = pd.Categorical(raw_train['Category']).codes

In [7]:
# Split each timestamp at the first space into a date part and a time part.
# The split limit is passed by keyword (n=1): newer pandas releases accept only
# `pat` positionally, while older releases accept the keyword as well.
Dates = pd.DataFrame(raw_train['Dates'].str.split(' ', n=1).tolist(), columns=['Date', 'Time'])

In [8]:
# Break the date part on '/' into three columns. The pieces come straight from
# str.split, so they are still strings at this point.
Date = pd.DataFrame(
    data=Dates['Date'].str.split('/').tolist(),
    columns=['Month', 'Date', 'Year'],
)

In [9]:
# Replace every clock string with the integer returned by timing().
# Not idempotent: after one run the column holds ints, which timing() cannot split.
Dates['Time'] = Dates['Time'].map(timing)

In [10]:
# Attach the Month/Date/Year columns to the training frame (axis=1 = side by side).
# Re-running this cell concatenates the same columns a second time.
raw_train = pd.concat([raw_train,Date], axis =1)

In [11]:
# Copy the converted time-of-day column from the Dates helper frame onto the training frame.
raw_train['Time']=Dates['Time']

In [12]:
# Feature matrix: everything left after removing the raw text columns, the
# target (Category / Labels) and the helper Street column.
X = raw_train.drop(
    labels=[
        'Dates',
        'Category',
        'Descript',
        'DayOfWeek',
        'PdDistrict',
        'Resolution',
        'Address',
        'Street',
        'Labels',
    ],
    axis=1,
)

In [13]:
# Integer class code per row. `.loc` replaces the `.ix` indexer, which has been
# removed from pandas; for a column label the selection is identical.
y = raw_train.loc[:, 'Labels']

In [14]:
# Convert the integer class codes into the matrix form used as the training
# target (the model is compiled with a categorical_crossentropy loss).
# NOTE(review): the number of classes is not passed explicitly, so it is
# presumably inferred from the values in y -- confirm it equals len(labels).
Y = np_utils.to_categorical(y)

In [15]:
# Network sizes: one input per feature column of X, one output per class name.
input_dim = X.shape[1]
output_dim = len(labels)
print('Input dimensions: {}'.format(input_dim))
print('Output dimensions: {}'.format(output_dim))

Input dimensions: 8
Output dimensions: 39


In [16]:
def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1):
    """Assemble and compile the feed-forward classifier.

    Layer order: Dense(input_dim, hn) -> PReLU -> Dropout, then ``layers``
    repetitions of Dense(hn, hn) -> PReLU -> BatchNormalization -> Dropout,
    and finally Dense(hn, output_dim) -> softmax.

    Args:
        input_dim: first argument of the first Dense layer (number of features).
        output_dim: second argument of the last Dense layer (number of classes).
        hn: width used for every hidden layer.
        dp: value passed to every Dropout layer.
        layers: how many hidden blocks follow the input block.

    Returns:
        The Sequential model, compiled with categorical cross-entropy and Adam.
    """
    # Layers are created in the same order in which they are stacked.
    stack = [
        Dense(input_dim, hn, init='glorot_uniform'),
        PReLU((hn,)),
        Dropout(dp),
    ]

    for _block in range(layers):
        stack.append(Dense(hn, hn, init='glorot_uniform'))
        stack.append(PReLU((hn,)))
        stack.append(BatchNormalization((hn,)))
        stack.append(Dropout(dp))

    stack.append(Dense(hn, output_dim, init='glorot_uniform'))
    stack.append(Activation('softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [17]:
# Training settings. EPOCHS, BATCHES and HN are also read by the cell that
# trains the submission model.
EPOCHS = 1
BATCHES = 128
HN = 64
RUN_FOLDS = False  # set to True to run the K-fold evaluation below
nb_folds = 4
kfolds = KFold(len(y), nb_folds)
av_ll = 0.
f = 0
if RUN_FOLDS:
    # Use positional NumPy arrays: indexing a DataFrame with an integer array
    # (X[train]) selects columns by label instead of rows.
    X_all = np.asarray(X, dtype=float)
    y_all = np.asarray(y)
    for train, valid in kfolds:
        print('---' * 20)
        print('Fold', f)
        print('---' * 20)
        f += 1

        # Standardise inside the fold, fitting on the training rows only, so the
        # network sees inputs on the same scale as the submission model does and
        # no validation statistics leak into training.
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(X_all[train])
        X_valid = fold_scaler.transform(X_all[valid])
        Y_train = Y[train]
        Y_valid = Y[valid]
        y_valid = y_all[valid]

        print("Building model...")
        model = build_model(input_dim, output_dim, HN)

        print("Training model...")

        model.fit(X_train, Y_train, nb_epoch=EPOCHS, batch_size=BATCHES, validation_data=(X_valid, Y_valid), verbose=0)
        valid_preds = model.predict_proba(X_valid)
        ll = metrics.log_loss(y_valid, valid_preds)
        print("LL:", ll)
        av_ll += ll
        # f is the number of folds finished so far, which makes this a true
        # running mean (dividing by nb_folds under-reports until the last fold).
        print('Average LL:', av_ll / f)

In [18]:
def preprocess_data(X, scaler=None):
    """Standardise ``X`` and return it together with the scaler that was used.

    Args:
        X: feature matrix accepted by the scaler's ``fit`` / ``transform``.
        scaler: an already fitted scaler to reuse. When ``None`` a new
            ``StandardScaler`` is created and fitted on ``X``.

    Returns:
        ``(X_transformed, scaler)``.
    """
    # Test for None explicitly so that a supplied scaler is never replaced
    # because of how it evaluates in a boolean context.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    # Transform in both cases; previously a supplied scaler was returned
    # together with the untouched input.
    X = scaler.transform(X)
    return X, scaler
# Fit a scaler on the training features and keep it in `scaler`.
# X is rebound to preprocess_data's return value here, so later cells see the
# scaled data, not the DataFrame built earlier.
X, scaler = preprocess_data(X)

In [19]:
print("Generating submission...")

# Train the final model on the full training set with the settings defined above.
# No random seed is set in the cells shown, so results may differ between runs.
model = build_model(input_dim, output_dim, HN)
# As the last expression in the cell, the return value of fit() is echoed as the cell output.
model.fit(X, Y, nb_epoch=EPOCHS, batch_size=BATCHES, verbose=0)

Generating submission...


<keras.callbacks.History at 0x1132edb50>

In [20]:
# Load the test set; the path is relative to the notebook's working directory.
raw_test = get_data('data/test.csv')

In [21]:
# Encode day and district with the category sets observed in the training frame,
# so a given value receives the same integer code in both frames even if the
# test set lacks some value. Values never seen in training get code -1.
# The pd.Categorical constructor replaces the removed Categorical.from_array.
raw_test['Days'] = pd.Categorical(
    raw_test['DayOfWeek'],
    categories=pd.Categorical(raw_train['DayOfWeek']).categories,
).codes
raw_test['Districts'] = pd.Categorical(
    raw_test['PdDistrict'],
    categories=pd.Categorical(raw_train['PdDistrict']).categories,
).codes

# Same timestamp handling as for the training frame: split at the first space
# (limit passed by keyword, as newer pandas requires), break the date on '/',
# and convert the clock string with timing().
Dates = pd.DataFrame(raw_test['Dates'].str.split(' ', n=1).tolist(), columns=['Date', 'Time'])
Date = pd.DataFrame(Dates['Date'].str.split('/').tolist(), columns=['Month', 'Date', 'Year'])
Dates['Time'] = Dates['Time'].apply(timing)
raw_test = pd.concat([raw_test, Date], axis=1)
raw_test['Time'] = Dates['Time']

In [22]:
# Test feature matrix: drop the identifier and the raw text columns, keeping
# the numeric and encoded columns.
test_X = raw_test.drop(
    labels=[
        'Id',
        'Dates',
        'DayOfWeek',
        'PdDistrict',
        'Address',
    ],
    axis=1,
)

In [25]:
# Apply the scaler that was fitted on the training features. Fitting a new
# scaler on the test set would standardise it with different means and
# variances than the ones the model was trained on.
test_X = scaler.transform(test_X)

In [28]:
print('Predicting over testing data...')
preds = model.predict_proba(test_X, verbose=0)

# One prediction row per test row is required for pairing them with the ids.
assert len(preds) == len(raw_test), 'prediction count does not match test rows'

# newline='' stops the text wrapper from translating the '\n' terminator chosen
# below, so the file is byte-identical on every platform.
with gzip.open('sf-nn.csv.gz', 'wt', newline='') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(['Id'] + labels)

    # Write the real identifier from the test file; the row position is not
    # guaranteed to equal the Id.
    for row_id, pred in zip(raw_test['Id'], preds):
        fo.writerow([row_id] + list(pred))

Predicting over testing data...
