In [None]:
import pandas as pd
# Only the first 10,000 training rows are used. `nrows` makes pandas stop
# parsing after those rows instead of reading the whole file and slicing it.
train_data = pd.read_csv('kaggle_crimes/train.csv', nrows=10000)
# NOTE(review): `test_data` is used by the test-set prediction cell further
# down, but this load is currently disabled.
# test_data = pd.read_csv('kaggle_crimes/test.csv')

In [311]:
# adding features
import re

# Token directly in front of an " ST" suffix. Digits are allowed so numbered
# streets ("16TH ST") keep their number instead of collapsing to "TH", and the
# trailing \b stops " ST" from matching the start of a longer word.
_STREET_RE = re.compile(r'([A-Z0-9]+)\sST\b')


def get_street_name(address):
    """Return the street token that precedes the first " ST" in `address`.

    Parameters:
        address: address string, e.g. "800 Block of BRYANT ST" or
            "OAK ST / LAGUNA ST".

    Returns:
        The upper-case/digit token before the first standalone "ST"
        ("BRYANT", "OAK", "16TH"), or '' when the address has none.
        Only the last word of a multi-word name is captured.
    """
    match = _STREET_RE.search(address)
    return match.group(1) if match else ''

def get_crime_inside(address):
    """Flag addresses written in the "<digits> Block of ..." form.

    Returns 1 when `address` begins with one or more digits followed by
    " Block of", and 0 otherwise.
    """
    # re.match only tries position 0, the same anchoring as a leading '^'.
    starts_with_block = re.match(r'\d+ Block of', address) is not None
    return int(starts_with_block)

def get_season(month):
    """Map a month number (1-12) to a season code in 0-3.

    Months 12, 1 and 2 give 0; 3-5 give 1; 6-8 give 2; 9-11 give 3.
    Any other value raises KeyError.
    """
    # Rotating December to position 0 makes each season a run of three months.
    season_by_month = {m: (m % 12) // 3 for m in range(1, 13)}
    return season_by_month[month]

def get_day_time(hour):
    """Bucket an hour of the day into a part-of-day code.

    Hours below 5 give 0, below 12 give 1, below 19 give 2 and below 23
    give 3. Anything else (hour 23 and above) falls back to 0, the same
    code as the early-morning hours.
    """
    # (exclusive upper bound, code) pairs, checked in ascending order.
    buckets = ((5, 0), (12, 1), (19, 2), (23, 3))
    for upper_bound, code in buckets:
        if hour < upper_bound:
            return code
    return 0

# Parse the timestamp strings, then derive the calendar parts from them.
train_data['Dates'] = pd.to_datetime(train_data['Dates'], format='%Y-%m-%d %H:%M:%S')
train_data['Month'] = train_data['Dates'].apply(lambda ts: ts.month)
train_data['Hour'] = train_data['Dates'].apply(lambda ts: ts.hour)
train_data['Day'] = train_data['Dates'].apply(lambda ts: ts.day)

# Bucketed versions of month and hour (see get_season / get_day_time).
train_data['Season'] = train_data['Month'].apply(get_season)
train_data['DayTime'] = train_data['Hour'].apply(get_day_time)

# 1 for Saturday/Sunday, 0 for every other day name.
weekend = ['Saturday', 'Sunday']
train_data['Weekend'] = train_data['DayOfWeek'].apply(lambda day_name: int(day_name in weekend))

# Features parsed out of the free-text address.
train_data['Street'] = train_data['Address'].apply(get_street_name)
train_data['Inside'] = train_data['Address'].apply(get_crime_inside)



In [312]:
# encoding string values
from sklearn import preprocessing

# Encoder for the target; kept so the integer codes can be mapped back to
# category names later via `categories_labels.classes_`.
categories_labels = preprocessing.LabelEncoder()
train_columns_to_encode = ['Descript', 'PdDistrict', 'Resolution', 'DayOfWeek', 'Street']

# One fitted encoder per column. Refitting a single shared encoder would keep
# only the mapping of the last column; with this dict every mapping survives,
# so other data can be encoded with `column_encoders[col].transform(...)`
# and codes can be decoded with `inverse_transform`.
column_encoders = {}
for col in train_columns_to_encode:
    # `columns_labels` stays bound to the most recently fitted encoder,
    # which is the state the previous single-encoder loop left behind.
    columns_labels = preprocessing.LabelEncoder()
    train_data[col] = columns_labels.fit_transform(train_data[col])
    column_encoders[col] = columns_labels

train_data.Category = categories_labels.fit_transform(train_data.Category)

In [313]:
# #plot localization and categories of crimes
# from matplotlib import pyplot as plt
# plt.scatter(train_data.X, train_data.Y, c=train_data.Category)
# plt.show()

In [314]:
# features selection

# from sklearn.feature_selection import SelectKBest, f_classif

# Columns fed to the classifier.
predictors = ["X", "Y", "Inside", "DayTime", "PdDistrict"]
# selector = SelectKBest(f_classif, k=7)
# selector.fit(train_data[predictors], train_data.Category)
# `.values` returns the same NumPy array as DataFrame.as_matrix(), which was
# deprecated and then removed in pandas 1.0; `.values` works on old and new
# pandas alike.
selected_features_train = train_data[predictors].values
# print(selector.scores_)

In [315]:
# cross-validation and classification

from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier,  OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

import numpy as np

# classifier = Classifier(
#     layers=[
#         Layer("Maxout", units=100, pieces=2),
#         Layer("Softmax")],
#     learning_rate=0.001,
#     n_iter=25)


# Three folds over the row positions of the training frame.
# NOTE(review): this is the legacy sklearn.cross_validation.KFold signature
# (row count + n_folds); newer scikit-learn releases expose a different API.
kf = KFold(len(train_data), n_folds=3, random_state=1)
classifier = KNeighborsClassifier(n_neighbors=50)
# classifier = OneVsRestClassifier(LinearSVC(random_state=0))
# classifier = SVC()

# Fit on each training split and collect the held-out predictions per fold.
fold_predictions = []
for train, test in kf:
    classifier.fit(selected_features_train[train], train_data.Category.iloc[train])
    fold_predictions.append(classifier.predict(selected_features_train[test]))

# Join the per-fold arrays, in fold order, into a single array.
predictions = np.concatenate(fold_predictions)


In [316]:
# calculate error
import scipy as sp
import math

def class_to_vect(classification, cls_count=None):
    """One-hot encode integer class labels as a list of 0/1 lists.

    Parameters:
        classification: iterable of integer class indices.
        cls_count: length of each one-hot row. When omitted, the number of
            distinct values in the global `train_data.Category` is used,
            which is what this function always did before.

    Returns:
        A list with one row per label; each row holds `cls_count` zeros
        except for a 1 at the label's index.
    """
    if cls_count is None:
        # Default keeps the original coupling to the training frame.
        cls_count = len(train_data.Category.unique())
    vect = []
    for cls in classification:
        row = [0] * cls_count
        row[cls] = 1
        vect.append(row)
    return vect

def log_loss(act, pred):
    """Element-wise binary log loss, summed over all entries and divided by
    the number of rows.

    Parameters:
        act: 2-D array-like of 0/1 targets (one row per sample).
        pred: array-like of predicted probabilities, same shape as `act`.

    Returns:
        -(sum over every entry of act*log(pred) + (1-act)*log(1-pred))
        divided by len(act).

    Raises:
        ValueError: if `act` is empty.

    NOTE(review): every column contributes both a log(pred) and a log(1-pred)
    term, so this is not the single-term multiclass log loss; confirm this is
    the metric intended.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    if len(act) == 0:
        raise ValueError("log_loss needs at least one sample")
    # Clip away exact 0/1 so the logarithms stay finite. NumPy is called
    # directly because the NumPy aliases on the top-level scipy namespace
    # (sp.log, sp.maximum, ...) are deprecated.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    total = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return -total / len(act)

def llfun(act, pred):
    """Log loss for hard 1/0 predictions.

    Each entry where `pred` differs from `act` costs -log(1e-15); matching
    entries cost nothing. The total is divided by len(act).

    Parameters:
        act: array-like of true labels.
        pred: array-like of predicted labels, same shape as `act`.
    """
    act = np.asarray(act)
    pred = np.asarray(pred)
    # Count mismatches directly: unary minus on a boolean array is rejected
    # by current NumPy and acted as a logical NOT on old releases, which
    # inverted the mask.
    mismatches = (act != pred).sum()
    return mismatches * -math.log(1e-15) / len(act)

# Score the cross-validated predictions: true and predicted labels are both
# one-hot encoded before being handed to log_loss.
actual_one_hot = class_to_vect(train_data.Category)
predicted_one_hot = class_to_vect(predictions)
log_loss_classification = log_loss(actual_one_hot, predicted_one_hot)
print(log_loss_classification)

49.5360864969


In [300]:
# predict test set

# add features
# Load the test set here: nothing earlier in the notebook defines `test_data`
# (its read_csv line in the first cell is commented out), so without this the
# cell fails on a fresh kernel. Reloading also means re-running the cell
# starts from the raw columns rather than already-encoded ones.
test_data = pd.read_csv('kaggle_crimes/test.csv')

# Same derived columns as built for the training frame.
test_data['Dates'] = pd.to_datetime(test_data['Dates'], format='%Y-%m-%d %H:%M:%S')
test_data['Month'] = test_data.Dates.apply(lambda x: x.month)
test_data['Hour'] = test_data.Dates.apply(lambda x: x.hour)
weekend = ['Saturday', 'Sunday']
test_data['Weekend'] = test_data.DayOfWeek.apply(lambda x: int(x in weekend))
test_data['Street'] = test_data.Address.apply(get_street_name)
test_data['Day'] = test_data.Dates.apply(lambda x: x.day)
test_data['Season'] = test_data.Month.apply(get_season)
test_data['DayTime'] = test_data.Hour.apply(get_day_time)
test_data['Inside'] = test_data.Address.apply(get_crime_inside)


# encode columns values
test_columns_to_encode = ['PdDistrict', 'DayOfWeek', 'Street']

# NOTE(review): fit_transform refits the encoder on the test values, so a code
# only means the same thing as in training if both frames contain exactly the
# same set of values for that column - verify, or reuse encoders fitted on
# the training data.
for col in test_columns_to_encode:
    test_data[col] = columns_labels.fit_transform(test_data[col])

# select best features
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
selected_features_test = test_data[predictors].values # selector.transform(test_data[predictors])

# classify: refit on the full training matrix, then predict the test rows
classifier.fit(selected_features_train, train_data.Category)
test_classification = classifier.predict(selected_features_test)




In [301]:
# save result
# One-hot encode the predicted labels; the columns are named after the
# classes of the fitted category encoder.
submission_rows = class_to_vect(test_classification)
classification_results = pd.DataFrame(submission_rows, columns=categories_labels.classes_)
# The row index is written out as the 'Id' column.
classification_results.to_csv('kaggle_crimes/submission.csv', index_label='Id')