# San Francisco Crime Classification Problem  

The problem is described in detail on [the Kaggle competition page](https://www.kaggle.com/c/sf-crime).  
This notebook presents one solution to it.

### Setting the environment for the experiment

In [1]:
# Render matplotlib plots inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Import needed modules.
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

### Read data from the given files

In [3]:
# Load the training and test sets from CSV files in the working directory.
# The generator is consumed left to right, so train.csv is read first.
train_data, test_data = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))

### Setting features to use

In [4]:
# Column holding the target that the classifier learns to predict.
label_col = "Category"
# Columns fed to the model as input features
# (presumably the incident coordinates; confirm against the dataset description).
feature_cols = ["X", "Y"]

### Setting up and training the model

In [5]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(n_jobs=8).fit(
    train_data[feature_cols],
    train_data[label_col],
)

# One row of class probabilities per test record; columns follow model.classes_.
prediction = model.predict_proba(test_data[feature_cols])

In [6]:
# Distinct category names in ascending order, used as submission column headers.
labels = sorted(set(train_data[label_col]))

### Making a data frame instance to submit

In [7]:
# Build the submission frame: one row per test record, one probability column
# per crime category.
#
# - Rows are keyed by the real "Id" column of test.csv rather than by a
#   positional 0..n-1 index, so the ids stay correct even if the test file is
#   not numbered consecutively from zero.
# - Columns are labelled with model.classes_, which is exactly the order in
#   which predict_proba() lays out its output columns.
submit = pd.DataFrame(
    prediction,
    index=pd.Index(test_data["Id"], name="Id"),
    columns=model.classes_,
)

In [8]:
# Preview only the first rows; the frame holds one row per test record.
submit.head(10)

Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.001761,0.086275,0.000456,0.000346,0.041259,0.004843,0.002548,0.060574,0.004807,0.001310,...,0.000172,0.005093,0.000571,0.035265,0.000007,0.008218,0.050221,0.060365,0.047336,0.010931
1,0.001762,0.086264,0.000456,0.000346,0.041256,0.004844,0.002548,0.060568,0.004808,0.001310,...,0.000172,0.005094,0.000571,0.035263,0.000007,0.008219,0.050217,0.060359,0.047332,0.010939
2,0.001689,0.088961,0.000470,0.000313,0.042531,0.005001,0.002624,0.062450,0.004952,0.001349,...,0.000165,0.005256,0.000587,0.036350,0.000007,0.008481,0.051737,0.062235,0.048871,0.008531
3,0.001780,0.084177,0.000445,0.000360,0.040247,0.004716,0.002484,0.059094,0.004687,0.001277,...,0.000174,0.004961,0.000557,0.034398,0.000007,0.008007,0.049010,0.058891,0.046141,0.012297
4,0.001780,0.084177,0.000445,0.000360,0.040247,0.004716,0.002484,0.059094,0.004687,0.001277,...,0.000174,0.004961,0.000557,0.034398,0.000007,0.008007,0.049010,0.058891,0.046141,0.012297
5,0.001785,0.082644,0.000436,0.000367,0.039509,0.004624,0.002437,0.058014,0.004599,0.001253,...,0.000175,0.004866,0.000546,0.033767,0.000006,0.007854,0.048123,0.057814,0.045276,0.013129
6,0.001763,0.086006,0.000454,0.000348,0.041123,0.004824,0.002538,0.060379,0.004789,0.001305,...,0.000173,0.005073,0.000569,0.035148,0.000007,0.008187,0.050061,0.060172,0.047176,0.011124
7,0.001759,0.086326,0.000456,0.000345,0.041279,0.004845,0.002548,0.060606,0.004808,0.001310,...,0.000172,0.005095,0.000571,0.035282,0.000007,0.008221,0.050247,0.060398,0.047361,0.010893
8,0.001728,0.087798,0.000464,0.000330,0.041979,0.004931,0.002591,0.061637,0.004889,0.001332,...,0.000169,0.005184,0.000580,0.035879,0.000007,0.008365,0.051083,0.061425,0.048200,0.009693
9,0.001671,0.089351,0.000472,0.000306,0.042721,0.005027,0.002636,0.062727,0.004975,0.001355,...,0.000163,0.005282,0.000590,0.036513,0.000007,0.008522,0.051959,0.062511,0.049104,0.008079


### Evaluating the model with k-fold cross-validation

In [9]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

def multiclass_logloss_score(model, features, labels, num_folds=5):
    """Estimate the multiclass log loss of ``model`` by k-fold cross-validation.

    The rows are split into ``num_folds`` consecutive, unshuffled folds. For
    each fold the model is re-fitted on the remaining rows and scored on the
    held-out rows; the per-fold log losses are then averaged.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
        It is re-fitted in place on every fold, so after the call it holds
        the fit from the last fold.
    features : pandas.DataFrame
        Feature rows, indexed positionally via ``.iloc``.
    labels : pandas.Series
        Class label for each row of ``features``.
    num_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean log loss over the folds (lower is better).
    """
    # The full label set, in sorted order. A single fold may miss rare classes
    # on either the training or the held-out side, so both the probability
    # columns and log_loss itself are aligned to this common reference.
    all_classes = np.unique(labels)

    kfolds = KFold(n_splits=num_folds)

    total_score = 0.0

    for train_index, test_index in kfolds.split(features):
        train_features = features.iloc[train_index]
        test_features = features.iloc[test_index]
        train_labels = labels.iloc[train_index]
        test_labels = labels.iloc[test_index]

        model.fit(train_features, train_labels)

        # Classes absent from this training fold get probability 0 so that
        # every fold is scored against the same set of columns.
        prediction = pd.DataFrame(
            model.predict_proba(test_features),
            columns=model.classes_,
        ).reindex(columns=all_classes, fill_value=0.0)

        # Passing `labels` stops log_loss from inferring the class set from
        # the held-out labels alone, which fails when a class is missing there.
        total_score += log_loss(test_labels, prediction.values, labels=all_classes)

    return total_score / num_folds

### Saving the result data frame to the submission file.

In [10]:
from time import strftime, localtime

# A timestamp in the file name keeps successive submissions from overwriting each other.
current_time = strftime("%Y.%m.%d %H.%M.%S", localtime())

# Name the file after the estimator that actually produced the predictions;
# a hard-coded model name goes stale as soon as the model is swapped.
submit.to_csv("%s %s.csv" % (type(model).__name__, current_time))

In [11]:
# Preview only the first rows of what was written to disk.
submit.head(10)

Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.001761,0.086275,0.000456,0.000346,0.041259,0.004843,0.002548,0.060574,0.004807,0.001310,...,0.000172,0.005093,0.000571,0.035265,0.000007,0.008218,0.050221,0.060365,0.047336,0.010931
1,0.001762,0.086264,0.000456,0.000346,0.041256,0.004844,0.002548,0.060568,0.004808,0.001310,...,0.000172,0.005094,0.000571,0.035263,0.000007,0.008219,0.050217,0.060359,0.047332,0.010939
2,0.001689,0.088961,0.000470,0.000313,0.042531,0.005001,0.002624,0.062450,0.004952,0.001349,...,0.000165,0.005256,0.000587,0.036350,0.000007,0.008481,0.051737,0.062235,0.048871,0.008531
3,0.001780,0.084177,0.000445,0.000360,0.040247,0.004716,0.002484,0.059094,0.004687,0.001277,...,0.000174,0.004961,0.000557,0.034398,0.000007,0.008007,0.049010,0.058891,0.046141,0.012297
4,0.001780,0.084177,0.000445,0.000360,0.040247,0.004716,0.002484,0.059094,0.004687,0.001277,...,0.000174,0.004961,0.000557,0.034398,0.000007,0.008007,0.049010,0.058891,0.046141,0.012297
5,0.001785,0.082644,0.000436,0.000367,0.039509,0.004624,0.002437,0.058014,0.004599,0.001253,...,0.000175,0.004866,0.000546,0.033767,0.000006,0.007854,0.048123,0.057814,0.045276,0.013129
6,0.001763,0.086006,0.000454,0.000348,0.041123,0.004824,0.002538,0.060379,0.004789,0.001305,...,0.000173,0.005073,0.000569,0.035148,0.000007,0.008187,0.050061,0.060172,0.047176,0.011124
7,0.001759,0.086326,0.000456,0.000345,0.041279,0.004845,0.002548,0.060606,0.004808,0.001310,...,0.000172,0.005095,0.000571,0.035282,0.000007,0.008221,0.050247,0.060398,0.047361,0.010893
8,0.001728,0.087798,0.000464,0.000330,0.041979,0.004931,0.002591,0.061637,0.004889,0.001332,...,0.000169,0.005184,0.000580,0.035879,0.000007,0.008365,0.051083,0.061425,0.048200,0.009693
9,0.001671,0.089351,0.000472,0.000306,0.042721,0.005027,0.002636,0.062727,0.004975,0.001355,...,0.000163,0.005282,0.000590,0.036513,0.000007,0.008522,0.051959,0.062511,0.049104,0.008079


In [12]:
# Show the column sets of the training and test frames, one per line.
print(train_data.columns, test_data.columns, sep="\n")

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')
Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y'], dtype='object')


In [13]:
# Number of distinct values in the training set's Address column.
print(train_data["Address"].unique().size)

23228
