In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Read crime data
#
# Only a missing/unreadable file gets the friendly hint below. The error is
# re-raised so that later cells never run with `crime_data` undefined, and any
# unrelated failure (e.g. a keyboard interrupt) surfaces unchanged instead of
# being swallowed.
try:
    crime_data = pd.read_csv("train.csv", parse_dates=['Dates'])
except IOError:
    print("Dataset could not be loaded. Is the dataset missing?")
    raise
else:
    print("Data read successfully!")

Data read successfully!


In [3]:
# Preview the first rows of the raw training data.
crime_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Distinct values of the "Category" column (the crime categories).
# NOTE(review): the name `Catagories` is misspelled and is rebound to the
# police districts by a later cell.
Catagories = crime_data["Category"].unique()
Catagories

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE',
       'PORNOGRAPHY/OBSCENE MAT'], dtype=object)

In [5]:
# Column names of the raw training data, as a NumPy array.
crime_data.columns.values

array(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'], dtype=object)

In [6]:
# Columns that PreProcess drops from the training data.
#
# Dropped:
#   Dates      - timestamp of the crime incident
#   Descript   - detailed description of the crime incident (only in train.csv)
#   DayOfWeek  - the day of the week
#   Resolution - how the crime incident was resolved (only in train.csv)
#   Address    - the approximate street address of the crime incident
#
# Deliberately kept (not listed here):
#   Category   - category of the crime incident (only in train.csv); this is
#                the target variable to predict
#   PdDistrict - name of the Police Department District
#   X, Y       - longitude, latitude
columns_ = ['Dates', 'Descript', 'DayOfWeek', 'Resolution', 'Address']

In [7]:
def PreProcess(data, t_data = False):
    """Turn a raw crime DataFrame into a table of model-ready columns.

    Steps:
      1. Derive 'Year', 'Month', 'Day', 'DayNumber' (pandas ``dayofweek``),
         'Hour' and 'Dark' (1 when the hour is >= 21 or < 5, else 0) from the
         datetime column 'Dates'.
      2. Drop the columns that are not used: the module-level ``columns_``
         list for training data, or 'Id', 'Dates' and 'Address' when
         ``t_data`` is true.
      3. Replace every remaining object-dtype column by its one-hot dummy
         columns (named after the values, without a prefix); all other
         columns are passed through unchanged. No scaling is applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data containing a datetime 'Dates' column. The frame passed in is
        left unmodified.
    t_data : bool, default False
        True when ``data`` is the test set (which carries an 'Id' column).

    Returns
    -------
    pandas.DataFrame
        Same index as ``data``; columns in the order they are encountered.

    Raises
    ------
    ValueError
        If two output columns end up with the same name (for example a dummy
        value that equals another column's name).
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    # Convert Dates column to Year, Month, Day, Hour individual columns
    # (vectorised through the .dt accessor instead of per-row lambdas).
    stamps = data['Dates'].dt
    data['Year'] = stamps.year
    data['Month'] = stamps.month
    data['Day'] = stamps.day
    data['DayNumber'] = stamps.dayofweek
    data['Hour'] = stamps.hour
    data['Dark'] = ((stamps.hour >= 21) | (stamps.hour < 5)).astype(int)

    # Drop unnecessary columns. `axis` is passed by keyword because the
    # positional form was removed in pandas 2.0.
    if not t_data:
        data = data.drop(columns_, axis=1)
    else:
        data = data.drop(['Id', 'Dates', 'Address'], axis=1)

    # Collect the output pieces column by column. Positional access is used
    # instead of DataFrame.iteritems(), which no longer exists in pandas 2.0.
    pieces = []
    for position in range(data.shape[1]):
        col_data = data.iloc[:, position]
        # If non-numeric (object dtype), convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data)
        pieces.append(col_data)

    if not pieces:
        return pd.DataFrame(index=data.index)

    # Assemble everything in one pass rather than re-joining a growing frame.
    outX = pd.concat(pieces, axis=1)

    # The previous join-based assembly raised ValueError on overlapping column
    # names; keep that guarantee explicit.
    if not outX.columns.is_unique:
        names = list(outX.columns)
        clashes = [name for name in set(names) if names.count(name) > 1]
        raise ValueError("columns overlap after preprocessing: {}".format(clashes))

    return outX

In [8]:
# Run the training data through PreProcess (training mode, t_data defaults to
# False) and preview the result.
processed_crime_data = PreProcess(crime_data)
processed_crime_data.head()


Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,TARAVAL,TENDERLOIN,X,Y,Year,Month,Day,DayNumber,Hour,Dark
0,0,0,0,0,0,0,0,0,0,0,...,0,0,-122.425892,37.774599,2015,5,13,2,23,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,-122.425892,37.774599,2015,5,13,2,23,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,-122.424363,37.800414,2015,5,13,2,23,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,-122.426995,37.800873,2015,5,13,2,23,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,-122.438738,37.771541,2015,5,13,2,23,1


In [9]:
# Distinct values of the "PdDistrict" column (the police districts).
# NOTE(review): this rebinds `Catagories`, which an earlier cell bound to the
# crime categories.
Catagories = crime_data["PdDistrict"].unique()
Catagories

array(['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL',
       'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN'], dtype=object)

In [10]:
# Feature columns: coordinates, the date parts derived by PreProcess, and the
# police-district dummy columns.
feature_cols = [
    'X', 'Y',
    'Year', 'Month', 'Day', 'Hour', 'Dark', 'DayNumber',
    'NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND',
    'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN',
]
# Every processed column that is not a feature is treated as a target column.
target_cols = processed_crime_data.columns.difference(feature_cols)
print(target_cols)

Index([u'ARSON', u'ASSAULT', u'BAD CHECKS', u'BRIBERY', u'BURGLARY',
       u'DISORDERLY CONDUCT', u'DRIVING UNDER THE INFLUENCE', u'DRUG/NARCOTIC',
       u'DRUNKENNESS', u'EMBEZZLEMENT', u'EXTORTION', u'FAMILY OFFENSES',
       u'FORGERY/COUNTERFEITING', u'FRAUD', u'GAMBLING', u'KIDNAPPING',
       u'LARCENY/THEFT', u'LIQUOR LAWS', u'LOITERING', u'MISSING PERSON',
       u'NON-CRIMINAL', u'OTHER OFFENSES', u'PORNOGRAPHY/OBSCENE MAT',
       u'PROSTITUTION', u'RECOVERED VEHICLE', u'ROBBERY', u'RUNAWAY',
       u'SECONDARY CODES', u'SEX OFFENSES FORCIBLE',
       u'SEX OFFENSES NON FORCIBLE', u'STOLEN PROPERTY', u'SUICIDE',
       u'SUSPICIOUS OCC', u'TREA', u'TRESPASS', u'VANDALISM', u'VEHICLE THEFT',
       u'WARRANTS', u'WEAPON LAWS'],
      dtype='object')


In [11]:
# Feature matrix: the selected feature columns, ordered alphabetically by name.
X_all = processed_crime_data[feature_cols].sort_index(axis=1)
X_all.head()

Unnamed: 0,BAYVIEW,CENTRAL,Dark,Day,DayNumber,Hour,INGLESIDE,MISSION,Month,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,X,Y,Year
0,0,0,1,13,2,23,0,0,5,1,0,0,0,0,0,-122.425892,37.774599,2015
1,0,0,1,13,2,23,0,0,5,1,0,0,0,0,0,-122.425892,37.774599,2015
2,0,0,1,13,2,23,0,0,5,1,0,0,0,0,0,-122.424363,37.800414,2015
3,0,0,1,13,2,23,0,0,5,1,0,0,0,0,0,-122.426995,37.800873,2015
4,0,0,1,13,2,23,0,0,5,0,1,0,0,0,0,-122.438738,37.771541,2015


In [12]:
# Column order of the feature matrix.
X_all.columns

Index([u'BAYVIEW', u'CENTRAL', u'Dark', u'Day', u'DayNumber', u'Hour',
       u'INGLESIDE', u'MISSION', u'Month', u'NORTHERN', u'PARK', u'RICHMOND',
       u'SOUTHERN', u'TARAVAL', u'TENDERLOIN', u'X', u'Y', u'Year'],
      dtype='object')

In [13]:
# Target matrix: one indicator column per entry of target_cols.
y_all = processed_crime_data.loc[:, target_cols]
y_all.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Column order of the target matrix.
y_all.columns

Index([u'ARSON', u'ASSAULT', u'BAD CHECKS', u'BRIBERY', u'BURGLARY',
       u'DISORDERLY CONDUCT', u'DRIVING UNDER THE INFLUENCE', u'DRUG/NARCOTIC',
       u'DRUNKENNESS', u'EMBEZZLEMENT', u'EXTORTION', u'FAMILY OFFENSES',
       u'FORGERY/COUNTERFEITING', u'FRAUD', u'GAMBLING', u'KIDNAPPING',
       u'LARCENY/THEFT', u'LIQUOR LAWS', u'LOITERING', u'MISSING PERSON',
       u'NON-CRIMINAL', u'OTHER OFFENSES', u'PORNOGRAPHY/OBSCENE MAT',
       u'PROSTITUTION', u'RECOVERED VEHICLE', u'ROBBERY', u'RUNAWAY',
       u'SECONDARY CODES', u'SEX OFFENSES FORCIBLE',
       u'SEX OFFENSES NON FORCIBLE', u'STOLEN PROPERTY', u'SUICIDE',
       u'SUSPICIOUS OCC', u'TREA', u'TRESPASS', u'VANDALISM', u'VEHICLE THEFT',
       u'WARRANTS', u'WEAPON LAWS'],
      dtype='object')

In [15]:
from sklearn import metrics

# `sklearn.cross_validation` was removed in scikit-learn 0.20; use
# `model_selection` and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for evaluation. The split is seeded so that the
# scores reported further down can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=0)
print("Done split!")

print("Number of Training set: {}".format(len(X_train)))
print("Number of Testing set: {}".format(len(X_test)))

Done split!
Number of Training set: 614634
Number of Testing set: 263415


In [31]:
import datetime

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier

# Results recorded by the author for earlier runs of this cell. Every run used
# OneVsRestClassifier(GradientBoostingClassifier(..., learning_rate=1.0, random_state=0)).
#
#   n_estimators  max_depth  Scored    Time to fit                   Training Score     Testing Score
#   100           1          2.49829   0:25:35.935000 - Laptop       0.00750690654926   0.00744832298844
#   200           4          3.38053   3:12:49.177000 - Laptop       0.0317652456584    0.0277205170548
#   500           1          2.49270   1:23:42.299000 - AWS Server   0.0108731375095    0.0107852627982

a = datetime.datetime.now()  # wall-clock time before building and fitting

# One gradient-boosted model of depth-1 trees per target column.
boosted_stumps = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf = OneVsRestClassifier(boosted_stumps)
clf.fit(X_train, y_train)

b = datetime.datetime.now()  # wall-clock time after fitting
print(b - a)
print("Done fitting!")

3:12:49.177000
Done fitting!


In [32]:
# clf.score on the rows the classifier was fitted on ...
t_score = clf.score(X_train, y_train)
print("Training Score = {}".format(t_score))

# ... and on the held-out rows.
tt_score = clf.score(X_test, y_test)
print("Testing Score = {}".format(tt_score))

Training Score = 0.0317652456584
Testing Score = 0.0277205170548


In [33]:
# Load the crime test data, parsing the 'Dates' column as timestamps.
crime_test_data = pd.read_csv("test.csv", parse_dates=['Dates'])
print("Data read successfully!")

Data read successfully!


In [34]:
# Build the test feature matrix and force it onto exactly the training feature
# columns (X_all.columns), in the same order. This replaces a hard-coded list of
# weekday dummy columns to drop followed by an alphabetical sort: any column the
# model was not trained on is discarded, and a dummy column absent from the test
# data (e.g. a district with no rows) is added as all zeros instead of shifting
# the remaining columns.
processed_crime_test_data = (
    PreProcess(crime_test_data, True)
    .reindex(columns=X_all.columns, fill_value=0)
)

processed_crime_test_data.head()

Unnamed: 0,BAYVIEW,CENTRAL,Dark,Day,DayNumber,Hour,INGLESIDE,MISSION,Month,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,X,Y,Year
0,1,0,1,10,6,23,0,0,5,0,0,0,0,0,0,-122.399588,37.735051,2015
1,1,0,1,10,6,23,0,0,5,0,0,0,0,0,0,-122.391523,37.732432,2015
2,0,0,1,10,6,23,0,0,5,1,0,0,0,0,0,-122.426002,37.792212,2015
3,0,0,1,10,6,23,1,0,5,0,0,0,0,0,0,-122.437394,37.721412,2015
4,0,0,1,10,6,23,1,0,5,0,0,0,0,0,0,-122.437394,37.721412,2015


In [35]:
# Per-target probabilities for every row of the processed test data.
y_pred_class_test = clf.predict_proba(processed_crime_test_data)
print("Prediction done!")

Prediction done!


In [41]:
# Convert the predictions to a CSV file.
# One probability column per entry of target_cols (the column order of the
# targets the classifier was fitted on).
submission = pd.DataFrame(y_pred_class_test, columns=target_cols)
# Add the Id column as the first column, using the ids read from test.csv
# rather than the positional DataFrame index, so the rows stay correctly
# labelled even if the ids are not simply 0..n-1 in file order.
submission.insert(0, 'Id', crime_test_data['Id'].values, allow_duplicates=False)
submission.to_csv('Submission.csv', index=False, header=True)
print("Done with Dataframe Conversion to a Csv File!")


Done with Dataframe Conversion to a Csv File!


In [40]:
# Preview the first rows of the submission table.
submission.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.004095,0.111813,0.00021,0.002955,0.012415,0.00072,0.005805,0.008901,0.000164,...,0.0,0.007404,0.000162,0.033315,6.7547150000000005e-93,0.001977,0.057699,0.095133,0.012109,0.023149
1,1,0.003701,0.115177,9e-05,0.002955,0.006539,0.000741,0.009529,0.037163,0.000216,...,0.0,0.005407,0.000162,0.033803,6.7547150000000005e-93,0.003171,0.008378,0.096405,0.03558,0.01891
2,2,0.001172,0.042329,2e-06,0.000183,0.070425,0.001263,0.002322,0.01454,0.00284,...,2.753244e-08,0.003083,0.000234,0.010386,6.7547150000000005e-93,0.013822,0.300273,0.096765,0.007074,0.003857
3,3,0.001599,0.034558,1e-06,0.003624,0.004457,0.002552,0.001899,0.007237,0.003643,...,1.082931e-11,0.007339,0.000459,0.030618,6.7547150000000005e-93,0.003003,0.056661,0.057951,0.027385,0.036527
4,4,0.001599,0.034558,1e-06,0.003624,0.004457,0.002552,0.001899,0.007237,0.003643,...,1.082931e-11,0.007339,0.000459,0.030618,6.7547150000000005e-93,0.003003,0.056661,0.057951,0.027385,0.036527
