In [None]:
# Import libraries
import numpy as np
import pandas as pd

In [None]:
# Read crime data
try:
    crime_data = pd.read_csv("train.csv", parse_dates=['Dates'])
except (IOError, OSError):
    # Only file-access failures get the friendly hint. Re-raise so the notebook
    # stops here instead of failing later with a NameError on crime_data;
    # any other error (e.g. a parse failure) propagates with its own traceback.
    print("Dataset could not be loaded. Is the dataset missing?")
    raise
else:
    print("Data read successfully!")

In [None]:
# Preview the first five rows of the raw training frame.
crime_data.head(n=5)

In [None]:
# Some records carry coordinates outside San Francisco; keep only the rows
# whose latitude (Y) is below 38.
crime_data = crime_data.loc[crime_data['Y'].lt(38)]

In [None]:
# Distinct crime categories, in order of first appearance in the data.
crimeCatagories = pd.unique(crime_data["Category"])
crimeCatagories

In [None]:
# Raw training columns that PreProcess drops once the derived features exist.
# Left in the frame on purpose: Category (the prediction target), DayOfWeek,
# PdDistrict, X (longitude) and Y (latitude).
columns_ = [
    'Dates',       # incident timestamp
    'Descript',    # detailed incident description (train.csv only)
    'Resolution',  # how the incident was resolved (train.csv only)
    'Address',     # approximate street address of the incident
]

In [None]:
from copy import deepcopy  # kept: the test-set address cell further down still calls deepcopy

# Per-address log-odds tables used as features by PreProcess:
#   logoddsPA[addr] -> log-odds of an incident happening at that address
#   logodds[addr]   -> Series (integer index 0..len(categories)-1) holding the
#                      log-odds of each category at that address, falling back
#                      to the data-set-wide log-odds for rare categories.
addresses=sorted(crime_data["Address"].unique())
categories=sorted(crime_data["Category"].unique())
C_counts=crime_data.groupby(["Category"]).size()
A_C_counts=crime_data.groupby(["Address","Category"]).size()
A_counts=crime_data.groupby(["Address"]).size()
logodds={}
logoddsPA={}
MIN_CAT_COUNTS=2
# Float denominator for both terms: an int/int division may floor to 0 under
# Python 2, which would turn the first log into -inf.
n_rows=float(len(crime_data))
default_logodds=np.log(C_counts/n_rows)-np.log(1.0-C_counts/n_rows)
for addr in addresses:
    addr_total=A_counts[addr]
    p_addr=addr_total/n_rows
    logoddsPA[addr]=np.log(p_addr)-np.log(1.-p_addr)
    addr_logodds=default_logodds.copy()
    # One MultiIndex lookup per address instead of one per (address, category).
    cat_counts=A_C_counts[addr]
    for cat, cat_count in zip(cat_counts.index, cat_counts.values):
        # Override the default only when the category is frequent enough here and
        # is not the only category at the address (that would give log(0)).
        if MIN_CAT_COUNTS < cat_count < addr_total:
            p_cat=cat_count/float(addr_total)
            # Assign by category label; default_logodds is indexed by category
            # name, so this does not rely on integer-position fallback.
            addr_logodds[cat]=np.log(p_cat)-np.log(1.0-p_cat)
    # Relabel positionally so PreProcess gets columns 0..len(categories)-1.
    addr_logodds.index=range(len(categories))
    logodds[addr]=addr_logodds

In [None]:
def get_season(x):
    """Return one-hot season flags ``(summer, fall, winter, spring)`` for a month number.

    Months are grouped as 5-7 summer, 8-10 fall, 11-12-1 winter and 2-4 spring.
    Callers pass calendar months 1..12; 0 is still accepted as winter for
    backward compatibility. Any other value yields all zeros.
    """
    summer = int(x in (5, 6, 7))
    fall = int(x in (8, 9, 10))
    # 12 was missing from the winter group, so December matched no season.
    winter = int(x in (11, 12, 0, 1))
    spring = int(x in (2, 3, 4))
    return summer, fall, winter, spring

In [None]:
def PreProcess(data,test_data = False):
    """Turn a raw crime frame into an all-numeric feature frame.

    Adds date-part, awake/season, per-address log-odds and intersection
    features, drops the raw columns that are not used as features, and
    one-hot encodes every remaining object-typed column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with a datetime-typed 'Dates' column and an 'Address'
        column. Every address must already be a key of the module-level
        ``logodds`` and ``logoddsPA`` dicts.
    test_data : bool, optional
        False (default): drop the columns listed in ``columns_``.
        True: drop 'Id', 'Dates' and 'Address' instead.

    Returns
    -------
    pandas.DataFrame
        Feature frame sharing the index of ``data``. The input frame itself
        is not modified.
    """
    # Work on a copy so the caller's frame is not mutated (and so adding
    # columns to a filtered slice does not trigger SettingWithCopyWarning).
    data = data.copy()

    # Convert Dates column to Year, Month, Day, Hour individual columns
    dates = data['Dates']
    data['Year'] = dates.dt.year
    data['Month'] = dates.dt.month
    data['Day'] = dates.dt.day
    data['Hour'] = dates.dt.hour

    # Awake = midnight hour or any hour from 8 to 23 inclusive
    hour = data['Hour']
    data["Awake"] = ((hour == 0) | ((hour >= 8) & (hour <= 23))).astype(int)
    data["Summer"], data["Fall"], data["Winter"], data["Spring"]=zip(*data["Month"].apply(get_season))

    # Creating address features: one column per entry of the address' log-odds Series
    address_features=data["Address"].apply(lambda x: logodds[x])
    address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

    data = pd.concat([data, address_features], axis=1)

    # Column name (including its spelling) is kept as-is: it is part of the feature set.
    data["IsInterection"]=data["Address"].apply(lambda x: 1 if "/" in x else 0)
    data["logoddsPA"]=data["Address"].apply(lambda x: logoddsPA[x])

    # Drop unnecessary columns (explicit axis keyword; the positional form is deprecated)
    dropped = ['Id','Dates','Address'] if test_data else columns_
    data = data.drop(dropped, axis=1)

    # Collect every column, expanding non-numeric ones into dummy variables
    # (e.g. 'PdDistrict'), then assemble the output in a single concat
    # instead of re-joining the growing frame once per column.
    pieces = []
    for position in range(data.shape[1]):
        col_data = data.iloc[:, position]
        if (col_data.dtype == object):
            col_data = pd.get_dummies(col_data)
        pieces.append(col_data)

    if not pieces:
        return pd.DataFrame(index=data.index)

    outX = pd.concat(pieces, axis=1)
    return outX

In [None]:
# Build the training feature frame (training-mode column drop) and preview it.
processed_crime_data = PreProcess(crime_data, test_data=False)
processed_crime_data.head(n=5)

In [None]:
# Targets are the one-hot crime-category columns; every other column is a feature.
target_cols = crimeCatagories
feature_cols = processed_crime_data.columns.difference(pd.Index(target_cols))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column in place; the fitted scaler stays
# available as `stdsclr` for later cells.
stdsclr = StandardScaler()
stdsclr.fit(processed_crime_data[feature_cols])
processed_crime_data[feature_cols] = stdsclr.transform(processed_crime_data[feature_cols])

In [None]:
# Feature matrix and one-hot target matrix, each with columns sorted by name.
X_all = processed_crime_data[feature_cols].sort_index(axis=1)
y_all = processed_crime_data[target_cols].sort_index(axis=1)

In [None]:
# Preview the first five rows of the scaled feature matrix.
X_all.head(n=5)

In [None]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split
from sklearn import metrics

# Hold out 30% of the rows; the fixed seed makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=0)
print("Done split!")

print("Number of Training set: {}".format(len(X_train)))
print("Number of Testing set: {}".format(len(X_test)))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier

import datetime
# Wall-clock timing of the fit: a = start, b = end.
a = datetime.datetime.now()
# One gradient-boosting model per target column (one-vs-rest).
clf = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=50, learning_rate=1.0,max_depth=4, random_state=0))

clf.fit(X_train, y_train)
b = datetime.datetime.now()
# Parenthesised print works on both Python 2 and Python 3.
print(b - a)
print("Done fitting!")

In [None]:
# Read crime test data, parsing the 'Dates' column as timestamps
crime_test_data = pd.read_csv("test.csv", parse_dates=['Dates'])
# Parenthesised print works on both Python 2 and Python 3.
print("Data read successfully!")

In [None]:
# Extend the address tables so every test-set address has an entry.
new_addresses=sorted(crime_test_data["Address"].unique())
new_A_counts=crime_test_data.groupby("Address").size()

known_addresses=set(addresses)
test_addresses=set(new_addresses)
only_new=test_addresses-known_addresses
only_old=known_addresses-test_addresses
in_both=test_addresses&known_addresses

# Address frequencies are taken over the train and test rows together.
combined_rows=float(len(crime_test_data)+len(crime_data))

# Addresses seen only in the test set: frequency from the test counts,
# category log-odds from the data-set-wide defaults.
for addr in only_new:
    PA=new_A_counts[addr]/combined_rows
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)
    fallback=default_logodds.copy()
    fallback.index=range(len(categories))
    logodds[addr]=fallback

# Addresses seen in both sets: refresh the frequency with the pooled counts.
for addr in in_both:
    PA=(A_counts[addr]+new_A_counts[addr])/combined_rows
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)

In [None]:
# Build the test feature frame (test-mode column drop: 'Id', 'Dates', 'Address').
processed_crime_test_data = PreProcess(crime_test_data, test_data=True)

In [None]:
# Preview the first five rows of the test feature frame.
processed_crime_test_data.head(n=5)

In [None]:
# Scale the test features with the scaler already fitted on the training
# features. Re-fitting on the test set would standardise it with different
# means/variances than the ones the classifier was trained on.
feature_cols = list(feature_cols)  # same column order the scaler was fitted with
assert set(feature_cols) == set(processed_crime_test_data.columns), \
    "train and test feature columns differ"
processed_crime_test_data[feature_cols] = stdsclr.transform(processed_crime_test_data[feature_cols])

In [None]:
# Order the test columns by name, then preview the first five rows.
processed_crime_test_data = processed_crime_test_data.sort_index(axis=1)
processed_crime_test_data.head(n=5)

In [None]:
# Per-category probabilities for every test row.
y_pred_class_test = clf.predict_proba(processed_crime_test_data)
# Parenthesised print works on both Python 2 and Python 3.
print("Prediction done!")

In [None]:
# Spot-check: probabilities of the category at column 6 for test rows 100..124.
# The previous form indexed row 6 and then sliced 100:125 along the category
# axis, which is always empty because there are far fewer than 100 categories.
# NOTE(review): the intended slice is inferred; confirm this is the view wanted.
print(y_pred_class_test[100:125, 6])

In [None]:
# convert to CSV
# The classifier was fitted on y_train, whose columns come from y_all (sorted
# by name), so the probability columns follow y_all's column order. Labelling
# them with target_cols (first-appearance order) would attach the wrong
# category name to each column.
submission = pd.DataFrame(y_pred_class_test, columns=list(y_all.columns))
submission = submission.sort_index(axis=1)
# Add the Id column as the first column, taken from the test file's own 'Id'
# values rather than assuming they equal the row index.
submission.insert(0, 'Id', crime_test_data['Id'].values, allow_duplicates=False)

In [None]:
# Raw string: the backslashes are literal path separators, not escape sequences.
# NOTE(review): this is an absolute, machine-specific Windows path.
submission.to_csv(r'C:\Python\Submission.csv', index=False, header=True)
# Parenthesised print works on both Python 2 and Python 3.
print("Done with Dataframe Conversion to a Csv File!")