In [1]:
import pandas as pd

In [2]:
raw_crimes = pd.read_csv('../data/crimeSample.csv')
raw_crimes[:5]

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,9993618,HY183647,03/13/2015 11:56:00 PM,022XX W 21ST ST,051A,ASSAULT,AGGRAVATED: HANDGUN,STREET,False,False,...,25,31,04A,1161498,1889981,2015,03/20/2015 12:42:30 PM,41.853785,-87.682726,"(41.853785453, -87.682726178)"
1,9993559,HY183629,03/13/2015 11:56:00 PM,049XX N AUSTIN AVE,2022,NARCOTICS,POSS: COCAINE,ALLEY,True,False,...,45,11,18,1135233,1932723,2015,03/20/2015 12:42:30 PM,41.97158,-87.778114,"(41.971580323, -87.778114286)"
2,9993811,HY183931,03/13/2015 11:52:00 PM,002XX N KILBOURN AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,28,26,14,1146318,1900711,2015,03/20/2015 12:42:30 PM,41.883532,-87.73817,"(41.883531998, -87.738169853)"
3,9993675,HY183675,03/13/2015 11:50:00 PM,046XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,True,...,4,39,08B,1185621,1874728,2015,03/20/2015 12:42:30 PM,41.811395,-87.594668,"(41.81139462, -87.594668281)"
4,9993703,HY183626,03/13/2015 11:48:00 PM,030XX N KOLMAR AVE,1330,CRIMINAL TRESPASS,TO LAND,RESIDENTIAL YARD (FRONT/BACK),False,False,...,31,20,26,1145566,1919717,2015,03/20/2015 12:42:30 PM,41.935701,-87.740449,"(41.935700728, -87.740448872)"


In [3]:
import time

def get_date_from_datetime(datetime):
    return datetime.split()[0]

relevant_columns = ['Primary Type', 'Community Area', 'Date']
trimmed_crimes = raw_crimes.reindex(columns=relevant_columns)
#trimmed_crimes['Date'] = trimmed_crimes['Date'].apply(lambda datetime_string: get_date_from_datetime(datetime_string))
trimmed_crimes[:5]

Unnamed: 0,Primary Type,Community Area,Date
0,ASSAULT,31,03/13/2015 11:56:00 PM
1,NARCOTICS,11,03/13/2015 11:56:00 PM
2,CRIMINAL DAMAGE,26,03/13/2015 11:52:00 PM
3,BATTERY,39,03/13/2015 11:50:00 PM
4,CRIMINAL TRESPASS,20,03/13/2015 11:48:00 PM


In [4]:
import csv

def bin_from_csv(csv_name, series_to_bin):
    with open(csv_name, 'rb') as bin_file:
        unbinned_to_binned = {}
        reader = csv.reader(bin_file)
        for line in reader:
            unbinned_to_binned[line[0]] = line[1]
    
    return series_to_bin.map(lambda unbinned: unbinned_to_binned[unbinned])

def bin_crimes(series):
    return bin_from_csv('../config/crime_bins.csv', series)



In [5]:
trimmed_crimes['Primary Type'] = bin_crimes(raw_crimes['Primary Type'])
trimmed_crimes[:5]

Unnamed: 0,Primary Type,Community Area,Date
0,Violent,31,03/13/2015 11:56:00 PM
1,Petty,11,03/13/2015 11:56:00 PM
2,Severe,26,03/13/2015 11:52:00 PM
3,Violent,39,03/13/2015 11:50:00 PM
4,Minor,20,03/13/2015 11:48:00 PM


In [6]:
violent_crimes = trimmed_crimes[trimmed_crimes['Primary Type'] == "Violent"]
del violent_crimes['Primary Type']
violent_crimes[:5]

Unnamed: 0,Community Area,Date
0,31,03/13/2015 11:56:00 PM
3,39,03/13/2015 11:50:00 PM
7,1,03/13/2015 11:45:00 PM
8,66,03/13/2015 11:45:00 PM
9,53,03/13/2015 11:44:00 PM


In [7]:
def reindex_by_date(data_frame):
    data_frame.index = pd.to_datetime(data_frame['Date'])
    
reindex_by_date(violent_crimes)
del violent_crimes['Date']

violent_crimes[:5]

Unnamed: 0_level_0,Community Area
Date,Unnamed: 1_level_1
2015-03-13 23:56:00,31
2015-03-13 23:50:00,39
2015-03-13 23:45:00,1
2015-03-13 23:45:00,66
2015-03-13 23:44:00,53


In [8]:
violent_crimes['Crimes'] = 1

violent_crimes[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Community Area,Crimes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-03-13 23:56:00,31,1
2015-03-13 23:50:00,39,1
2015-03-13 23:45:00,1,1
2015-03-13 23:45:00,66,1
2015-03-13 23:44:00,53,1


In [9]:
grouped_violent = violent_crimes.groupby('Community Area')

violent_crimes_by_location = {}

for location, data_for_location in grouped_violent:
    violent_crimes_by_location[location] = data_for_location.resample('D', how='sum')
    del violent_crimes_by_location[location]['Community Area']
    
for location in violent_crimes_by_location:
    violent_crimes_by_location[location]['Crime?'] = violent_crimes_by_location[location]['Crimes'].map(lambda num_crimes: num_crimes > 0)
    
violent_crimes_by_location

{1:             Crimes Crime?
 Date                     
 2014-09-28       1   True
 2014-09-29       2   True
 2014-09-30       3   True
 2014-10-01       4   True
 2014-10-02       4   True
 2014-10-03       6   True
 2014-10-04       3   True
 2014-10-05       1   True
 2014-10-06       2   True
 2014-10-07       7   True
 2014-10-08       4   True
 2014-10-09       1   True
 2014-10-10       2   True
 2014-10-11       2   True
 2014-10-12       2   True
 2014-10-13       1   True
 2014-10-14       6   True
 2014-10-15       2   True
 2014-10-16       1   True
 2014-10-17       1   True
 2014-10-18     NaN  False
 2014-10-19       3   True
 2014-10-20       5   True
 2014-10-21       1   True
 2014-10-22       5   True
 2014-10-23       3   True
 2014-10-24       2   True
 2014-10-25       4   True
 2014-10-26       1   True
 2014-10-27       3   True
 ...            ...    ...
 2015-02-12       3   True
 2015-02-13       2   True
 2015-02-14       2   True
 2015-02-15       3   Tru

In [10]:
for location in violent_crimes_by_location:
    del violent_crimes_by_location[location]['Crimes']

violent_crimes_by_location

{1:            Crime?
 Date             
 2014-09-28   True
 2014-09-29   True
 2014-09-30   True
 2014-10-01   True
 2014-10-02   True
 2014-10-03   True
 2014-10-04   True
 2014-10-05   True
 2014-10-06   True
 2014-10-07   True
 2014-10-08   True
 2014-10-09   True
 2014-10-10   True
 2014-10-11   True
 2014-10-12   True
 2014-10-13   True
 2014-10-14   True
 2014-10-15   True
 2014-10-16   True
 2014-10-17   True
 2014-10-18  False
 2014-10-19   True
 2014-10-20   True
 2014-10-21   True
 2014-10-22   True
 2014-10-23   True
 2014-10-24   True
 2014-10-25   True
 2014-10-26   True
 2014-10-27   True
 ...           ...
 2015-02-12   True
 2015-02-13   True
 2015-02-14   True
 2015-02-15   True
 2015-02-16  False
 2015-02-17   True
 2015-02-18  False
 2015-02-19   True
 2015-02-20   True
 2015-02-21   True
 2015-02-22   True
 2015-02-23   True
 2015-02-24   True
 2015-02-25   True
 2015-02-26   True
 2015-02-27   True
 2015-02-28   True
 2015-03-01   True
 2015-03-02   True
 2015-03-

In [13]:
from __future__ import division

baseline = {}
for location in range(1,78):
    df = violent_crimes_by_location[location]
    total_days = len(df.index)
    days_with_crime = len(df[df['Crime?'] == True])
    baseline[str(location)] = days_with_crime / total_days
    
baseline

{'1': 0.9041916167664671,
 '10': 0.5,
 '11': 0.4036144578313253,
 '12': 0.1509433962264151,
 '13': 0.27710843373493976,
 '14': 0.8023952095808383,
 '15': 0.8023952095808383,
 '16': 0.8203592814371258,
 '17': 0.6181818181818182,
 '18': 0.3353658536585366,
 '19': 0.9281437125748503,
 '2': 0.8734939759036144,
 '20': 0.6646706586826348,
 '21': 0.7425149700598802,
 '22': 0.8862275449101796,
 '23': 1.0,
 '24': 0.9457831325301205,
 '25': 1.0,
 '26': 0.9580838323353293,
 '27': 0.9578313253012049,
 '28': 0.9760479041916168,
 '29': 1.0,
 '3': 0.8614457831325302,
 '30': 0.9221556886227545,
 '31': 0.7964071856287425,
 '32': 0.9401197604790419,
 '33': 0.6204819277108434,
 '34': 0.41916167664670656,
 '35': 0.844311377245509,
 '36': 0.32098765432098764,
 '37': 0.4939759036144578,
 '38': 0.9161676646706587,
 '39': 0.562874251497006,
 '4': 0.6144578313253012,
 '40': 0.8975903614457831,
 '41': 0.49700598802395207,
 '42': 0.9580838323353293,
 '43': 1.0,
 '44': 0.9880239520958084,
 '45': 0.676646706586826

In [14]:
writer = csv.writer(open('../config/new_baseline.csv', 'w'))

In [15]:
for key in baseline:
    writer.writerow([key, str(baseline[key])])