In [1]:
import pandas
import numpy as np
import re

# Weather observations come from a single Excel workbook.
weather = pandas.read_excel('Baltimore_Weather_Data_3_Yr_Pnds_Adjst.xlsx')

# One CSV of calls per call type. The per-file frames stay bound to df1..df4;
# `crime` stacks them in that order. concat is called without ignore_index, so
# each file keeps its own row labels and index values repeat across files.
df1, df2, df3, df4 = map(pandas.read_csv, (
    'Calls_for_Disorderly.csv',
    'Calls_for_Common_Assault.csv',
    'Calls_for_Family_Disturb.csv',
    'Calls_for_Behavioral_Crisis.csv',
))
crime = pandas.concat([df1, df2, df3, df4])

In [2]:
# Preview the calls that have a non-missing 'location' value.
# NOTE(review): the filtered frame is only displayed, not assigned, so `crime`
# still contains the rows without a location (some of those rows carry otherwise
# usable data, which is why this step is optional).
crime[crime['location'].notnull()]

Unnamed: 0,callDateTime,priority,district,description,callNumber,incidentLocation,location
0,08/27/2015 12:31:00 PM,Medium,NE,DISORDERLY,P152391302,5000 LODESTONE WY,"5000 LODESTONE WY\nBALTIMORE, MD\n(39.321174, ..."
1,08/27/2015 12:43:00 PM,Medium,ND,DISORDERLY,P152391337,5100 YORK RD,"5100 YORK RD\nBALTIMORE, MD\n(39.350595, -76.6..."
2,08/27/2015 11:09:00 PM,Medium,SE,DISORDERLY,P152393662,2600 JEFFERSON ST,"2600 JEFFERSON ST\nBALTIMORE, MD\n(39.296826, ..."
3,09/01/2015 06:36:00 AM,Medium,ED,DISORDERLY,P152440510,1700 E OLIVER ST,"1700 E OLIVER ST\nBALTIMORE, MD\n(39.307481, -..."
4,07/16/2016 05:14:00 AM,Medium,CD,DISORDERLY,P161980556,1800 BLK N CALVERT ST,"1800 BLK N CALVERT ST\nBALTIMORE, MD\n(39.3100..."
6,08/27/2015 08:26:00 PM,Medium,NE,DISORDERLY,P152393030,1400 WALKER AV,"1400 WALKER AV\nBALTIMORE, MD\n(39.370561, -76..."
7,08/27/2015 11:10:00 PM,Medium,CD,DISORDERLY,P152393666,1500 PENNSYLVANIA AV,"1500 PENNSYLVANIA AV\nBALTIMORE, MD\n(39.30192..."
8,08/27/2015 12:32:00 PM,Medium,CD,DISORDERLY,P152391311,NB I 100 N AT W NORTH AV,"100 N AT W NORTH AV\nBALTIMORE, MD\n(39.311132..."
11,09/01/2015 12:26:00 AM,Medium,NW,DISORDERLY,P152440059,5500 HADDON AV,"5500 HADDON AV\nBALTIMORE, MD\n(39.337473, -76..."
12,08/27/2015 10:47:00 AM,Medium,CD,DISORDERLY,P152391003,200 N HOWARD ST,"200 N HOWARD ST\nBALTIMORE, MD\n(39.292176, -7..."


In [3]:
# Preprocessing: pull the coordinate pair out of the free-text 'location' field
# and store it in its own 'Coordinates' column. Rows whose location is missing or
# holds no coordinate pair get the sentinel 0, so incomplete data is easy to
# identify in this column.

# A parenthesised "(number, number)" pair. Anchoring on the parentheses keeps
# house/block numbers in the street address from being mistaken for coordinates.
_COORD_NUMBER = r'[+-]?(?:\d+\.?\d*|\.\d+)'
_COORD_PAIR = re.compile(
    r'\(\s*(' + _COORD_NUMBER + r')\s*,\s*(' + _COORD_NUMBER + r')\s*\)'
)


def _parse_coordinates(location):
    """Return the coordinate pair found in a location string.

    Parameters:
        location: one value from crime['location']; a string, or a
            non-string placeholder (e.g. NaN) when the field is missing.

    Returns:
        A two-element list of floats, in the order the numbers appear inside
        the parentheses, taken from the last "(a, b)" pair in the text; or the
        integer 0 when `location` is not a string or contains no such pair.
    """
    # isinstance keeps this cell re-runnable even if the name `type` has been
    # rebound to something else in the kernel.
    if not isinstance(location, str):
        return 0
    pairs = _COORD_PAIR.findall(location)
    if not pairs:
        return 0
    first, second = pairs[-1]
    return [float(first), float(second)]


trueLoc = [_parse_coordinates(locs) for locs in crime['location']]

# A plain list is assigned positionally, so repeated index labels are not a problem.
crime['Coordinates'] = trueLoc

In [4]:
# Show only the rows that have parsed coordinates. The 0 sentinel is written
# only to 'Coordinates', so test that column alone: checking every column would
# also hide rows where some other field is legitimately 0 (for example an
# integer-encoded description).
# NOTE(review): this is a display-only filter -- `crime` itself still holds the
# sentinel rows. Assign the result if they should really be dropped.
crime[crime['Coordinates'].map(lambda coords: coords != 0)]

Unnamed: 0,callDateTime,priority,district,description,callNumber,incidentLocation,location,Coordinates
0,08/27/2015 12:31:00 PM,Medium,NE,DISORDERLY,P152391302,5000 LODESTONE WY,"5000 LODESTONE WY\nBALTIMORE, MD\n(39.321174, ...","[39.321174, -76.555046]"
1,08/27/2015 12:43:00 PM,Medium,ND,DISORDERLY,P152391337,5100 YORK RD,"5100 YORK RD\nBALTIMORE, MD\n(39.350595, -76.6...","[39.350595, -76.609722]"
2,08/27/2015 11:09:00 PM,Medium,SE,DISORDERLY,P152393662,2600 JEFFERSON ST,"2600 JEFFERSON ST\nBALTIMORE, MD\n(39.296826, ...","[39.296826, -76.58032]"
3,09/01/2015 06:36:00 AM,Medium,ED,DISORDERLY,P152440510,1700 E OLIVER ST,"1700 E OLIVER ST\nBALTIMORE, MD\n(39.307481, -...","[39.307481, -76.594228]"
4,07/16/2016 05:14:00 AM,Medium,CD,DISORDERLY,P161980556,1800 BLK N CALVERT ST,"1800 BLK N CALVERT ST\nBALTIMORE, MD\n(39.3100...","[39.310009, -76.613727]"
6,08/27/2015 08:26:00 PM,Medium,NE,DISORDERLY,P152393030,1400 WALKER AV,"1400 WALKER AV\nBALTIMORE, MD\n(39.370561, -76...","[39.370561, -76.585971]"
7,08/27/2015 11:10:00 PM,Medium,CD,DISORDERLY,P152393666,1500 PENNSYLVANIA AV,"1500 PENNSYLVANIA AV\nBALTIMORE, MD\n(39.30192...","[39.301922, -76.632707]"
8,08/27/2015 12:32:00 PM,Medium,CD,DISORDERLY,P152391311,NB I 100 N AT W NORTH AV,"100 N AT W NORTH AV\nBALTIMORE, MD\n(39.311132...","[39.311132, -76.618077]"
11,09/01/2015 12:26:00 AM,Medium,NW,DISORDERLY,P152440059,5500 HADDON AV,"5500 HADDON AV\nBALTIMORE, MD\n(39.337473, -76...","[39.337473, -76.703891]"
12,08/27/2015 10:47:00 AM,Medium,CD,DISORDERLY,P152391003,200 N HOWARD ST,"200 N HOWARD ST\nBALTIMORE, MD\n(39.292176, -7...","[39.292176, -76.619664]"


In [5]:
# Some descriptions are the same name differing only by case; normalize them to
# upper case. The vectorised .str.upper() needs no loop variable (one named
# `type` would shadow the builtin for the rest of the kernel session) and passes
# missing values through unchanged instead of raising.
names = crime['description'].str.upper().tolist()
# Assigned as a plain list so values are matched by position, not by index label.
crime['description'] = names

In [13]:
# Encode the call types as integer category codes. Categories are ordered
# alphabetically, which for the four labels printed below gives:
#   BEHAVIOR CRISIS:0  COMMON ASSAULT:1  DISORDERLY:2  FAMILY DISTURB:3
# (pandas assigns code -1 to missing values.)
description_codes = crime["description"].astype('category').cat.codes

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(crime["description"].unique())
print(description_codes.unique())

# Overwrites the text labels with their codes.
crime["description"] = description_codes

['DISORDERLY' 'COMMON ASSAULT' 'FAMILY DISTURB' 'BEHAVIOR CRISIS']
[2 1 3 0]


In [108]:
# Plain-text dump of the weather frame. The parenthesised form prints the same
# text on Python 2 and is also valid on Python 3.
print(weather)

            Date  High_Temp  Avg_Temp  Low_Temp  High_Dew_Point  \
0       1/1/2015         43        30        17              20   
1       1/2/2015         46        36        26              22   
2       1/3/2015         39        33        26              39   
3       1/4/2015         62        51        39              54   
4       1/5/2015         47        36        24              29   
5       1/6/2015         28        21        14              21   
6       1/7/2015         24        18        11              14   
7       1/8/2015         22        14         6              -1   
8       1/9/2015         37        30        22              12   
9      1/10/2015         25        18        10               4   
10     1/11/2015         37        30        22              26   
11     1/12/2015         40        36        32              37   
12     1/13/2015         39        32        24              32   
13     1/14/2015         31        27        23              2