In [87]:
import pandas as pd
import geopandas as gpd
import requests, json
%matplotlib inline

AAnn = pd.read_csv('Austin_Annual_Crime_Dataset_2015.csv')
AUF2015= pd.read_csv("Austin_UF_R2R_2015.csv")

#When writing actual script, need to 'within line in open(...) and call directly from internet.

In [180]:
AAnn = pd.read_csv('Austin_Annual_Crime_Dataset_2015.csv')
AUF2015= pd.read_csv("Austin_UF_R2R_2015.csv")

#Renaming columns
AUF2015.rename(columns={' Primary Key': 'Key', ' Effect on Officer': ' OfficerEffects', 'Nature of Contact':'NatureOfContact', 'Officer Yrs of Service': 'OfficerYrsServ'}, inplace=True)
AAnn.rename(columns={'HighestNIBRS/UCROffenseDescription':'NIBRS', 'Council District': 'Council_District'}, inplace=True)

#Removing spaces in column names
AUF2015.columns = AUF2015.columns.str.replace('\s+','')
AAnn.columns = AAnn.columns.str.replace('\s+','')

#Dropping duplicates from AUF2015 set. Only AUF2015 has duplicates.
AUF2015 = AUF2015.drop_duplicates(subset='Key', keep='first', inplace = False)

#Merging datasets 
stack = pd.merge(AAnn, AUF2015, left_on='GOPrimaryKey', right_on='Key', how='outer')

#Removing hyphens from index names
stack.rename(columns={'X-Coordinate':'XCoord', 'Y-Coordinate':'YCoord'}, inplace = True)

#Boolean for identifying UF incidents
stack['UF'] = stack['AreaCommand'].notnull() | (stack['Key'].notnull() & stack['GOPrimaryKey'].notnull())

#Putting info that was present in both datasets into same columns (keys, council districts, and geo coordinates)
stack['Key'].fillna(stack['GOPrimaryKey'], inplace=True)
stack.drop(['GOPrimaryKey'], axis = 1, inplace = True)

stack['CouncilDistrict'].fillna(stack['Council_District'], inplace=True)
stack.drop(['Council_District'], axis = 1, inplace = True)

stack['XCoord'].fillna(stack['GOXCoordinate'], inplace=True)
stack.drop(['GOXCoordinate'], axis = 1, inplace = True)

stack['YCoord'].fillna(stack['GOYCoordinate'], inplace=True)
stack.drop(['GOYCoordinate'], axis = 1, inplace = True)

#Reordering index for ease of navigating dataset
stack = stack.reindex_axis(['Key','CouncilDistrict','UF','XCoord','YCoord','RIN',
                           'DateOccurred','TimeOccurred','ClearanceDate', 'GOReportDate',
                           'R2RLevel','NIBRS','AreaCommand',  'Location', 'GOCensusTract',
                           'GODistrict','GOLocation','GOLocationZip','OfficerEffects','OfficerCommissionDate',
                           'OfficerYrsServ', 'OfficerOrganizationDesc', 'ReasonDesc','SubjectConductDesc', 'SubjectEffects',
                           'SubjectEthnicity', 'SubjectRace', 'SubjectResistance', 'SubjectSex', 'NatureOfContact',
                           'GOHighestOffenseDesc', 'NumberShots', 'WeaponUsed1', 'WeaponUsed2', 'WeaponUsed3', 'WeaponUsed4',
                           'WeaponUsed5', 'ClearanceStatus','MasterSubjectID'], axis=1)


#Sorting by council district
stack.sort_values(('CouncilDistrict'), inplace = True)

#Resetting index to Key
stack.set_index(['Key'], drop = False, inplace = True)

#Saving to csv
#stack.to_csv('stack.csv')


In [None]:
#Scratch work/past work/notes from here on out:

In [179]:
print(AUF2015.keys())
print(AAnn.keys())
print(stack.keys())

Index(['RIN', 'Key', 'DateOccurred', 'TimeOccurred', 'Location', 'AreaCommand',
       'NatureofContact', 'ReasonDesc', 'R2RLevel', 'MasterSubjectID',
       'SubjectSex', 'SubjectRace', 'SubjectEthnicity', 'SubjectConductDesc',
       'SubjectResistance', 'WeaponUsed1', 'WeaponUsed2', 'WeaponUsed3',
       'WeaponUsed4', 'WeaponUsed5', 'NumberShots', 'SubjectEffects',
       'OfficerEffects', 'OfficerOrganizationDesc', 'OfficerCommissionDate',
       'OfficerYrsServ', 'X-Coordinate', 'Y-Coordinate', 'CouncilDistrict'],
      dtype='object')
Index(['GOPrimaryKey', 'Council_District', 'GOHighestOffenseDesc',
       'HighestNIBRS/UCROffenseDescription', 'GOReportDate', 'GOLocation',
       'ClearanceStatus', 'ClearanceDate', 'GODistrict', 'GOLocationZip',
       'GOCensusTract', 'GOXCoordinate', 'GOYCoordinate'],
      dtype='object')
Index(['Key', 'CouncilDistrict', 'UF', 'XCoord', 'YCoord', 'RIN',
       'DateOccurred', 'TimeOccurred', 'ClearanceDate', 'GOReportDate',
       'R2RLevel'

In [176]:
#Creating dictionary of common keys (incident identifiers) between datasets
AAnn.keys()

DupKey=set(AAnn['GOPrimaryKey']) & set(AUF2015['Key'])
KeyDict = dict.fromkeys(DupKey)
#print(KeyDict)
dupkey = KeyDict.keys()
dupkeylist = list(dupkey)
print(dupkeylist)
len(dupkeylist)

false=stack['Key'].isin(dupkeylist)
false.value_counts()

In [None]:
#Finding NaNs in datasets

#How many NaNs are in each column?
AAnn.isnull().sum()
stack.Key.isnull().sum()

#Create table of rows with NaNs
AAnn_nan_rows = AAnn[AAnn.isnull().T.any().T]
AUF2015_nan_rows = AUF2015[AUF2015.isnull().T.any().T]

#Where specifically are the NaNs? Create table of Boolean true/false. True = NaN
#False = not a NaN
AnnNaNLoc= AAnn.isnull()
AUF2015NanLoc= AUF2015.isnull()

stack_nan = stack[stack.isnull().T.any().T]

In [None]:
#Finding NaNs continued

AAnn.isnull().sum()

#AAnn
AAnn_vars = ['GOPrimaryKey', 'GOHighestOffenseDesc', 'GO', 'GOReportDate', 
             'GOLocation','GOLocationZip', 'GOCensusTract', 'GOXCoordinate', 'GOYCoordinate']

#AUF2015
AUF2015_vars = ['PrimaryKey', 'DateOccurred', 'TimeOccurred', 'Location', 'NatureOfContact', 
                'SubjectEffects', 'OfficerOrganizationDesc', 'OfficerCommissionDate', 'OfficerYrsofService', 
                'X-Coordinate', 'Y-Coordinate'] 

#Printing 

#To find out if the columns we care about have NaNs in them:

#AAnn
AAnn_nan_rows_GOPrimaryKey = AAnn[AAnn['GOPrimaryKey'].isnull()]
AAnn_nan_rows_GOHighestOffenseDesc = AAnn[AAnn['GOHighestOffenseDesc'].isnull()]
AAnn_nan_rows_HighNIBRS = AAnn[AAnn['HighestNIBRS/UCROffenseDescription'].isnull()]
AAnn_nan_rows_GOReportDate = AAnn[AAnn['GOReportDate'].isnull()]
AAnn_nan_rows_GOLocation = AAnn[AAnn['GOLocation'].isnull()]
AAnn_nan_rows_GOLocationZip = AAnn[AAnn['GOLocationZip'].isnull()]
AAnn_nan_rows_GOCensusTract = AAnn[AAnn['GOCensusTract'].isnull()]
AAnn_nan_rows_GOXCoordinate = AAnn[AAnn['GOXCoordinate'].isnull()]
AAnn_nan_rows_GOYCoordinate = AAnn[AAnn['GOYCoordinate'].isnull()]

#AUF2015
AUF2015_nan_rows_PrimaryKey = AUF2015[AUF2015['PrimaryKey'].isnull()]
AUF2015_nan_rows_DateOccurred = AUF2015[AUF2015['DateOccurred'].isnull()]
AUF2015_nan_rows_TimeOccurred = AUF2015[AUF2015['TimeOccurred'].isnull()]
AUF2015_nan_rows_Location = AUF2015[AUF2015['Location'].isnull()]
AUF2015_nan_rows_NatureofContact = AUF2015[AUF2015['NatureofContact'].isnull()]
AUF2015_nan_rows_SubjectEffects = AUF2015[AUF2015['SubjectEffects'].isnull()]
AUF2015_nan_rows_OfficerOrganizationDesc = AUF2015[AUF2015['OfficerOrganizationDesc'].isnull()]
AUF2015_nan_rows_OfficerCommissionDate = AUF2015[AUF2015['OfficerCommissionDate'].isnull()]
AUF2015_nan_rows_OfficerYrsofService = AUF2015[AUF2015['OfficerYrsofService'].isnull()]
AUF2015_nan_rows_XCoordinate = AUF2015[AUF2015['X-Coordinate'].isnull()]
AUF2015_nan_rows_YCoordinate = AUF2015[AUF2015['Y-Coordinate'].isnull()]

In [None]:
#Checking to make sure we deleted the correct columns

lenstack= len(stack)
print('lenstack', lenstack)

lenDupKey= len(DupKey)
print('lenDupKey',lenDupKey) #Is DupKey - which all of the other dup variables are based on - only a list of the unique doubles?
#As in not accounting for multiple doubles in AUF2015?

lenAAnn = len(AAnn)
print('lenAAnn',lenAAnn)

lenAUF2015 = len(AUF2015)
print('lenAUF2015',lenAUF2015)

lenAAnnAUF=lenAUF2015 + lenAAnn
print('lenAAnnAUF',lenAAnnAUF)

lenPreMerge_stack= lenAAnnAUF - lenstack
print('lenPreMerge_stack', lenAAnnAUF- lenstack) #(length of the AUF2015 + AAnn) - (length of stack after the merge - presumably
#after it's gotten rid of all duplicates except for the one.)

lenstackb= len(stackb)
print('lenstackb', lenstackb) #Same length as AUF2015 + AAnn. This is good, but may indicate that there are still duplicates
#in the stack dataste.

stackNonUnique = stack.Key.value_counts() # of non-unique values in Key:
#print('stackNonUnique', stackNonUnique)

lenstackNU= len(stackNonUnique)
print('lenstackNU', lenstackNU) #Why is this less than 

lenstack_lenstackNU = lenstack - lenstackNU
print('lenstack_lenstackNU', lenstack_lenstackNU)

stack['Key'].isin(dupkeylist)

In [None]:
#Old way to combine datasets
stackb = pd.concat([AAnn, AUF2015], axis=0)

In [29]:
#Just some info for us:

print('There are', len(set(AAnn['GOPrimaryKey'])), 'incidents in AAnn 2015 annual crime')
print('There are', len(set(AUF2015['PrimaryKey'])), 'incidents in AUF2015 use of force')
print('There are', len(KeyDict),'common incidents between AAnn and AUF2015')
PercentCommonFromUF= len(KeyDict)/(len(set(AUF2015['PrimaryKey'])))
print('This means', PercentCommonFromUF, '% of UF incidents are recorded in AAnn annual crime dataset.')

In [None]:
#Geo

APDdis=gpd.read_file("APD Districts.geojson")
AUF2015.head(5)

APDdis.head(2)
APDdis.district_name.unique()
#SMC= gpd.read_file("Single Member Council Districts.geojson")
SMC.plot()
#APDdis

Unnamed: 0,Key,CouncilDistrict,UF,XCoord,YCoord,RIN,DateOccurred,TimeOccurred,ClearanceDate,GOReportDate,...,NatureOfContact,GOHighestOffenseDesc,NumberShots,WeaponUsed1,WeaponUsed2,WeaponUsed3,WeaponUsed4,WeaponUsed5,ClearanceStatus,MasterSubjectID
3292,20153640000.0,9.0,False,3115522.0,10070515.0,170264.0,12/30/2015 12:00:00 AM,245.0,3-Feb-15,2-Feb-15,...,,THEFT,,WEAPONLESS (PRESSURE POINTS/KICKS/HAND),,,,,N,459357086: 20153640113
3296,20153650000.0,9.0,False,3115416.0,10070416.0,169873.0,12/31/2015 12:00:00 AM,,12-Feb-15,2-Feb-15,...,,THEFT,,CHEMICAL AGENT - O C SPRAY,,,,,N,211288398: 20153652230
3298,20153650000.0,1.0,False,3121574.0,10071826.0,170006.0,12/31/2015 12:00:00 AM,1550.0,16-Mar-15,2-Feb-15,...,,THEFT,,WEAPONLESS (PRESSURE POINTS/KICKS/HAND),,,,,N,458987405: 20153650995
3299,20153650000.0,3.0,False,3117764.0,10056344.0,170046.0,12/31/2015 12:00:00 AM,2337.0,11-Mar-15,2-Feb-15,...,,THEFT,,WEAPONLESS (PRESSURE POINTS/KICKS/HAND),,,,,N,459358256: 20153651957
3301,20153650000.0,8.0,False,3074396.0,10053463.0,170104.0,12/31/2015 12:00:00 AM,1353.0,28-Apr-15,2-Feb-15,...,,BREACH OF COMPUTER SECURITY,,WEAPONLESS (PRESSURE POINTS/KICKS/HAND),,,,,N,253427997: 20153650763


In [None]:
#Merging script with old stuff in it (e.g. concat)

#Renaming columns
AUF2015.rename(columns={' Primary Key': 'Key', ' Effect on Officer': ' OfficerEffects', 'Nature of Contact':'NatureOfContact', 'Officer Yrs of Service': 'OfficerYrsServ'}, inplace=True)
AAnn.rename(columns={'HighestNIBRS/UCROffenseDescription':'NIBRS', 'Council District': 'Council_District'}, inplace=True)

#Removing spaces in column names
AUF2015.columns = AUF2015.columns.str.replace('\s+','')
AAnn.columns = AAnn.columns.str.replace('\s+','')

#Dropping duplicates from AUF2015 set
AUF2015 = AUF2015.drop_duplicates(subset='Key', keep='first', inplace = False)

#List of AAnn keys for referencing 
#AAnnKeys = AAnn.keys().tolist()
#AAnnKeys2= [11, 4, 8, 18, 12, 9, 2, 1, 7, 10, 6, 13, 14]
    
stack = pd.merge(AAnn, AUF2015, left_on='GOPrimaryKey', right_on='Key', how='outer')

#Combining datasets
#stack = pd.merge([AAnn, AUF2015], a)

#Removing hyphens from index names
stack.rename(columns={'X-Coordinate':'XCoord', 'Y-Coordinate':'YCoord'}, inplace = True)

#Creating Boolean for UF incidents
stack['UF'] = stack['AreaCommand'].notnull() | (stack['Key'].notnull() & stack['GOPrimaryKey'].notnull())
#stack['AreaCommand'].notnull() | 

#Copying the info from the AAnn report into the row with the AUF2015 report. 
#for r in stack['Key'].isin(dupkeylist):
#    if r == True:
#        for x in AAnnKeys:
#            stack[x].fillna(AAnn[x], inplace = True)

#Merging key and council district info
stack['Key'].fillna(stack['GOPrimaryKey'], inplace=True)
stack.drop(['GOPrimaryKey'], axis = 1, inplace = True)

stack['CouncilDistrict'].fillna(stack['Council_District'], inplace=True)
stack.drop(['Council_District'], axis = 1, inplace = True)

stack['XCoord'].fillna(stack['GOXCoordinate'], inplace=True)
stack.drop(['GOXCoordinate'], axis = 1, inplace = True)

stack['YCoord'].fillna(stack['GOYCoordinate'], inplace=True)
stack.drop(['GOYCoordinate'], axis = 1, inplace = True)

#Dropping the crime reports that are present in both AAnn and AUF2015
stackTest = stack.drop_duplicates(subset='Key', keep='last', inplace = False)

#Reordering index for ease of navigating dataset
#stack = stack.reindex_axis(['Key','CouncilDistrict','UF','XCoord','YCoord','RIN',
                           # 'DateOccurred','TimeOccurred','ClearanceDate', 'GOReportDate',
                            #'R2RLevel','NIBRS','AreaCommand',  'Location', 'GOCensusTract',
                            #'GODistrict','GOLocation','GOLocationZip','OfficerEffects','OfficerCommissionDate',
                            #'OfficerYrsServ', 'OfficerOrganizationDesc', 'ReasonDesc','SubjectConductDesc', 'SubjectEffects',
                            #'SubjectEthnicity', 'SubjectRace', 'SubjectResistance', 'SubjectSex', 'NatureOfContact',
                            #'GOHighestOffenseDesc', 'NumberShots', 'WeaponUsed1', 'WeaponUsed2', 'WeaponUsed3', 'WeaponUsed4',
                            #'WeaponUsed5', 'ClearanceStatus','MasterSubjectID'], axis=1)


#Sorting by council district
#stack.sort_values(('CouncilDistrict'), inplace = True)

#Resetting index to Key
#stack.set_index(['Key'], drop = False, inplace = True)

#Saving to csv
#stack.to_csv('stack.csv')

#Saving to csv
#stack.to_csv('stackTestAgain.csv')