In [1]:
import pandas as pd

#All Functions Used

def _contains_literal(series, text):
    #Literal (non-regex) substring test; missing values count as "no match"
    return series.str.contains(text, regex=False, na=False)


def combine(dataSet, combine):
    """Collapse detailed offense descriptions into a few crime labels.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Must provide the columns 'Offense', 'Forcible', 'Reported_As' and
        'Victim_Gender'. The frame passed in is NOT modified.
    combine : iterable of str
        Crime labels to keep. Every offense whose text contains a label is
        renamed to that label. Labels are applied in order, so an offense
        matching several labels ends up with the first one listed.
        (Inside this function the parameter hides the function's own name.)

    Returns
    -------
    pandas.DataFrame
        Only the rows whose offense was mapped to one of the labels and whose
        'Victim_Gender' is present. 'Offense' holds the label, 'Forcible' is
        'Y', and the original index values are preserved.
    """
    #Work on a copy so the caller's frame keeps its original Offense/Forcible
    #values (and so the assignments below never write into a slice)
    dataSet = dataSet.copy()

    #Sets all crimes as non forcible
    dataSet.loc[dataSet.Forcible == 'Y', 'Forcible'] = 'N'

    #If a crime has a negative indicator then remove
    #('N' is a placeholder that matches none of the labels)
    dataSet.loc[_contains_literal(dataSet.Offense, 'NO ASSAULT'), 'Offense'] = 'N'

    #Reported_As can differ from the offense; rape is included in sexual
    dataSet.loc[_contains_literal(dataSet.Reported_As, 'SEXUAL'), 'Offense'] = 'SEXUAL'
    dataSet.loc[_contains_literal(dataSet.Offense, 'RAPE'), 'Offense'] = 'SEXUAL'

    #Both homicide and manslaughter are included as murder
    for keyword in ('HOMICIDE', 'MANSLAUGHTER'):
        dataSet.loc[_contains_literal(dataSet.Offense, keyword), 'Offense'] = 'MURDER'

    #Combines all offenses with the same crime label as such
    #Also sets those crimes to forcible
    for label in combine:
        #"NO <label>" entries are negations and must not count as the label
        dataSet.loc[_contains_literal(dataSet.Offense, 'NO ' + label), 'Offense'] = 'N'
        dataSet.loc[_contains_literal(dataSet.Offense, label), 'Offense'] = label
        dataSet.loc[dataSet.Offense == label, 'Forcible'] = 'Y'

    #Removes all non forcible crimes and crimes without victims
    keep = (dataSet.Forcible == 'Y') & dataSet.Victim_Gender.notna()
    return dataSet[keep]

def breakdown(dataSet, crime_labels):
    """Print how many rows of dataSet carry each crime label.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Must provide an 'Offense' column.
    crime_labels : iterable of str
        Labels to count, printed in the given order.

    One line is printed per label: the label left-aligned in a column,
    followed by the number of rows whose 'Offense' equals it. The column is
    10 characters wide and grows when a label would not otherwise be
    separated from its count by at least one space.
    """
    labels = list(crime_labels)

    #A fixed width of 10 would glue long labels to their counts
    width = max([10] + [len(label) + 1 for label in labels])

    #Prints out each crime breakdown
    for label in labels:
        count = int((dataSet.Offense == label).sum())
        print(label.ljust(width) + str(count))


In [2]:
raw_data = pd.read_csv('Data/ch_crime.csv')

#Take out unnecessary variables
formated_data = raw_data[[
    'X', 'Y', 'Offense', 'Date_of_Occurrence', 'Forcible',
    'Weapon_Description', 'Victim_Gender', 'Reported_As', 'Victim_Age',
]]

#Keep only records that have both a victim gender and a victim age.
#copy() detaches the result from raw_data so the column added below is
#written to an independent frame rather than to a slice.
formated_data = formated_data.dropna(subset=['Victim_Gender', 'Victim_Age']).copy()

#Encode gender numerically: 1.0 for 'F', 0.0 for every other value.
#The first record of this data set is 'F', so this matches the encoding
#previously derived from the first row, without depending on row 0 existing.
formated_data['Gender'] = (formated_data['Victim_Gender'] == 'F').astype(float)

#Rename X and Y
formated_data = formated_data.rename(columns={'X': 'Longitude', 'Y': 'Latitude'})
formated_data

Unnamed: 0,Longitude,Latitude,Offense,Date_of_Occurrence,Forcible,Weapon_Description,Victim_Gender,Reported_As,Victim_Age,Gender
0,-79.032123,35.932317,B&E RESIDENCE-FORCE,2010/02/16 16:30:00+00,Y,,F,,29.0,1.0
3,-79.052878,35.914568,LARCENY- ALL OTHER,2010/02/17 02:22:00+00,N,,F,,22.0,1.0
6,-79.033691,35.928124,COMMUNICATING THREATS -INTIMIDATION NON PHYSI...,2010/02/17 08:10:00+00,N,,M,,44.0,0.0
11,-79.055184,35.961464,DOMESTIC DISTURBANCE/NO ASSAULT,2010/02/17 14:22:00+00,N,,F,,30.0,1.0
12,-79.020974,35.934196,B&E RESIDENCE-FORCE,2010/02/17 10:00:00+00,Y,,M,,62.0,0.0
...,...,...,...,...,...,...,...,...,...,...
81615,-79.028792,35.931816,HARASSMENT,2021/03/04 16:30:00+00,N,NONE,M,HARASSMENT/STAL,35.0,0.0
81620,-79.009354,35.939365,REQUEST INFO FOR 50B,2021/03/04 19:00:00+00,,,F,PUBLIC SERVICE,27.0,1.0
81621,-79.023598,35.917927,CIVIL DISPUTE,2021/03/04 14:12:00+00,,NONE,F,MENTAL DISORDER,86.0,1.0
81628,-79.061256,35.911716,B&E LARCENY F/VEHICLE,2021/03/05 08:15:00+00,,,M,THEFT/LARCENY,35.0,0.0


In [3]:
# split column and add new columns to formated_data

#Date_of_Occurrence looks like '2010/02/16 16:30:00+00'.
#n=1 splits on the first space only, so exactly two parts come back.
formated_data[['Date', 'Time']] = formated_data['Date_of_Occurrence'].str.split(' ', n=1, expand=True)

#Keep only the hour of day as a float: everything before the first ':'.
#This does not depend on how long the minutes/seconds/offset suffix is.
formated_data['Time'] = formated_data['Time'].str.split(':').str[0].astype(float)

In [4]:
#Combines like crimes and removes everything else

#Labels that were listed previously but are not currently used:
#'ROBBERY', 'MURDER', 'BURGLARY'
crime_labels = ['SEXUAL', 'ASSAULT', 'VANDALISM', 'B&E', 'LARCENY']

#combine() rewrites the Offense and Forcible columns of the frame it is given,
#so hand it a copy: formated_data keeps its original values and this cell
#produces the same result every time it is run.
combined_data = combine(formated_data.copy(), crime_labels)

breakdown(combined_data, crime_labels)

SEXUAL    472
ASSAULT   2182
VANDALISM 1631
B&E       5655
LARCENY   4717


In [5]:
#Order the records geographically: by longitude first, then latitude
combined_data = combined_data.sort_values(by=['Longitude', 'Latitude'])

#Keep only the columns still needed and replace the mixed index left over
#from filtering and sorting with a fresh 0..n-1 index
kept_columns = ['Longitude', 'Latitude', 'Offense', 'Time', 'Gender', 'Victim_Age']
cleanSet = combined_data[kept_columns].reset_index(drop=True)

cleanSet

Unnamed: 0,Longitude,Latitude,Offense,Time,Gender,Victim_Age
0,-79.185550,35.922087,B&E,14.0,0.0,63.0
1,-79.185550,35.922087,B&E,14.0,0.0,33.0
2,-79.185501,35.922094,VANDALISM,11.0,1.0,33.0
3,-79.148652,35.926747,LARCENY,13.0,0.0,26.0
4,-79.143900,35.926012,SEXUAL,15.0,1.0,56.0
...,...,...,...,...,...,...
14652,-78.974294,35.915845,B&E,11.0,1.0,32.0
14653,-78.962904,35.967732,LARCENY,0.0,0.0,61.0
14654,-78.958768,35.919429,LARCENY,20.0,0.0,33.0
14655,-78.935082,35.979530,B&E,3.0,1.0,19.0
