# Toronto Crime Data Cleaning

This notebook cleans the Toronto crime datasets. The `TorontoCrimeDataCleaner` class was written so that every crime type's data conforms to a single dataframe structure.

In [1]:
from cleaning_tools.dataframe_tools import TorontoCrimeDataCleaner
from pathlib import Path
import os

# Root of the project's data directory, given relative to the working directory
data_root = Path("../../Data")

dataset_folder = data_root / "datasets"  # raw input CSVs are read from here
export_folder = data_root / "cleaned_data" / "cleaned_data"  # passed as folder_name when exporting

In [2]:
# Map each crime type to the name of the CSV file (inside dataset_folder) holding its records
crime_csv_names = {
    "assault": "Assault_Open_Data.csv",
    "auto_theft": "Auto_Theft_Open_Data.csv",
    "break_and_enter": "Break_and_Enter_Open_Data.csv",
    "robbery": "Robbery_Open_Data.csv",
    "shooting": "Shooting_and_Firearm_Discharges_Open_Data.csv",
    "theft_from_motor_vehicle": "Theft_From_Motor_Vehicle_Open_Data.csv",
    "theft_over": "Theft_Over_Open_Data.csv",
    # "traffic_collision": "Traffic_Collisions_(ASR-T-TBL-001).csv",
    "bicycle_theft": "Bicycle_Thefts_Open_Data.csv",
    "homicide": "Homicides_Open_Data_ASR_RC_TBL_002.csv",
}

# Expand every file name into a path string; this is the dictionary given to the cleaner
crimes_dict = {
    crime: os.path.join(dataset_folder, csv_name)
    for crime, csv_name in crime_csv_names.items()
}

crimes_dict

{'assault': '../../Data/datasets/Assault_Open_Data.csv',
 'auto_theft': '../../Data/datasets/Auto_Theft_Open_Data.csv',
 'break_and_enter': '../../Data/datasets/Break_and_Enter_Open_Data.csv',
 'robbery': '../../Data/datasets/Robbery_Open_Data.csv',
 'shooting': '../../Data/datasets/Shooting_and_Firearm_Discharges_Open_Data.csv',
 'theft_from_motor_vehicle': '../../Data/datasets/Theft_From_Motor_Vehicle_Open_Data.csv',
 'theft_over': '../../Data/datasets/Theft_Over_Open_Data.csv',
 'bicycle_theft': '../../Data/datasets/Bicycle_Thefts_Open_Data.csv',
 'homicide': '../../Data/datasets/Homicides_Open_Data_ASR_RC_TBL_002.csv'}

In [3]:
# Build the cleaner for the 2015-2018 period; both year bounds are handed over
# as keyword arguments and are used when the data is filtered by year
min_year, max_year = 2015, 2018

toronto_data_cleaner = TorontoCrimeDataCleaner(
    min_year=min_year,
    max_year=max_year,
)

In [4]:
# Hand crimes_dict to the cleaner so it builds and normalizes one dataframe per crime type.
# The log printed by this call shows each path being checked for existence, followed by the
# load / clean / year-filter / column-rename steps for a dataset (the saved log is cut off
# after the first dataset; presumably the same steps run for every entry).

toronto_data_cleaner.csv_dict_to_dataframes(crimes_dict)

"../../Data/datasets/Assault_Open_Data.csv" exists
"../../Data/datasets/Auto_Theft_Open_Data.csv" exists
"../../Data/datasets/Break_and_Enter_Open_Data.csv" exists
"../../Data/datasets/Robbery_Open_Data.csv" exists
"../../Data/datasets/Shooting_and_Firearm_Discharges_Open_Data.csv" exists
"../../Data/datasets/Theft_From_Motor_Vehicle_Open_Data.csv" exists
"../../Data/datasets/Theft_Over_Open_Data.csv" exists
"../../Data/datasets/Bicycle_Thefts_Open_Data.csv" exists
"../../Data/datasets/Homicides_Open_Data_ASR_RC_TBL_002.csv" exists
--------------------------
Adding "assault" data from filepath "../../Data/datasets/Assault_Open_Data.csv" to new DataFrame
Successfully loaded CSV to Pandas Dataframe
Starting to clean the data...
Normalizing column data...
Converting appropriate date data to integers...
Starting to clear whitespaces from object columns...
Filtering years from 2015 to 2018...
Creating 'DATE' column out of date columns...
Reformatting column names...
Successfully cleaned dat

In [5]:
# Combine the per-crime dataframes held by the cleaner into one dataframe.
# NOTE(review): the merged frame also appears to be stored in the cleaner's df_dict under
# 'all_data_merged' (that key is listed further down) -- confirm in the class source.
all_data_cleaned_df = toronto_data_cleaner.merge_all_dataframes()

In [6]:
# Preview the first five rows to confirm the normalized column layout
all_data_cleaned_df.head(n=5)

Unnamed: 0,event_unique_id,crime,occ_year,occ_month,occ_day,occ_dow,occ_hour,premises_type,hood_140,neighbourhood_140,long_wgs84,lat_wgs84,date
0,GO-20155154,assault,2015,January,1,Thursday,23,House,115,Mount Dennis (115),-79.504668,43.693238,2015-01-01
1,GO-20151233,assault,2015,January,1,Thursday,3,Commercial,77,Waterfront Communities-The Island (77),-79.392855,43.647315,2015-01-01
2,GO-2015862,assault,2015,January,1,Thursday,2,Commercial,77,Waterfront Communities-The Island (77),-79.3877,43.649776,2015-01-01
3,GO-2015182,assault,2015,January,1,Thursday,0,Commercial,1,West Humber-Clairville (1),-79.595562,43.686751,2015-01-01
4,GO-2015276,assault,2015,January,1,Thursday,0,Outside,77,Waterfront Communities-The Island (77),-79.400096,43.645835,2015-01-01


In [7]:
# Count how often each event id occurs. Counts are sorted in descending order,
# so a top count of 1 means no event id is repeated in the merged data.
event_id_counts = all_data_cleaned_df["event_unique_id"].value_counts()
event_id_counts

GO-20155154       1
GO-2017323658     1
GO-2017297070     1
GO-2017303305     1
GO-2017304151     1
                 ..
GO-20181286110    1
GO-20181285543    1
GO-20181285120    1
GO-20181287249    1
GO-20182242344    1
Name: event_unique_id, Length: 166512, dtype: int64

In [8]:
# List the keys of every dataframe the cleaner currently holds (one per crime type plus
# the merged frame); these keys are the names accepted in df_list when exporting below
toronto_data_cleaner.df_dict.keys()

dict_keys(['assault', 'auto_theft', 'break_and_enter', 'robbery', 'shooting', 'theft_from_motor_vehicle', 'theft_over', 'bicycle_theft', 'homicide', 'all_data_merged'])

In [9]:
# Write only the merged dataframe to CSV via export_selective_cleaned();
# the export folder is converted to a plain string before being passed in
dataframes_to_export = ['all_data_merged']
toronto_data_cleaner.export_selective_cleaned(
    df_list=dataframes_to_export,
    folder_name=str(export_folder),
)

Successfully saved all_data_merged_cleaned_2015_2018.csv!


## Adding Data From 2019

In [10]:
# Run the same pipeline again, this time for 2019 only.
# A fresh cleaner is built with identical lower and upper year bounds,
# replacing the 2015-2018 cleaner bound to the same name.
min_year = max_year = 2019

toronto_data_cleaner = TorontoCrimeDataCleaner(
    min_year=min_year,
    max_year=max_year,
)

In [11]:
# Hand the same crimes_dict to the 2019 cleaner so it builds and normalizes one dataframe
# per crime type. The log printed by this call shows the year filter running "from 2019 to 2019"
# (the saved log is cut off after the first dataset).

toronto_data_cleaner.csv_dict_to_dataframes(crimes_dict)

"../../Data/datasets/Assault_Open_Data.csv" exists
"../../Data/datasets/Auto_Theft_Open_Data.csv" exists
"../../Data/datasets/Break_and_Enter_Open_Data.csv" exists
"../../Data/datasets/Robbery_Open_Data.csv" exists
"../../Data/datasets/Shooting_and_Firearm_Discharges_Open_Data.csv" exists
"../../Data/datasets/Theft_From_Motor_Vehicle_Open_Data.csv" exists
"../../Data/datasets/Theft_Over_Open_Data.csv" exists
"../../Data/datasets/Bicycle_Thefts_Open_Data.csv" exists
"../../Data/datasets/Homicides_Open_Data_ASR_RC_TBL_002.csv" exists
--------------------------
Adding "assault" data from filepath "../../Data/datasets/Assault_Open_Data.csv" to new DataFrame
Successfully loaded CSV to Pandas Dataframe
Starting to clean the data...
Normalizing column data...
Converting appropriate date data to integers...
Starting to clear whitespaces from object columns...
Filtering years from 2019 to 2019...
Creating 'DATE' column out of date columns...
Reformatting column names...
Successfully cleaned dat

In [12]:
# Combine the per-crime 2019 dataframes held by the cleaner into one dataframe.
# This rebinds all_data_cleaned_df, so the 2015-2018 frame is no longer reachable by that name.
all_data_cleaned_df = toronto_data_cleaner.merge_all_dataframes()

In [13]:
# Preview the first five rows of the 2019 data to confirm the column layout is unchanged
all_data_cleaned_df.head(n=5)

Unnamed: 0,event_unique_id,crime,occ_year,occ_month,occ_day,occ_dow,occ_hour,premises_type,hood_140,neighbourhood_140,long_wgs84,lat_wgs84,date
0,GO-20192231,assault,2019,January,1,Tuesday,10,Other,82,Niagara (82),-79.419818,43.632682,2019-01-01
1,GO-20192434,assault,2019,January,1,Tuesday,7,House,113,Weston (113),-79.525443,43.703001,2019-01-01
2,GO-20193767,assault,2019,January,1,Tuesday,16,Transit,67,Playter Estates-Danforth (67),-79.358099,43.676764,2019-01-01
3,GO-2019176,assault,2019,January,1,Tuesday,0,Outside,95,Annex (95),-79.394817,43.67038,2019-01-01
4,GO-2019521,assault,2019,January,1,Tuesday,1,Commercial,85,South Parkdale (85),-79.425333,43.63734,2019-01-01


In [14]:
# Count how often each event id occurs in the 2019 data. Counts are sorted in descending
# order, so a top count of 1 means no event id is repeated.
event_id_counts = all_data_cleaned_df["event_unique_id"].value_counts()
event_id_counts

GO-20192231       1
GO-2019703990     1
GO-2019694054     1
GO-2019693802     1
GO-2019694759     1
                 ..
GO-20192325551    1
GO-20192321250    1
GO-20192321319    1
GO-20192321311    1
GO-20192470898    1
Name: event_unique_id, Length: 47523, dtype: int64

In [15]:
# List the keys of every dataframe the 2019 cleaner currently holds (one per crime type plus
# the merged frame); these keys are the names accepted in df_list when exporting below
toronto_data_cleaner.df_dict.keys()

dict_keys(['assault', 'auto_theft', 'break_and_enter', 'robbery', 'shooting', 'theft_from_motor_vehicle', 'theft_over', 'bicycle_theft', 'homicide', 'all_data_merged'])

In [16]:
# Write only the merged 2019 dataframe to CSV via export_selective_cleaned();
# the export folder is converted to a plain string before being passed in
dataframes_to_export = ['all_data_merged']
toronto_data_cleaner.export_selective_cleaned(
    df_list=dataframes_to_export,
    folder_name=str(export_folder),
)

Successfully saved all_data_merged_cleaned_2019_2019.csv!
