In [2]:
import os
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd

# Combining Data Files based on category

In [3]:
BASE_DIR = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, 'data')
crimes_outcomes_stopnsearch_dir = os.path.join(DATA_DIR, 'curated_crimes_outcomes')
use_of_force_dir = os.path.join(DATA_DIR, 'use_of_force')
curated_data_dir = os.path.join(DATA_DIR, 'curated_data')

if not os.path.exists(curated_data_dir):
    os.makedirs(curated_data_dir)

In [3]:
%%time

#* ====================================================================
#* === Data Aggregation for Course provided data ===
#* ====================================================================

stop_and_search_files, outcome_files, crime_files = [], [], []

# Collect all relevant CSV filenames
print("\nCurating all relevant CSV filenames...")
for root, dirs, files in tqdm(os.walk(crimes_outcomes_stopnsearch_dir)):
    for file in files:
        if file.endswith("stop-and-search.csv"):
            stop_and_search_files.append(os.path.join(root, file))
        elif file.endswith("outcomes.csv"):
            outcome_files.append(os.path.join(root, file))
        elif file.endswith("street.csv"):
            crime_files.append(os.path.join(root, file))
        else:
            print(f"Unknown file category: {file}")

# Print Statistics
print(f"\nNumber of Stop and Search files: {len(stop_and_search_files)}")
print(f"Number of Outcome files: {len(outcome_files)}")
print(f"Number of Crime files: {len(crime_files)}")

# Combine all files into one DataFrame
print("\nCombining files into single DataFrame(s) based on category...")

# print("\nCombining Stop and Search files...")
# stop_and_search_df = pd.concat((pd.read_csv(file) for file in tqdm(stop_and_search_files)), ignore_index=True)
# stop_and_search_df.to_csv(os.path.join(curated_data_dir, 'course_stop_and_search.csv'), index=False)
# print('Stop and Search DataFrame saved as CSV file')
# print("\nStop and Search DataFrame Info:")
# print(stop_and_search_df.info())

# del stop_and_search_df

print("\nCombining Crime files...")
crime_df = pd.concat((pd.read_csv(file) for file in tqdm(crime_files)), ignore_index=True)
crime_df = crime_df[crime_df['Falls within'] == 'Metropolitan Police Service']
crime_df.to_csv(os.path.join(curated_data_dir, 'course_crime.csv'), index=False)
print('Crime DataFrame saved as CSV file')
print("\nCrime DataFrame Info:")
print(crime_df.info())

del crime_df

print("\nCombining Outcome files...")
outcome_df = pd.concat((pd.read_csv(file) for file in tqdm(outcome_files)), ignore_index=True)
outcome_df = outcome_df[outcome_df['Falls within'] == 'Metropolitan Police Service']
outcome_df.to_csv(os.path.join(curated_data_dir, 'course_outcome.csv'), index=False)
print('Outcome DataFrame saved as CSV file')
print("\nOutcome DataFrame Info:")
print(outcome_df.info())

del outcome_df


Curating all relevant CSV filenames...


61it [00:00, 5174.91it/s]



Number of Stop and Search files: 2568
Number of Outcome files: 2561
Number of Crime files: 2656

Combining files into single DataFrame(s) based on category...

Combining Crime files...


100%|██████████| 2656/2656 [01:04<00:00, 41.29it/s]


Crime DataFrame saved as CSV file

Crime DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 5380493 entries, 229729 to 31907449
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime ID               object 
 1   Month                  object 
 2   Reported by            object 
 3   Falls within           object 
 4   Longitude              float64
 5   Latitude               float64
 6   Location               object 
 7   LSOA code              object 
 8   LSOA name              object 
 9   Crime type             object 
 10  Last outcome category  object 
 11  Context                float64
dtypes: float64(3), object(9)
memory usage: 533.6+ MB
None

Combining Outcome files...


100%|██████████| 2561/2561 [00:41<00:00, 61.13it/s]


Outcome DataFrame saved as CSV file

Outcome DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2759652 entries, 109482 to 21661168
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Crime ID      object 
 1   Month         object 
 2   Reported by   object 
 3   Falls within  object 
 4   Longitude     float64
 5   Latitude      float64
 6   Location      object 
 7   LSOA code     object 
 8   LSOA name     object 
 9   Outcome type  object 
dtypes: float64(2), object(8)
memory usage: 231.6+ MB
None
CPU times: user 2min 7s, sys: 17.2 s, total: 2min 25s
Wall time: 2min 28s


In [6]:
# Read the crimes and outcomes data and merge on 'Crime ID'
print("\nCombining Crime and Outcome files...")
crimes_df = pd.read_csv(os.path.join(curated_data_dir, 'course_crime.csv'))
crimes_df.drop('Last outcome category', axis=1)
outcomes_df = pd.read_csv(os.path.join(curated_data_dir, 'course_outcome.csv'))
outcomes_df = outcomes_df[['Crime ID', 'Outcome type']]
outcomes_df.rename(columns={'Outcome type': 'Last Outcome Category'})

crimes_outcomes_df = pd.merge(crimes_df, outcomes_df, on='Crime ID', how='left')
crimes_outcomes_df.drop_duplicates(inplace=True)
crimes_outcomes_df = crimes_outcomes_df[~crimes_outcomes_df['Crime ID'].isnull()]
crimes_outcomes_df = crimes_outcomes_df[~crimes_outcomes_df['Crime ID'].isna()]
crimes_outcomes_df.to_csv(os.path.join(curated_data_dir, 'course_crimes_outcomes.csv'), index=False)

print('Crimes and Outcomes DataFrame saved as CSV file')


Combining Crime and Outcome files...


In [None]:
# crimes_outcomes_df = pd.merge(crimes_df, outcomes_df, on='Crime ID')
# crimes_outcomes_df.to_csv(os.path.join(curated_data_dir, 'course_crimes_outcomes.csv'), index=False)

## Combining Use of Force Data Files

In [23]:
useful_columns = [
    "IncidentDate", "IncidentTime", "Incident Location: Street/Highway", "Incident Location: Public Transport", "Incident Location: Retail Premises", "Incident Location: Open ground (e.g. park, car park, field)", "Incident Location: Licensed Premises", "Incident Location: Sports or Event Stadia", "Incident Location: Hospital/A&E (non-mental-health setting)", "Incident Location: Mental Health Setting", "Incident Location: Police vehicle with prisoner handling cage", "Incident Location: Police vehicle without prisoner handling cage", "Incident Location: Dwelling", "Incident Location: Police station (excluding custody block)", "Incident Location: Custody Block", "Incident Location: Ambulance", "Incident Location: Other", "Borough", "PrimaryConduct", "AssaultedBySubject", "ThreatenedWithWeapon", "AssaultedWithWeapon", "Impact Factor: Possesion of a weapon", "Impact Factor: Alcohol", "Impact Factor: Drugs", "Impact Factor: Mental Health", "Impact Factor: Prior Knowledge", "Impact Factor: Size/Gender/Build", "Impact Factor: Acute Behavioural Disorder", "Impact Factor: Crowd", "Impact Factor: Other", "Reason for Force: Protect self", "Reason for Force: Protect Public", "Reason for Force: Protect Subject", "Reason for Force: Protect Other Officers", "Reason for Force: Prevent Offence", "Reason for Force: Secure Evidence", "Reason for Force: Effect Search", "Reason for Force: Effect Arrest", "Reason for Force: Method of Entry", "Reason for Force: Remove Handcuffs", "Reason for Force: Prevent Harm", "Reason for Force: Prevent Escape", "Reason for Force: Other", "MainDuty", "Firearms Aimed", "Firearms Fired", "SubjectAge", "SubjectGender", "SubjectEthnicity", "PhysicalDisability", "MentalDisability", "StaffInjured", "StaffInjuryIntentional", "StaffInjuryLevel", "StaffMedProvided", "SubjectInjured", "SubjectMedOffered", "SubjectMedProvided", "Outcome: Made off/escaped", "Outcome: Arrested", "Outcome: Hospitalised", "Outcome: Detained - Mental Health Act", "Outcome: Fatality", "Outcome: Other", 
]

In [26]:
# go through each of the use of force files, and append the relevant columns to a single dataframe
use_of_force_files = os.listdir(use_of_force_dir)

use_of_force_df = pd.DataFrame(columns=useful_columns)

for file in tqdm(use_of_force_files):
    df = pd.read_csv(os.path.join(use_of_force_dir, file))
    df = df[useful_columns]
    use_of_force_df = pd.concat([use_of_force_df, df], ignore_index=True)

use_of_force_df.to_csv(os.path.join(curated_data_dir, 'curated_use_of_force.csv'), index=False)

  df = pd.read_csv(os.path.join(use_of_force_dir, file))
  df = pd.read_csv(os.path.join(use_of_force_dir, file))
  df = pd.read_csv(os.path.join(use_of_force_dir, file))
100%|██████████| 3/3 [00:06<00:00,  2.05s/it]


# LEGACY CODE FROM HERE ON OUT

In [None]:
# #* ====================================================================
# #* === Data Aggregation for "London Police Data 2014-2017" ===
# #* ====================================================================

# # Data Source: https://www.kaggle.com/datasets/sohier/london-police-records

# london_police_data_dir = os.path.join(DATA_DIR, 'Kaggle_London_Police_Data_2014-2017')
# street_crime_df = pd.read_csv(os.path.join(london_police_data_dir, 'london-street.csv'))

# # Get columns from course provided data for consistency
# # course_street_crimes = pd.read_csv(os.path.join(curated_data_dir, 'crime.csv')).columns
# # course_outcomes = pd.read_csv(os.path.join(curated_data_dir, 'outcome.csv')).columns
# course_street_crimes = pd.read_csv(os.path.join(curated_data_dir, 'course_crime.csv')).columns
# course_outcomes = pd.read_csv(os.path.join(curated_data_dir, 'course_outcome.csv')).columns

# kaggle_2014_2017_street_crime_df = street_crime_df[course_street_crimes]

# kaggle_2014_2017_street_crime_df = street_crime_df[street_crime_df['Falls within'] == 'Metropolitan Police Service'].reset_index(drop=True)

# kaggle_2014_2017_street_crime_df.to_csv(os.path.join(curated_data_dir, 'kaggle_2014_2017_crime.csv'), index=False)

In [None]:
# #* ====================================================================
# #* === Data Aggregation for "UK Police Street Crime 2018-2021" ===
# #* ====================================================================

# # Crimes Data Source: https://www.kaggle.com/datasets/tantable/all-uk-police-street-crime-102018-to-092021
# # Outcomes Data Source(s):

# # Also need to filter out data that is not from London (Metropolitan Police Service)
# # No outcome data available for this dataset

# kaggle_2018_2021 = dd.read_csv(os.path.join(DATA_DIR, "UK_Police_Street_Crime_2018-10-01_to_2021_09_31.csv"))
# kaggle_2018_2021 = kaggle_2018_2021[kaggle_2018_2021['Falls within'] == 'Metropolitan Police Service'].compute()
# kaggle_2018_2021 = kaggle_2018_2021[~kaggle_2018_2021['Crime ID'].isna()]

In [None]:
# #* ====================================================================
# #* === Sanity Check ===
# #* ====================================================================

# # Go through each dataset, remove rows that are duplicates
# # print the columns for each dataset and ensure they are the same 

# # Course Provided Data
# course_crime = pd.read_csv(os.path.join(curated_data_dir, 'course_crime.csv'))

# print("\nCourse Provided Data:")
# print("Course Crime Columns:")
# print(course_crime.columns)

# # Kaggle 2014-2017 Data
# kaggle_2014_2017_crime = pd.read_csv(os.path.join(curated_data_dir, 'kaggle_2014_2017_crime.csv'))

# print("\nKaggle 2014-2017 Data:")
# print("Kaggle 2014-2017 Crime Columns:")
# print(kaggle_2014_2017_crime.columns)

# # Kaggle 2018-2021 Data
# kaggle_2018_2021 
# print("\nKaggle 2018-2021 Data:")
# print("Kaggle 2018-2021 Crime Columns:")
# print(kaggle_2018_2021.columns)

In [None]:
# #* ====================================================================
# #* === Final Aggregation - combining all street crimes and outcomes ===
# #* ====================================================================

# #! NOTE: No crimes data available from July 2017 to September 2018

# # minor cleaning
# kaggle_2018_2021 = kaggle_2018_2021[['Crime ID', 'Month', 'Reported by', 'Falls within',
#        'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name',
#        'Crime type', 'Last outcome category']]

# final_crimes = pd.concat([course_crime, kaggle_2014_2017_crime, kaggle_2018_2021], ignore_index=True)

# # remove duplicate rows
# final_crimes = final_crimes.drop_duplicates()

# # remove columns with no crime ID
# final_crimes = final_crimes[~final_crimes['Crime ID'].isna()]

In [None]:
# # Export
# final_crimes.to_csv(os.path.join(curated_data_dir, 'final_crimes.csv'), index=False)