In [1]:
import os

# Per the RAPIDS documentation, the cudf.pandas extension has to be loaded
# before pandas is first imported (directly or through another library).
%load_ext cudf.pandas

import dask.dataframe as dd
import datashader
import pandas as pd
from datashader import transfer_functions as tf
from datashader.colors import Hot, viridis
from tqdm import tqdm

# Combining Data Files based on category

In [2]:
# All project paths are resolved relative to the directory the notebook runs in.
BASE_DIR = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Input tree of raw CSV files, and the output folder for the combined files.
crimes_outcomes_stopnsearch_dir = os.path.join(DATA_DIR, 'crimes_outcomes_stopnsearch')
curated_data_dir = os.path.join(DATA_DIR, 'curated_data')

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(curated_data_dir, exist_ok=True)

In [3]:
%%time

#* ====================================================================
#* === Data Aggregation and Curation ===
#* ====================================================================

def _combine_and_save(csv_paths, label, stem):
    """Concatenate the given CSV files and persist the result.

    Reads every path in ``csv_paths`` with ``pd.read_csv``, concatenates the
    frames with a fresh index, writes ``<stem>.csv`` and ``<stem>.parquet``
    into ``curated_data_dir``, and prints the combined frame's ``info()``.

    Parameters
    ----------
    csv_paths : list of str
        Paths of the CSV files that belong to one category.
    label : str
        Human-readable category name used in the progress messages.
    stem : str
        Base filename (without extension) of the two output files.

    Returns
    -------
    None
        The combined frame is local to this function, so it becomes
        collectable as soon as the call returns.
    """
    print(f"\nCombining {label} files...")

    # pd.concat raises ValueError on an empty sequence; report and skip instead.
    if not csv_paths:
        print(f"No {label} files found; nothing to combine.")
        return

    combined_df = pd.concat(
        (pd.read_csv(path) for path in tqdm(csv_paths)),
        ignore_index=True,
    )
    combined_df.to_csv(os.path.join(curated_data_dir, f'{stem}.csv'), index=False)
    combined_df.to_parquet(os.path.join(curated_data_dir, f'{stem}.parquet'), index=False)
    print(f'{label} DataFrame saved as CSV and Parquet files')

    print(f"\n{label} DataFrame Info:")
    # info() writes its report to stdout and returns None, so wrapping it in
    # print() would only append a stray "None" line.
    combined_df.info()


stop_and_search_files, outcome_files, crime_files = [], [], []

# Collect all relevant CSV filenames
print("\nCurating all relevant CSV filenames...")
for root, dirs, files in tqdm(os.walk(crimes_outcomes_stopnsearch_dir)):
    for file in files:
        if file.endswith("stop-and-search.csv"):
            stop_and_search_files.append(os.path.join(root, file))
        elif file.endswith("outcomes.csv"):
            outcome_files.append(os.path.join(root, file))
        elif file.endswith("street.csv"):
            crime_files.append(os.path.join(root, file))
        else:
            print(f"Unknown file category: {file}")

# os.walk yields entries in filesystem order; sorting makes the row order of
# the combined outputs reproducible across machines and re-runs.
for file_list in (stop_and_search_files, outcome_files, crime_files):
    file_list.sort()

# Print Statistics
print(f"\nNumber of Stop and Search files: {len(stop_and_search_files)}")
print(f"Number of Outcome files: {len(outcome_files)}")
print(f"Number of Crime files: {len(crime_files)}")

# Combine all files into one DataFrame per category
print("\nCombining files into single DataFrame(s) based on category...")

_combine_and_save(stop_and_search_files, 'Stop and Search', 'stop_and_search')
_combine_and_save(outcome_files, 'Outcome', 'outcome')
_combine_and_save(crime_files, 'Crime', 'crime')


Curating all relevant CSV filenames...


37it [00:00, 1608.85it/s]



Number of Stop and Search files: 1450
Number of Outcome files: 1510
Number of Crime files: 1575

Combining files into single DataFrame(s) based on category...

Combining Stop and Search files...


100%|██████████| 1450/1450 [00:15<00:00, 93.67it/s] 


Stop and Search DataFrame saved as CSV and Parquet files

Stop and Search DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493820 entries, 0 to 1493819
Data columns (total 15 columns):
 #   Column                                    Non-Null Count    Dtype  
---  ------                                    --------------    -----  
 0   Type                                      1493820 non-null  object 
 1   Date                                      1493820 non-null  object 
 2   Part of a policing operation              922495 non-null   object 
 3   Policing operation                        0 non-null        float64
 4   Latitude                                  1281002 non-null  float64
 5   Longitude                                 1281002 non-null  float64
 6   Gender                                    1430714 non-null  object 
 7   Age range                                 1291519 non-null  object 
 8   Self-defined ethnicity                    1398446 non-null  ob

100%|██████████| 1510/1510 [01:18<00:00, 19.14it/s]


Outcome DataFrame saved as CSV and Parquet files

Outcome DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14241608 entries, 0 to 14241607
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Crime ID      object 
 1   Month         object 
 2   Reported by   object 
 3   Falls within  object 
 4   Longitude     float64
 5   Latitude      float64
 6   Location      object 
 7   LSOA code     object 
 8   LSOA name     object 
 9   Outcome type  object 
dtypes: float64(2), object(8)
memory usage: 1.1+ GB
None

Combining Crime files...


100%|██████████| 1575/1575 [01:40<00:00, 15.60it/s]


Crime DataFrame saved as CSV and Parquet files

Crime DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18486818 entries, 0 to 18486817
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime ID               object 
 1   Month                  object 
 2   Reported by            object 
 3   Falls within           object 
 4   Longitude              float64
 5   Latitude               float64
 6   Location               object 
 7   LSOA code              object 
 8   LSOA name              object 
 9   Crime type             object 
 10  Last outcome category  object 
 11  Context                float64
dtypes: float64(3), object(9)
memory usage: 1.7+ GB
None
CPU times: user 9min 47s, sys: 53 s, total: 10min 40s
Wall time: 10min 47s
