In [None]:
import pandas as pd
import re
import requests
from tqdm.autonotebook import tqdm
import json
from pathlib import Path
import pickle
import os
import shutil

To run this notebook, go to https://data.police.uk/data/archive/ and download the following archives
(or any other combination of archives that together covers every month from 2016 to 2023):
- December 2018
- December 2021
- April 2024

Turn each archive into a regular folder: if it is a zip, extract it, move the extracted folder to your Downloads directory, and delete the zip if you no longer need it.
Once this notebook has finished running and you have all the needed files, you can delete those dated folders, since their contents will have been moved out and they will be empty.

Also download the PAS_borough file from https://data.london.gov.uk/dataset/mopac-surveys and, after running this notebook, put it in the folder that holds the rest of your data.

### Downloading the data

- In `dates`, provide the names of the downloaded folders if they differ from the defaults. The main point is to cover as many months as possible with the lowest number of downloads.

In [None]:
# Destination folder for the project data.
# Several later cells build file paths as `PATH + name`, so keep the trailing slash.
PATH = '/Users/ansat.omurzakov/Desktop/DBLchik/' # change this to the folder where the downloaded files should be stored

In [None]:
# Names of the extracted archive folders (one entry per downloaded archive)
dates = ['2018-12', '2021-12', '2024-04']

# Full paths of those folders; all of them are expected to sit in the same parent directory
original_folders = [f'/Users/ansat.omurzakov/Downloads/{date}' for date in dates]

# Create the destination folder if it doesn't exist yet
os.makedirs(PATH, exist_ok=True)

# Function to generate a unique destination path
def get_unique_path(dst_folder, name):
    """Return a path inside ``dst_folder`` for ``name`` that does not exist yet.

    The plain ``dst_folder/name`` is returned when it is free. Otherwise a numeric
    suffix is inserted before the extension (``name_1.ext``, ``name_2.ext``, ...)
    and the first candidate that does not exist on disk is returned.
    """
    stem, ext = os.path.splitext(name)
    candidate = os.path.join(dst_folder, name)
    suffix = 0

    while os.path.exists(candidate):
        suffix += 1
        candidate = os.path.join(dst_folder, f"{stem}_{suffix}{ext}")

    return candidate

# Move everything found inside each downloaded archive folder into PATH
for source_dir in original_folders:
    for entry in os.listdir(source_dir):
        # get_unique_path picks a suffixed name (_1, _2, ...) when PATH already
        # holds an entry called `entry`, so nothing is overwritten
        target = get_unique_path(PATH, entry)
        shutil.move(os.path.join(source_dir, entry), target)

print(f"All contents moved to {PATH}")

In [None]:
# Deleting copies of monthly folders.
# When the same month is present in more than one archive, get_unique_path stores the
# later copies as "<YYYY-MM>_1", "<YYYY-MM>_2", ... - so every numeric suffix marks a
# duplicate, not only "_1". Only directories are removed, because shutil.rmtree raises
# on a plain file.
for file in os.listdir(PATH):
    duplicate_dir = os.path.join(PATH, file)
    if re.fullmatch(r'\d{4}-\d{2}_\d+', file) and os.path.isdir(duplicate_dir):
        shutil.rmtree(duplicate_dir)

In [None]:
def delete_files(PATH):
    """Delete every file that is not Metropolitan street or stop-and-search data.

    Visits each sub-directory of ``PATH`` whose name starts with ``YYYY-MM`` and removes
    the files whose names contain neither ``metropolitan-street`` nor
    ``metropolitan-stop-and-search``.

    Parameters
    ----------
    PATH : str
        Folder that holds the monthly directories.

    Returns
    -------
    str
        The literal ``'Done!'`` once every monthly directory has been processed.
    """
    # Build the list of monthly directories with a filter. Calling .remove() on the list
    # while iterating over it skips the element that follows each removal, so stray
    # entries (e.g. two non-date names in a row) could survive and break os.listdir below.
    dates_list = sorted(
        entry
        for entry in os.listdir(PATH)
        if re.match(r'\d{4}-\d{2}', entry) and os.path.isdir(os.path.join(PATH, entry))
    )

    # deleting irrelevant files from directories
    for date in tqdm(dates_list, total = len(dates_list)):
        month_dir = os.path.join(PATH, date)
        for file in os.listdir(month_dir):
            if not re.search(r'(metropolitan-street)|(metropolitan-stop-and-search)', file):
                os.remove(os.path.join(month_dir, file))
    return 'Done!'

def save_data(category, PATH):
    """Merge the monthly CSV files listed in ``category`` into one de-duplicated CSV.

    Each file name is expected to start with its ``YYYY-MM`` folder name, because the
    file is read from ``PATH/<first 7 characters of the name>/<name>``. The combined
    frame is written to ``PATH`` under the part of the first file name that follows the
    date prefix (e.g. ``2018-12-metropolitan-street.csv`` -> ``metropolitan-street.csv``).

    Parameters
    ----------
    category : list of str
        File names to merge.
    PATH : str
        Folder that holds the monthly directories and receives the combined CSV.

    Returns
    -------
    None or str
        ``None`` after the CSV has been written; a message string when ``category``
        is empty (nothing is written in that case).
    """
    if len(category) == 0:
        return f'List {category} is empty'

    # Read everything first and concatenate once: growing a DataFrame with concat
    # (and re-running drop_duplicates) inside the loop re-copies all rows on every step.
    frames = [
        pd.read_csv(os.path.join(PATH, f[:7], f))
        for f in tqdm(category, total = len(category))
    ]
    all_data = pd.concat(frames, ignore_index=True).drop_duplicates(keep = 'first')

    output_name = re.split(r'(\d{2}-)', category[0])[-1]
    all_data.to_csv(os.path.join(PATH, output_name), index=False)

def get_file_list(department, type, PATH):
    """Return the names of all files under ``PATH`` containing ``'<department>-<type>'``.

    The whole directory tree below ``PATH`` is walked; only bare file names
    (without their directory) are returned, in the order ``os.walk`` yields them.
    """
    needle = f'{department}-{type}'
    return [
        filename
        for _root, _dirs, files in os.walk(PATH, topdown = True)
        for filename in files
        if needle in filename
    ]

## Deleting irrelevant files from directories (neighborhood, data)

In [None]:
# Strip every file that is not Metropolitan street / stop-and-search data from the monthly folders
delete_files(PATH)

## Getting file names from which to retrieve csvs

In [None]:
# Metropolitan files: collect the names of the monthly CSVs found anywhere under PATH,
# one list for street-level data and one for stop-and-search data
metropolitan_street = get_file_list('metropolitan','street', PATH)
metropolitan_sas= get_file_list('metropolitan','stop-and-search', PATH)

## Concatenating files per category and saving them into one csv file (this takes a while because a lot of files are being loaded)

In [None]:
# Metropolitan Data
# save_data returns None after writing the combined CSV and a message string when the
# file list is empty, so success is only reported when a file was actually written.
for output_name, file_names in (
    ('Metropolitan-street.csv', metropolitan_street),
    ('Metropolitan-stop-and-search.csv', metropolitan_sas),
):
    status = save_data(file_names, PATH)
    if status is None:
        print(f'[INFO] Combining datasets to {output_name} is done')
    else:
        print(f'[WARNING] {output_name} was not created: {status}')

In [None]:
# Remove the non-concatenated monthly folders.
# Only directories whose name starts with YYYY-MM are deleted: the pattern is anchored at
# the start of the name, and plain files are skipped because shutil.rmtree raises on them.
for file in os.listdir(PATH):
    monthly_dir = os.path.join(PATH, file)
    if re.match(r'\d{4}-\d{2}', file) and os.path.isdir(monthly_dir):
        shutil.rmtree(monthly_dir)

# Getting all neighbourhoods

In [None]:
locate_neighbourhood_link = 'https://data.police.uk/api/metropolitan/neighbourhoods'

# requests has no default timeout, so set one to keep the cell from hanging forever;
# raise_for_status surfaces HTTP errors instead of failing later while decoding the body.
neighbourhoods_response = requests.get(locate_neighbourhood_link, timeout=30)
neighbourhoods_response.raise_for_status()
met_neighbourhoods = neighbourhoods_response.json()

In [None]:
# Keep only the `id` field of every neighbourhood record returned by the API
neighbourhoods = [neighbourhood['id'] for neighbourhood in met_neighbourhoods]

### Classify neighbourhoods to boroughs

In [None]:
def _lookup_borough(area_code):
    """Return the ``name`` attribute of entry 6 in findthatpostcode's ``included`` list.

    NOTE(review): index 6 is presumably the borough record for the area - confirm
    against the findthatpostcode API response.

    Raises whatever the request, the JSON decoding or the indexing raises.
    """
    response = requests.get(f'https://findthatpostcode.uk/areas/{area_code}.json', timeout=30)
    response.raise_for_status()
    return response.json()['included'][6]['attributes']['name']


# Maps the looked-up name -> list of neighbourhood ids that resolved to it
dic = {}
for value in tqdm(neighbourhoods, total = len(neighbourhoods)):
    try:
        name = _lookup_borough(value)
    except Exception:
        # Second attempt with the last character of the id dropped (presumably some ids
        # carry a one-character suffix the lookup service does not know - TODO confirm).
        # A failure of this retry is not caught and stops the loop.
        name = _lookup_borough(value[:-1])
    dic.setdefault(name, []).append(value)
print('Done')

In [None]:
# Persist the name -> neighbourhood-ids mapping as JSON inside the project folder
with open(PATH + 'boroughs_neighbourhoods.json', 'w') as file:
    json.dump(dic, file)

In [None]:
# Read the borough -> area-codes mapping back from disk
with open(PATH + 'boroughs_neighbourhoods.json', 'r') as file:
    json_data = json.load(file)

# Flatten the mapping into one record per (borough, area code) pair
boroughs_data = [
    {'Borough': borough_name, 'Area Code': code}
    for borough_name, codes in json_data.items()
    for code in codes
]

# Two-column frame: 'Borough' and 'Area Code'
boroughs_neighbours = pd.DataFrame(boroughs_data)
boroughs_neighbours

In [None]:
# borough -> list of {neighbourhood id: [(latitude, longitude), ...]} dictionaries
b_n_b = {}

boroughs = boroughs_neighbours['Borough'].unique()

for borough in tqdm(boroughs, total=len(boroughs)):
    borough_boundaries = b_n_b.setdefault(borough, [])
    area_codes = boroughs_neighbours.loc[boroughs_neighbours['Borough'] == borough, 'Area Code'].unique()
    for neighbourhood in area_codes:
        # requests has no default timeout; raise_for_status turns an HTTP error into an
        # exception here instead of an obscure failure while reading the JSON body
        response = requests.get(f'https://data.police.uk/api/metropolitan/{neighbourhood}/boundary', timeout=30)
        response.raise_for_status()
        boundary_points = [(item['latitude'], item['longitude']) for item in response.json()]
        borough_boundaries.append({neighbourhood: boundary_points})
    print(f'[INFO] Added neighbourhoods and their boundaries for {borough}')

In [None]:
# Specify the file path. os.path.join inserts exactly one separator, whether or not PATH
# ends with a slash (plain `PATH + "/..."` produced a double slash with the PATH used here).
json_file_path = os.path.join(PATH, "neighborhood_boundaries.json")

# Save the dictionary as a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(b_n_b, json_file)

print("JSON file saved successfully.")

In [None]:
# Wrap the project folder in a pathlib.Path object
path_variable = Path(PATH)

# Pickle it to 'path_variable.pkl'; the file name is relative, so it is written to the
# current working directory rather than to PATH
with open('path_variable.pkl', 'wb') as f:
    pickle.dump(path_variable, f)
    print('PATH variable saved successfully')