In [1]:
import pandas as pd
import numpy as np 
import os
import sqlite3
import re

## Generating CSVs

### Function definitions

In [2]:
def path_files(path_directory):
    """
    Recursively collects the paths of all files under a directory whose
    file name contains ``metropolitan-`` or ``city-``.

    Only the file name itself is tested, never the directory part of the
    path, so a folder called e.g. ``city-archive`` does not cause every file
    inside it to be selected. Directory entries are visited in sorted order
    so the returned list is identical on every run and platform.

    :param path_directory: path to the directory to extract files from
    :return: list of file paths (each prefixed with ``path_directory``)
    """
    files = []

    # os.listdir gives entries in arbitrary order; sort for reproducibility.
    for sub in sorted(os.listdir(path_directory)):
        sub_path = os.path.join(path_directory, sub)
        if os.path.isdir(sub_path):
            # Descend into sub-directories regardless of their name.
            files.extend(path_files(sub_path))
        elif re.search('(metropolitan|city)-', sub):
            # Match on the bare file name, not on the full path.
            files.append(sub_path)

    return files

In [3]:
def get_csv(path, area, datatype):
    """
    Reads the csv file at the given path if its file name matches the given
    area/datatype, i.e. contains the literal text ``<area>-<datatype>``.

    Only the file name is inspected (not the directories leading to it), and
    ``area``/``datatype`` are treated as plain text rather than as regular
    expressions, so directory names and special characters cannot cause
    false matches.

    :param path: path to the csv file
    :param area: area to consider
    :param datatype: datatype to consider (outcome, street or stop-and-search)
    :return: DataFrame with the file contents, or None if the file name does
             not match
    """
    filename = os.path.basename(path)
    if f'{area}-{datatype}' in filename:
        return pd.read_csv(path)
    # Explicit "no match" result; callers filter these out before concatenating.
    return None

### CSV finder

In [4]:
# Root folder holding the police CSV exports (searched recursively).
POLICE_DATA_DIR = './police_data_all/'
files = path_files(POLICE_DATA_DIR)

In [5]:
def _load_area_frames(paths, area, datatype):
    """
    Loads every csv in ``paths`` that matches ``area``/``datatype`` and stacks
    them into a single DataFrame with a fresh 0..n-1 index.

    :param paths: candidate file paths (non-matching ones are skipped)
    :param area: area to consider
    :param datatype: datatype to consider
    :return: concatenated DataFrame
    :raises ValueError: if no path matches the requested area/datatype
    """
    # get_csv returns None for paths that do not match; drop those explicitly
    # instead of relying on pd.concat to discard them silently.
    frames = [
        frame
        for frame in (get_csv(path, area, datatype) for path in paths)
        if frame is not None
    ]
    if not frames:
        raise ValueError(
            f"No '{area}-{datatype}' csv files found among {len(paths)} candidate paths"
        )
    return pd.concat(frames, sort=False, ignore_index=True)


df_metro_street = _load_area_frames(files, 'metropolitan', 'street')
df_metro_search = _load_area_frames(files, 'metropolitan', 'stop-and-search')
df_metro_outcomes = _load_area_frames(files, 'metropolitan', 'outcomes')

df_city_street = _load_area_frames(files, 'city-of-london', 'street')
df_city_search = _load_area_frames(files, 'city-of-london', 'stop-and-search')
df_city_outcomes = _load_area_frames(files, 'city-of-london', 'outcomes')

## Database creation

In [6]:
# Location of the SQLite database file (created on connect if it is missing).
LONDON_DB_PATH = './police_london.db'
conn_london = sqlite3.connect(LONDON_DB_PATH)

In [7]:
# Map each destination table to the DataFrame that populates it.
tables_to_write = {
    'METRO_STREET': df_metro_street,
    'METRO_SEARCH': df_metro_search,
    'METRO_OUTCOMES': df_metro_outcomes,
    'CITY_STREET': df_city_street,
    'CITY_SEARCH': df_city_search,
    'CITY_OUTCOMES': df_city_outcomes,
}

# "replace" (rather than "append") keeps this cell idempotent: every frame
# already holds the complete data set, so appending on a re-run of the
# notebook would duplicate every row in the database.
for table_name, frame in tables_to_write.items():
    frame.to_sql(table_name, conn_london, if_exists="replace", index=False)

  sql.to_sql(


In [8]:
# Close the SQLite connection held in conn_london.
conn_london.close()