In [None]:
import os
import pandas as pd
import sqlite3
from tqdm import tqdm

# --- Settings for the street-level crime import ---
# Directory tree that is searched recursively for CSV files.
root_folder = 'C:/Users/20232075/Desktop/London Police Data'
# SQLite database file (a relative path, so it resolves against the working directory).
db_path = 'crime_data.db'
# Number of rows accumulated before each executemany() + commit.
batch_size = 2_000

# CSV headers copied into the `crime` table.  A header that is absent from a
# given file is added as an all-None column before the insert.
required_columns = [
    'Crime ID',
    'Month',
    'Reported by',
    'Falls within',
    'Longitude',
    'Latitude',
    'Location',
    'LSOA code',
    'Crime type',
    'Last outcome category',
    'Context',
]

# Open (creating it if necessary) the SQLite database this cell loads into.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Use write-ahead logging for the journal.
cursor.execute("PRAGMA journal_mode=WAL;")

# One row per street-level crime record.  crimeID is the primary key, so the
# INSERT OR IGNORE below drops any row whose crimeID is already stored.
# NOTE(review): SQLite allows NULL in a non-INTEGER PRIMARY KEY, so rows with
# no Crime ID are presumably never treated as duplicates -- confirm that
# re-running this cell does not insert them a second time.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS crime (
        crimeID TEXT PRIMARY KEY,
        Month TEXT,
        Reporter TEXT,
        Jurisdiction TEXT,
        Longitude REAL,
        Latitude REAL,
        Location TEXT,
        LSOA_code TEXT,
        Type TEXT,
        Outcome TEXT,
        Context TEXT
    )
''')
conn.commit()

# The placeholder order must match the column order of the tuples handed to
# executemany().
insert_query = '''
    INSERT OR IGNORE INTO crime (
        crimeID, Month, Reporter, Jurisdiction, Longitude, Latitude,
        Location, LSOA_code, Type, Outcome, Context
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
'''

# Every CSV below root_folder.  The extension test is case-insensitive so that
# it agrees with the case-insensitive '-street' suffix test applied later
# (previously a '*.CSV' file was silently skipped).  The list is sorted because
# os.walk() order is platform-dependent and, with INSERT OR IGNORE, the first
# file containing a given crimeID is the one that is kept.
all_files = sorted(
    os.path.join(subdir, file)
    for subdir, _, files in os.walk(root_folder)
    for file in files
    if file.lower().endswith('.csv')
)

# CSV header -> table column, in the same order as the placeholders in
# insert_query.  Built once instead of on every file.
column_map = {
    'Crime ID': 'crimeID',
    'Month': 'Month',
    'Reported by': 'Reporter',
    'Falls within': 'Jurisdiction',
    'Longitude': 'Longitude',
    'Latitude': 'Latitude',
    'Location': 'Location',
    'LSOA code': 'LSOA_code',
    'Crime type': 'Type',
    'Last outcome category': 'Outcome',
    'Context': 'Context',
}
table_columns = list(column_map.values())

batch = []          # rows waiting to be written; may span several files
file_count = 0      # number of '-street' files processed
inserted_rows = 0   # rows actually added to the table (ignored duplicates excluded)

# try/finally so the connection is released even when a file fails to parse
# or an insert raises.
try:
    for file_path in tqdm(all_files, desc="Processing files", unit="file"):
        name_without_ext = os.path.splitext(file_path)[0]
        if not name_without_ext.lower().endswith('-street'):
            continue

        df = pd.read_csv(file_path)

        # Add any expected header that this file lacks as an all-None column.
        for col in required_columns:
            if col not in df.columns:
                df[col] = None

        df = df.rename(columns=column_map)[table_columns]
        records = list(df.itertuples(index=False, name=None))

        for record in tqdm(records, desc=f"Inserting {os.path.basename(file_path)}", leave=False):
            batch.append(record)
            if len(batch) >= batch_size:
                cursor.executemany(insert_query, batch)
                conn.commit()
                # rowcount sums the rows really inserted; len(batch) would also
                # count the duplicates that INSERT OR IGNORE skipped.
                inserted_rows += cursor.rowcount
                batch = []

        file_count += 1

    # Write whatever is left over after the last full batch.
    if batch:
        cursor.executemany(insert_query, batch)
        conn.commit()
        inserted_rows += cursor.rowcount
finally:
    conn.close()

print(f"\nInserted {inserted_rows} rows from {file_count} files.")


In [None]:
import os
import pandas as pd
import sqlite3
from tqdm import tqdm

# --- Settings for the stop-and-search import ---
# Directory tree that is searched recursively for CSV files.
root_folder = 'C:/Users/20232075/Desktop/London Police Data'
# SQLite database file (a relative path, so it resolves against the working directory).
db_path = 'crime_data.db'
# Number of rows accumulated before each executemany() + commit.
batch_size = 2_000

# (CSV header, `operations` table column) pairs, in insert order.
_operations_columns = [
    ('Type', 'Type'),
    ('Date', 'Date'),
    ('Part of a policing operation', 'Part_Policing_Operation'),
    ('Policing operation', 'Policing_Operation'),
    ('Latitude', 'Latitude'),
    ('Longitude', 'Longitude'),
    ('Gender', 'Gender'),
    ('Age range', 'Age_Range'),
    ('Self-defined ethnicity', 'Self_defined_Ethnicity'),
    ('Officer-defined ethnicity', 'Officer_defined_Ethnicity'),
    ('Legislation', 'Legislation'),
    ('Object of search', 'Reason'),
    ('Outcome', 'Outcome'),
    ('Removal of more than just outer clothing', 'Clothing_Removal'),
]

# Parallel lists: required_columns[i] is stored in sql_columns[i].
required_columns = [csv_header for csv_header, _ in _operations_columns]
sql_columns = [table_column for _, table_column in _operations_columns]

# Open (creating it if necessary) the SQLite database this cell loads into.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Use write-ahead logging for the journal.
cursor.execute("PRAGMA journal_mode=WAL;")

# One row per stop-and-search record.  The CSVs supply no identifier, so the
# table generates its own integer key.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS operations (
        ID INTEGER PRIMARY KEY AUTOINCREMENT,
        Type TEXT,
        Date TEXT,
        Part_Policing_Operation TEXT,
        Policing_Operation TEXT,
        Latitude REAL,
        Longitude REAL,
        Gender TEXT,
        Age_Range TEXT,
        Self_defined_Ethnicity TEXT,
        Officer_defined_Ethnicity TEXT,
        Legislation TEXT,
        Reason TEXT,
        Outcome TEXT,
        Clothing_Removal TEXT
    )
''')
conn.commit()

# One '?' placeholder per entry in sql_columns.
# NOTE(review): this is a plain INSERT into a table with a generated key, so
# running the cell a second time appears to append every row again -- confirm
# whether the table should be emptied first.
insert_query = f'''
    INSERT INTO operations (
        {', '.join(sql_columns)}
    ) VALUES ({', '.join(['?' for _ in sql_columns])});
'''

# Every CSV below root_folder.  The extension test is case-insensitive so that
# it agrees with the case-insensitive '-stop-and-search' suffix test applied
# later (previously a '*.CSV' file was silently skipped).  The list is sorted
# because os.walk() order is platform-dependent; a stable order makes the
# generated IDs reproducible between runs.
all_files = sorted(
    os.path.join(subdir, file)
    for subdir, _, files in os.walk(root_folder)
    for file in files
    if file.lower().endswith('.csv')
)

# CSV header -> table column.  required_columns and sql_columns are parallel
# lists, so pairing them positionally reproduces the mapping exactly.
column_map = dict(zip(required_columns, sql_columns))

batch = []          # rows waiting to be written; may span several files
file_count = 0      # number of '-stop-and-search' files processed
inserted_rows = 0   # rows written to the table

# try/finally so the connection is released even when a file fails to parse
# or an insert raises.
try:
    for file_path in tqdm(all_files, desc="Processing files", unit="file"):
        name_without_ext = os.path.splitext(file_path)[0]
        if not name_without_ext.lower().endswith('-stop-and-search'):
            continue

        df = pd.read_csv(file_path)

        # Skip a file that has none of the expected headers.  This has to run
        # BEFORE the missing columns are filled in: the original performed the
        # check afterwards, when every header was guaranteed to be present, so
        # it could never trigger and such files were loaded as all-NULL rows.
        if not any(col in df.columns for col in required_columns):
            continue

        # Add any expected header that this file lacks as an all-None column.
        for col in required_columns:
            if col not in df.columns:
                df[col] = None

        df = df.rename(columns=column_map)[sql_columns]
        records = list(df.itertuples(index=False, name=None))

        for record in tqdm(records, desc=f"Inserting {os.path.basename(file_path)}", leave=False):
            batch.append(record)
            if len(batch) >= batch_size:
                cursor.executemany(insert_query, batch)
                conn.commit()
                inserted_rows += len(batch)
                batch = []

        file_count += 1

    # Write whatever is left over after the last full batch.
    if batch:
        cursor.executemany(insert_query, batch)
        conn.commit()
        inserted_rows += len(batch)
finally:
    conn.close()

print(f"\nInserted {inserted_rows} rows into 'operations' from {file_count} files.")


In [None]:
import os
import pandas as pd
import sqlite3
from tqdm import tqdm

# --- Settings for the crime-outcomes import ---
# Directory tree that is searched recursively for CSV files.
root_folder = 'C:/Users/20232075/Desktop/London Police Data'
# SQLite database file (a relative path, so it resolves against the working directory).
db_path = 'crime_data.db'
# Number of rows accumulated before each executemany() + commit.
batch_size = 2_000

# (CSV header, `crime_outcomes` table column) pairs, in insert order.
_outcome_columns = [
    ('Crime ID', 'crimeID'),
    ('Month', 'Month'),
    ('Reported by', 'Reporter'),
    ('Falls within', 'Jurisdiction'),
    ('Longitude', 'Longitude'),
    ('Latitude', 'Latitude'),
    ('Location', 'Location'),
    ('LSOA code', 'LSOA_code'),
    ('LSOA name', 'LSOA_name'),
    ('Outcome type', 'Outcome_Type'),
]

# Parallel lists: required_columns[i] is stored in sql_columns[i].
required_columns = [csv_header for csv_header, _ in _outcome_columns]
sql_columns = [table_column for _, table_column in _outcome_columns]


# Open (creating it if necessary) the SQLite database this cell loads into.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Use write-ahead logging for the journal.
cursor.execute("PRAGMA journal_mode=WAL;")

# One row per crimeID.  crimeID is the primary key, so the INSERT OR IGNORE
# below keeps only the first outcome row seen for each crime.
# NOTE(review): if a crime can have several outcome rows across files, all but
# the first are discarded by this schema -- confirm that is intended.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS crime_outcomes (
        crimeID TEXT PRIMARY KEY,
        Month TEXT,
        Reporter TEXT,
        Jurisdiction TEXT,
        Longitude REAL,
        Latitude REAL,
        Location TEXT,
        LSOA_code TEXT,
        LSOA_name TEXT,
        Outcome_Type TEXT
    )
''')
conn.commit()

# One '?' placeholder per entry in sql_columns.
insert_query = f'''
    INSERT OR IGNORE INTO crime_outcomes (
        {', '.join(sql_columns)}
    ) VALUES ({', '.join(['?' for _ in sql_columns])});
'''

# Collect all .csv files below root_folder.  The extension test is
# case-insensitive so that it agrees with the case-insensitive '-outcomes'
# suffix test applied later (previously a '*.CSV' file was silently skipped).
# The list is sorted because os.walk() order is platform-dependent and, with
# INSERT OR IGNORE, the first file containing a given crimeID is the one kept.
all_files = sorted(
    os.path.join(subdir, file)
    for subdir, _, files in os.walk(root_folder)
    for file in files
    if file.lower().endswith('.csv')
)

# CSV header -> table column.  required_columns and sql_columns are parallel
# lists, so pairing them positionally reproduces the mapping exactly.
column_map = dict(zip(required_columns, sql_columns))

batch = []          # rows waiting to be written; may span several files
file_count = 0      # number of '-outcomes' files processed
inserted_rows = 0   # rows actually added to the table (ignored duplicates excluded)

# Process files.  try/finally so the connection is released even when a file
# fails to parse or an insert raises.
try:
    for file_path in tqdm(all_files, desc="Processing files", unit="file"):
        name_without_ext = os.path.splitext(file_path)[0]
        if not name_without_ext.lower().endswith('-outcomes'):
            continue

        df = pd.read_csv(file_path)

        # Add any expected header that this file lacks as an all-None column.
        for col in required_columns:
            if col not in df.columns:
                df[col] = None

        df = df.rename(columns=column_map)[sql_columns]
        records = list(df.itertuples(index=False, name=None))

        for record in tqdm(records, desc=f"Inserting {os.path.basename(file_path)}", leave=False):
            batch.append(record)
            if len(batch) >= batch_size:
                cursor.executemany(insert_query, batch)
                conn.commit()
                # rowcount sums the rows really inserted; len(batch) would also
                # count the duplicates that INSERT OR IGNORE skipped.
                inserted_rows += cursor.rowcount
                batch = []

        file_count += 1

    # Write whatever is left over after the last full batch.
    if batch:
        cursor.executemany(insert_query, batch)
        conn.commit()
        inserted_rows += cursor.rowcount
finally:
    conn.close()

print(f"\n✅ Inserted {inserted_rows} rows into 'crime_outcomes' from {file_count} files.")
