In [1]:
import pandas as pd, geopandas as gpd
import matplotlib.pyplot as pyplot, seaborn as sns
import numpy as np
from pathlib import Path

In [None]:
project_root = Path().resolve().parent
burg_file = project_root / 'data' / 'burglary' / 'burglary_2021_2025.csv'
pp_file = project_root / 'data' / 'externals' / 'clean_pp.parquet'

In [5]:
raw = pd.read_csv(burg_file)
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223514 entries, 0 to 223513
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Crime ID               223514 non-null  object 
 1   Month                  223514 non-null  object 
 2   Reported by            223514 non-null  object 
 3   Longitude              219154 non-null  float64
 4   Latitude               219154 non-null  float64
 5   Location               223514 non-null  object 
 6   LSOA code              219154 non-null  object 
 7   LSOA name              219154 non-null  object 
 8   Crime type             223514 non-null  object 
 9   Last outcome category  223514 non-null  object 
dtypes: float64(2), object(8)
memory usage: 17.1+ MB


In [6]:
print(raw.columns.tolist())

['Crime ID', 'Month', 'Reported by', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type', 'Last outcome category']


In [7]:
print(f"{len(raw.columns)=}")

len(raw.columns)=10


In [8]:
raw.sample(5)

Unnamed: 0,Crime ID,Month,Reported by,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category
134897,c86bda9faec135c45f4cc85338c09335f1d245c984eed4...,2023-07,Metropolitan Police Service,-0.240749,51.560341,On or near Orchard Close,E01000502,Brent 011D,Burglary,Investigation complete; no suspect identified
111146,02c19e1f8ac705656a5b29a57d3a93d6897710032711f6...,2023-01,Metropolitan Police Service,-0.030599,51.598898,On or near Shaw Square,E01004403,Waltham Forest 009E,Burglary,Investigation complete; no suspect identified
71245,4e13de156d0c23e7c142ccd9d3f5fae01b5ee6e61e879d...,2022-05,Metropolitan Police Service,-0.15058,51.603737,On or near George Crescent,E01000164,Barnet 022D,Burglary,Investigation complete; no suspect identified
123214,d25b88b0f10cd8c0fb532927dff30c0a64cdef0ef5232e...,2023-04,Metropolitan Police Service,-0.394881,51.519002,On or near Blandford Waye,E01002546,Hillingdon 023D,Burglary,Investigation complete; no suspect identified
216504,0408b534f54ed318c5bb83b26d8f4a347c5e1925ecacdc...,2025-01,Metropolitan Police Service,-0.357286,51.519947,On or near The Grove,E01001218,Ealing 012A,Burglary,Investigation complete; no suspect identified


In [None]:
match_mask = raw['Reported by'] == raw['Falls within']
total = len(raw)
n_match = match_mask.sum()
n_diff  = total - n_match

print(f"✅ Matching rows: {n_match} / {total} ({n_match/total*100:.2f}%)")
print(f"⚠️ Differing rows: {n_diff} / {total} ({n_diff/total*100:.2f}%)")

In [None]:
n_null = raw['Context'].isna().sum()
n_present = raw['Context'].notna().sum()

print(f"🛑 Null in Context: {n_null}")
print(f"✅ Non-null in Context: {n_present}")

In [None]:
cleaned = raw.drop(columns=['Falls within', 'Context'])
print("Columns now:", cleaned.columns.tolist())

In [9]:
total = len(raw)
n_missing = raw['Crime ID'].isna().sum()
n_unique = raw['Crime ID'].nunique(dropna = True)
n_dupes = total - n_missing - (raw['Crime ID'].value_counts() == 1).sum()

print(f"Total rows: {total}")
print(f"Missing Crime IDs: {n_missing}")
print(f"Unique Crime IDs: {n_unique} ({n_unique/total*100:.2f}% of rows)")
print(f"Duplicate Crime IDs: {n_dupes} ({n_dupes/total*100:.2f}% of rows)")

Total rows: 223514
Missing Crime IDs: 0
Unique Crime IDs: 222006 (99.33% of rows)
Duplicate Crime IDs: 3016 (1.35% of rows)


In [10]:
dupe_mask = raw['Crime ID'].duplicated(keep = False)
dupes = raw.loc[dupe_mask].sort_values('Crime ID')
print(f"Total duplicate‐ID rows: {len(dupes)}")
print(f"Distinct Crime IDs that repeat: {dupes['Crime ID'].nunique()}")

Total duplicate‐ID rows: 3016
Distinct Crime IDs that repeat: 1508


In [11]:
# pick the cols you care about
check_cols = ['Month','Last outcome category','LSOA code','Longitude','Latitude']
dupes = raw.loc[dupe_mask]

# count how many IDs actually differ on each field
summary = {
  c: dupes.groupby('Crime ID')[c].nunique().gt(1).sum()
  for c in check_cols
}
print("IDs with >1 unique value per field:", summary)

IDs with >1 unique value per field: {'Month': 0, 'Last outcome category': 1508, 'LSOA code': 29, 'Longitude': 0, 'Latitude': 0}


In [12]:
raw_sorted = raw.sort_values(
    ['Crime ID', 'Month', 'Last outcome category'],
    ascending=[True, True, True]
)

In [13]:
# 1. Re-compute the mask of any ID with >1 row
dupe_mask = raw['Crime ID'].duplicated(keep=False)
dupes = raw.loc[dupe_mask].sort_values('Crime ID')

# 2. Find the subset of IDs where the outcome really changed
outcome_changed = (
    dupes
    .groupby('Crime ID')['Last outcome category']
    .nunique()
    .gt(1)
)
changed_ids = outcome_changed[outcome_changed].index.tolist()

# 3. Likewise for the handful of LSOA-code changes
lsoa_changed = (
    dupes
    .groupby('Crime ID')['LSOA code']
    .nunique()
    .gt(1)
)
changed_lsoas = lsoa_changed[lsoa_changed].index.tolist()

print(f"🔍 IDs with >1 outcome: {len(changed_ids)}")
print(f"🔍 IDs with >1 LSOA code: {len(changed_lsoas)}\n")

🔍 IDs with >1 outcome: 1508
🔍 IDs with >1 LSOA code: 29



In [14]:
# Only consider IDs with at least two rows
dupe_mask = raw['Crime ID'].duplicated(keep=False)
dupes = raw.loc[dupe_mask, ['Crime ID','Longitude','Latitude']]

# Count how many distinct (lon,lat) per ID
loc_counts = (
    dupes
    .dropna(subset=['Longitude','Latitude'])
    .drop_duplicates()
    .groupby('Crime ID')
    .size()
)

# Filter to those with more than one unique coordinate
ids_with_loc_shifts = loc_counts[loc_counts > 1].index
print(f"IDs with geo‐shifts: {len(ids_with_loc_shifts)}")


IDs with geo‐shifts: 0


In [15]:
from geopy.distance import geodesic
from tqdm import tqdm

# 1️⃣ Helper as before
def max_shift(group):
    coords = list(zip(group['Latitude'], group['Longitude']))
    max_d = 0
    for i in range(len(coords)):
        for j in range(i+1, len(coords)):
            d = geodesic(coords[i], coords[j]).meters
            if d > max_d:
                max_d = d
    return max_d

# 2️⃣ Compute shifts into a list of tuples
rows = []
grouped = raw.groupby('Crime ID')
for cid, grp in tqdm(grouped, total=grouped.ngroups, desc="Calculating shifts"):
    if len(grp[['Latitude','Longitude']].drop_duplicates()) > 1:
        rows.append((cid, max_shift(grp)))

# 3️⃣ Build the DataFrame
shifts_df = pd.DataFrame(rows, columns=['Crime ID','max_shift_m'])

# 4️⃣ Sort descending
shifts_df = shifts_df.sort_values('max_shift_m', ascending=False)

print("Top 5 biggest geocoding shifts (meters):")
print(shifts_df.head())

Calculating shifts: 100%|██████████| 222006/222006 [03:27<00:00, 1068.85it/s]

Top 5 biggest geocoding shifts (meters):
Empty DataFrame
Columns: [Crime ID, max_shift_m]
Index: []





In [16]:
raw = raw.sort_values(
    ["Crime ID", "Month", "Last outcome category"]
)
clean = raw.drop_duplicates("Crime ID", keep = "last")

In [17]:
bad_lsoas = set(changed_lsoas)
clean = clean[~clean['LSOA code'].isin(bad_lsoas)]

In [18]:
assert clean['Crime ID'].is_unique

In [20]:
total = len(clean)
print(total)

222006


In [21]:
clean.to_csv(burg_file.with_suffix('.clean.csv'), index = False)
print(f"Wrote {len(clean)} rows to {burg_file.with_suffix('.clean.csv')}")

Wrote 222006 rows to /Users/youjungum/dc2/data/burglary/burglary_2021_2025.clean.csv
