### 1) Inspect one DataFrame (cleaned or raw)





In [1]:
import pandas as pd

# Source file to inspect: the cleaned export, or any raw monthly CSV
path = "london_crime_clean.csv"   # or r"C:\...\2025-08\2025-08-metropolitan-street.csv"

# Read only the first 10,000 rows so the inspection stays light on memory
df = pd.read_csv(path, nrows=10_000)

# Dimensions and column labels
print("Shape (rows, cols):", df.shape)
print("\nColumn names:", df.columns.tolist())

# Per-column dtypes; heading and listing are emitted by a single print call
print("\nDtypes:", df.dtypes, sep="\n")

# deep=True asks pandas to inspect object columns instead of estimating them
print("\nMemory (MB):", df.memory_usage(deep=True).sum()/1e6)

print("\n.info():")
df.info()  # prints its own report; nothing to display from the return value

print("\nFirst 5 rows:")
display(df.head(5))


Shape (rows, cols): (10000, 17)

Column names: ['crime_id', 'month', 'reported_by', 'falls_within', 'longitude', 'latitude', 'location', 'lsoa_code', 'lsoa_name', 'crime_type', 'last_outcome_category', 'context', 'month_dt', 'crime_category', 'severity_weight', 'season', 'temporal_weight']

Dtypes:
crime_id                  object
month                     object
reported_by               object
falls_within              object
longitude                float64
latitude                 float64
location                  object
lsoa_code                 object
lsoa_name                 object
crime_type                object
last_outcome_category     object
context                  float64
month_dt                  object
crime_category            object
severity_weight          float64
season                    object
temporal_weight          float64
dtype: object

Memory (MB): 8.611929

.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (tot

Unnamed: 0,crime_id,month,reported_by,falls_within,longitude,latitude,location,lsoa_code,lsoa_name,crime_type,last_outcome_category,context,month_dt,crime_category,severity_weight,season,temporal_weight
0,6d7f30fffe77c6d35ec8d6f3b2d1ca8ae1930953054acb...,2025-08,Metropolitan Police Service,Metropolitan Police Service,-0.508514,50.812629,On or near Wakehurst Place,E01031464,Arun 007F,Violence and sexual offences,Under investigation,,2025-08-01,Violence_Sexual,3.0,Summer,0.737703
1,a0ffce437b5fd8b6fbc7a61ba29754fee8d95b2c0f1001...,2025-08,Metropolitan Police Service,Metropolitan Police Service,0.893999,51.147132,On or near Foxglove Green,E01024003,Ashford 006D,Violence and sexual offences,Under investigation,,2025-08-01,Violence_Sexual,3.0,Summer,0.737703
2,,2025-08,Metropolitan Police Service,Metropolitan Police Service,0.136513,51.588214,On or near Kingston Close,E01000027,Barking and Dagenham 001A,Anti-social behaviour,Outcome Pending,,2025-08-01,Antisocial_Behavior,1.0,Summer,0.737703
3,,2025-08,Metropolitan Police Service,Metropolitan Police Service,0.142112,51.589389,On or near A1112,E01000027,Barking and Dagenham 001A,Anti-social behaviour,Outcome Pending,,2025-08-01,Antisocial_Behavior,1.0,Summer,0.737703
4,,2025-08,Metropolitan Police Service,Metropolitan Police Service,0.140194,51.582356,On or near Hatch Grove,E01000027,Barking and Dagenham 001A,Anti-social behaviour,Outcome Pending,,2025-08-01,Antisocial_Behavior,1.0,Summer,0.737703


### 2) Quick data-quality snapshot (dtype, % missing, examples)


In [2]:
import numpy as np

def schema_summary(df, n_example=1):
    """Build a one-row-per-column data-quality summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    n_example : int, default 1
        Number of leading non-null values to sample from each column.
        With the default, ``example`` holds a single value. With a value
        greater than 1, ``example`` holds a list of up to ``n_example``
        values.

    Returns
    -------
    pandas.DataFrame
        Columns ``column``, ``dtype``, ``nunique``, ``%missing`` and
        ``example``, sorted by ``%missing`` in descending order. The original
        positional index of each column is kept as the row index. A frame
        with no columns yields an empty result with the same five columns.
    """
    fields = ["column", "dtype", "nunique", "%missing", "example"]
    want_many = n_example is not None and n_example > 1

    out = []
    for c in df.columns:
        s = df[c]
        examples = s.dropna().iloc[:n_example].tolist()
        if want_many:
            # Honour the requested sample size instead of silently keeping one value
            example = examples or None
        else:
            example = examples[0] if examples else None
        out.append({
            "column": c,
            "dtype": str(s.dtype),
            "nunique": int(s.nunique(dropna=True)),
            "%missing": round(s.isna().mean() * 100, 2),
            "example": example,
        })

    # Naming the columns explicitly keeps "%missing" present even when `out`
    # is empty, so the sort below cannot fail with a KeyError.
    return pd.DataFrame(out, columns=fields).sort_values("%missing", ascending=False)

# Profile the sample frame; as the cell's last expression the result is rendered as a table
schema_summary(df)


Unnamed: 0,column,dtype,nunique,%missing,example
11,context,float64,0,100.0,
0,crime_id,object,8491,15.04,6d7f30fffe77c6d35ec8d6f3b2d1ca8ae1930953054acb...
9,crime_type,object,14,0.0,Violence and sexual offences
15,season,object,1,0.0,Summer
14,severity_weight,float64,11,0.0,3.0
13,crime_category,object,14,0.0,Violence_Sexual
12,month_dt,object,1,0.0,2025-08-01
10,last_outcome_category,object,12,0.0,Under investigation
8,lsoa_name,object,753,0.0,Arun 007F
1,month,object,1,0.0,2025-08


### 3) Peek categorical values (e.g., crime types)

In [3]:
# Show the 20 most frequent values of each category column that is present.
# 'crime_type' is the raw label; 'crime_category' may exist after cleaning.
for col in ("crime_type", "crime_category"):
    if col in df.columns:
        display(df[col].value_counts().head(20))


crime_type
Violence and sexual offences    2947
Anti-social behaviour           1504
Vehicle crime                    896
Shoplifting                      851
Other theft                      694
Drugs                            677
Criminal damage and arson        620
Public order                     534
Burglary                         446
Theft from the person            326
Robbery                          254
Other crime                      127
Bicycle theft                     66
Possession of weapons             58
Name: count, dtype: int64

crime_category
Violence_Sexual        2947
Antisocial_Behavior    1504
Vehicle_Crime           896
Theft_Retail            851
Theft_Other             694
Drug_Offenses           677
Criminal_Damage         620
Public_Order            534
Burglary                446
Theft_Personal          326
Robbery                 254
Other                   127
Theft_Bicycle            66
Weapons                  58
Name: count, dtype: int64

### 4) Check missing values per column (fast)

In [4]:
# Percentage of missing values per column, largest first
(
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
    .round(2)
    .rename("% missing")
)


context                  100.00
crime_id                  15.04
crime_type                 0.00
season                     0.00
severity_weight            0.00
crime_category             0.00
month_dt                   0.00
last_outcome_category      0.00
lsoa_name                  0.00
month                      0.00
lsoa_code                  0.00
location                   0.00
latitude                   0.00
longitude                  0.00
falls_within               0.00
reported_by                0.00
temporal_weight            0.00
Name: % missing, dtype: float64

### 5) Inspect multiple raw CSVs in your folder (columns + shapes)

In [5]:
import glob, os

# Folder of raw CSVs to survey.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_DIR = r"C:\Users\hksai\OneDrive\Documents\Group_Porject_Doc\Data_Police_UK_ALL(2025)\2025-08"
files = sorted(glob.glob(os.path.join(BASE_DIR, "*.csv")))
print("Found CSVs:", len(files))

# Preview the first five files from a 5,000-row sample of each
for f in files[:5]:
    df0 = pd.read_csv(f, nrows=5000)
    print("\nFile:", os.path.basename(f))
    print("  shape(sample):", df0.shape)
    print("  columns:", list(df0.columns))

# Union of column names across the first ten files.
# nrows=0 parses only the header row, which is all that is needed for names.
all_cols = set()
for f in files[:10]:
    all_cols |= set(pd.read_csv(f, nrows=0).columns)

# Sort for display: a set's iteration order is arbitrary, so printing it
# directly can differ between kernel sessions.
print("\nUnion of columns across sampled files:", sorted(all_cols))


Found CSVs: 41

File: 2025-08-avon-and-somerset-street.csv
  shape(sample): (5000, 12)
  columns: ['Crime ID', 'Month', 'Reported by', 'Falls within', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type', 'Last outcome category', 'Context']

File: 2025-08-bedfordshire-street.csv
  shape(sample): (5000, 12)
  columns: ['Crime ID', 'Month', 'Reported by', 'Falls within', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type', 'Last outcome category', 'Context']

File: 2025-08-cambridgeshire-street.csv
  shape(sample): (5000, 12)
  columns: ['Crime ID', 'Month', 'Reported by', 'Falls within', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type', 'Last outcome category', 'Context']

File: 2025-08-cheshire-street.csv
  shape(sample): (5000, 12)
  columns: ['Crime ID', 'Month', 'Reported by', 'Falls within', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'LSOA name', 'Crime type', 'Last outcome category', 'Context']

File:

### 6) Numeric summary (quick stats)

In [6]:
# Summary statistics for the numeric columns, transposed to one row per column
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,10000.0,-0.105113,0.2166376,-3.237972,-0.249853,-0.195865,0.110713,0.893999
latitude,10000.0,51.545623,0.08313141,50.717819,51.531224,51.551073,51.58182,53.820109
context,0.0,,,,,,,
severity_weight,10000.0,1.86602,0.8687755,0.8,1.0,1.5,3.0,3.0
temporal_weight,10000.0,0.737703,4.441114e-14,0.737703,0.737703,0.737703,0.737703,0.737703


### 7) Save the schema to a CSV (handy for docs)

In [7]:
# Build the per-column summary, then write it out without the index column
schema_table = schema_summary(df)
schema_table.to_csv("schema_summary.csv", index=False)
print("Saved -> schema_summary.csv")


Saved -> schema_summary.csv


##### Together, these commands show shapes, column names, data types, missingness, example values, and category distributions — everything needed to understand the structure quickly.