In [15]:
from pathlib import Path

import pandas as pd

# Load the cleaned dataset.
# The location is built from the current user's home directory rather than a
# hard-coded /Users/<name>/... prefix, so the notebook is not tied to one account.
file_path = Path.home() / 'Documents' / 'liverpool_crime_analysis' / 'csv_files' / 'clean_data.csv'
df = pd.read_csv(file_path)

# Display the first few rows in a table format.
# display() is needed here: a bare expression is only rendered when it is the
# last statement of a cell, so a plain `df.head()` at this position shows nothing.
display(df.head())

# Show dataframe info (columns, non-null counts, dtypes, memory usage)
df.info()

# Summary statistics for all columns (including object types).
# This is the last expression of the cell, so it is rendered automatically.
df.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150535 entries, 0 to 150534
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   crime_id               150535 non-null  object 
 1   month                  150535 non-null  object 
 2   longitude              150535 non-null  float64
 3   latitude               150535 non-null  float64
 4   location               150535 non-null  object 
 5   lsoa_code              150535 non-null  object 
 6   lsoa_name              150535 non-null  object 
 7   crime_type             150535 non-null  object 
 8   last_outcome_category  150535 non-null  object 
dtypes: float64(2), object(7)
memory usage: 10.3+ MB


Unnamed: 0,crime_id,month,longitude,latitude,location,lsoa_code,lsoa_name,crime_type,last_outcome_category
count,150535,150535,150535.0,150535.0,150535,150535,150535,150535,150535
unique,134284,12,,,12163,952,952,14,15
top,ASB_FILL,2024-07,,,On or near Parking Area,E01033760,Liverpool 060C,Violence and sexual offences,Unable to prosecute suspect
freq,16252,13560,,,5590,5348,5348,58336,55163
mean,,,-2.933296,53.434051,,,,,
std,,,0.104199,0.063046,,,,,
min,,,-3.193282,53.291663,,,,,
25%,,,-2.993765,53.400302,,,,,
50%,,,-2.96239,53.421043,,,,,
75%,,,-2.884973,53.453746,,,,,


In [16]:
# Convert the 'month' column from strings to datetimes for time-based analysis.
# Values such as '2024-06' are parsed with the explicit '%Y-%m' format, which
# yields a timestamp on the first day of each month.
month_strings = df['month']
df['month'] = pd.to_datetime(month_strings, format='%Y-%m')

# Quick check to confirm the conversion worked: preview the values, then the dtype
for conversion_check in (df['month'].head(), df['month'].dtype):
    print(conversion_check)

0   2024-06-01
1   2024-06-01
2   2024-06-01
3   2024-06-01
4   2024-06-01
Name: month, dtype: datetime64[ns]
datetime64[ns]


In [17]:
# Missing values: number of nulls in each column
missing_values = df.isna().sum()
display(missing_values)

# Fully duplicated rows: rows identical to an earlier row in every column
duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# Rows whose crime_id holds the 'ASB_FILL' placeholder value
is_asb_fill = df['crime_id'] == 'ASB_FILL'
asb_fill_count = df[is_asb_fill].shape[0]
print(f"Number of ASB_FILL rows: {asb_fill_count}")

crime_id                 0
month                    0
longitude                0
latitude                 0
location                 0
lsoa_code                0
lsoa_name                0
crime_type               0
last_outcome_category    0
dtype: int64

Number of duplicate rows: 4275
Number of ASB_FILL rows: 16252


In [18]:
# Count the rows that exactly repeat an earlier row across every column
repeated_row_mask = df.duplicated()
duplicate_count = repeated_row_mask.sum()
print(f"Number of fully duplicated rows: {duplicate_count}")

Number of fully duplicated rows: 4275


In [19]:
# Select every member of each fully duplicated group; keep=False flags the
# first occurrence as well as its repeats
in_duplicate_group = df.duplicated(keep=False)
duplicates_all = df.loc[in_duplicate_group].sort_values(by=list(df.columns))

# Sorting on all columns places identical rows next to each other in the display
display(duplicates_all)

Unnamed: 0,crime_id,month,longitude,latitude,location,lsoa_code,lsoa_name,crime_type,last_outcome_category
12298,ASB_FILL,2024-06-01,-3.184808,53.369888,On or near Victoria Drive,E01007198,Wirral 026C,Anti-social behaviour,ASB_FILL
12300,ASB_FILL,2024-06-01,-3.184808,53.369888,On or near Victoria Drive,E01007198,Wirral 026C,Anti-social behaviour,ASB_FILL
12301,ASB_FILL,2024-06-01,-3.184808,53.369888,On or near Victoria Drive,E01007198,Wirral 026C,Anti-social behaviour,ASB_FILL
12302,ASB_FILL,2024-06-01,-3.184808,53.369888,On or near Victoria Drive,E01007198,Wirral 026C,Anti-social behaviour,ASB_FILL
12308,ASB_FILL,2024-06-01,-3.184808,53.369888,On or near Victoria Drive,E01007198,Wirral 026C,Anti-social behaviour,ASB_FILL
...,...,...,...,...,...,...,...,...,...
146773,ASB_FILL,2025-05-01,-2.713345,53.468208,On or near Parking Area,E01006852,St. Helens 004D,Anti-social behaviour,ASB_FILL
146774,ASB_FILL,2025-05-01,-2.713345,53.468208,On or near Parking Area,E01006852,St. Helens 004D,Anti-social behaviour,ASB_FILL
146775,ASB_FILL,2025-05-01,-2.713345,53.468208,On or near Parking Area,E01006852,St. Helens 004D,Anti-social behaviour,ASB_FILL
147105,ASB_FILL,2025-05-01,-2.706024,53.453681,On or near Delves Close,E01006871,St. Helens 011D,Anti-social behaviour,ASB_FILL


In [20]:
# Summarise the fully duplicated rows: group identical rows and count them.
# The subset passed to duplicated() is every column, so this is a full-row check.
# NOTE: duplicated() uses its default keep='first' here, which leaves out the first
# occurrence of each group, so 'count' is the number of repeats beyond the first row.
all_columns = list(df.columns)
repeated_rows = df[df.duplicated(subset=all_columns)]
dupe_summary = repeated_rows.groupby(all_columns).size().reset_index(name='count')

# Show the ten most frequently repeated rows
display(dupe_summary.sort_values(by='count', ascending=False).head(10))

Unnamed: 0,crime_id,month,longitude,latitude,location,lsoa_code,lsoa_name,crime_type,last_outcome_category,count
721,ASB_FILL,2024-09-01,-2.922261,53.439209,On or near Broad View,E01006600,Liverpool 011A,Anti-social behaviour,ASB_FILL,27
730,ASB_FILL,2024-09-01,-2.910052,53.432849,On or near West Derby Village,E01006617,Liverpool 021A,Anti-social behaviour,ASB_FILL,16
1000,ASB_FILL,2024-10-01,-2.910052,53.432849,On or near West Derby Village,E01006617,Liverpool 021A,Anti-social behaviour,ASB_FILL,13
1961,ASB_FILL,2025-03-01,-2.862469,53.422271,On or near Supermarket,E01033228,Knowsley 008F,Anti-social behaviour,ASB_FILL,13
1055,ASB_FILL,2024-10-01,-2.854324,53.390528,On or near Parking Area,E01006771,Liverpool 040D,Anti-social behaviour,ASB_FILL,12
1717,ASB_FILL,2025-02-01,-2.979298,53.410269,On or near Mill Lane,E01006648,Liverpool 023D,Anti-social behaviour,ASB_FILL,12
1622,ASB_FILL,2025-01-01,-2.838648,53.345092,On or near Supermarket,E01006755,Liverpool 059C,Anti-social behaviour,ASB_FILL,12
275,ASB_FILL,2024-07-01,-2.982817,53.423247,On or near Hopwood Street,E01006779,Liverpool 022D,Anti-social behaviour,ASB_FILL,11
2105,ASB_FILL,2025-04-01,-2.947548,53.39573,On or near Longfellow Street,E01006556,Liverpool 039B,Anti-social behaviour,ASB_FILL,11
1572,ASB_FILL,2025-01-01,-2.949768,53.481151,On or near Shopping Area,E01007041,Sefton 027A,Anti-social behaviour,ASB_FILL,11


In [21]:
# The fully duplicated rows inspected so far all carry the ASB_FILL placeholder crime_id.
# Hypothesis: the data holds one row per person involved, so rows for the same incident
# normally differ only in crime_id; ASB rows share the placeholder and so look identical.
# Test: look for non-ASB rows that match on every column except crime_id.

# Compare on every column except 'crime_id'
cols_to_check = df.columns.drop('crime_id')

# Flag each row that matches at least one other row on those columns
# (keep=False marks every member of a matching group, not only the repeats)
duplicates_ignore_crime_id = df[df.duplicated(subset=cols_to_check, keep=False)]

# Filter out ASB_FILL rows to focus on non-ASB crimes
non_asb_duplicates = duplicates_ignore_crime_id[duplicates_ignore_crime_id['crime_id'] != 'ASB_FILL']

print(f"Number of non-ASB duplicated rows (ignoring crime_id): {non_asb_duplicates.shape[0]}")

# Display a sample to inspect.
# pandas is already imported at the top of the notebook, so it is not re-imported here.
# display() renders the full table; print() wraps the frame as text and cuts columns off.
# The max_rows option is a notebook-wide setting and stays in effect for later cells.
pd.set_option('display.max_rows', 20)
display(non_asb_duplicates.head(10))

Number of non-ASB duplicated rows (ignoring crime_id): 41981
                                             crime_id      month  longitude  \
12  8dc426612ab21aaafa2fe54e49104eb6aa8cbdcbfb6c51... 2024-06-01  -2.871827   
13  1b6b9ab85589f482e1ea04d26256ad719bbf8e9d674546... 2024-06-01  -2.874344   
14  b2be404dc0df0c155a3b13ccabe64dcccebf80893414dd... 2024-06-01  -2.872892   
16  1d2b6dde4d2840c54c884858981ae52a2fcca0ff063c9b... 2024-06-01  -2.872892   
17  0cb3fbaed2b14b7db382c06bbee9ecd1efafae364241c1... 2024-06-01  -2.874344   
18  a3053f98f8ef7e69cd7810b2c42803031b5d9a54ce54ff... 2024-06-01  -2.871827   
43  ec7bebb2266eb5339e23291641ea55c8f86833b4c7bf09... 2024-06-01  -2.873049   
48  d40d06127da4ea7b4ca0184280aa0666f3979569c47e22... 2024-06-01  -2.873049   
53  baae0d64d144e9cb3d9090bf5eb4dfbb4ca6e4acbb8c19... 2024-06-01  -2.885932   
55  f47998bff1349efe86eba84af48313a8e160814b7ea31c... 2024-06-01  -2.885932   

     latitude                      location  lsoa_code      lsoa_name

In [22]:
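# Conclusion: non-ASB crimes also repeat on every column except crime_id (41,981 rows above),
# which is consistent with one row being recorded per person involved. The duplicate rows are
# therefore treated as genuine records and are not removed.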

In [24]:
# Produce a cleaning report to conduct final checks before beginning EDA section

def cleaning_report(df):
    """Print a verification report on the cleanliness of ``df``.

    The report covers, in order: missing values per column, the number of
    fully duplicated rows, leading/trailing whitespace in object columns and,
    when a 'month' column exists, how many of its values lie in the future.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        All results are written to standard output.
    """
    print("=== Data Cleaning Verification Report ===\n")

    # Check for missing values
    missing = df.isnull().sum()
    print("Missing values per column:")
    print(missing[missing > 0] if missing.any() else "None found")
    print()

    # Check for duplicate rows
    dup_count = df.duplicated().sum()
    print(f"Duplicate rows: {dup_count}")

    # Check for trailing/leading whitespace in object columns
    str_cols = df.select_dtypes(include='object').columns
    whitespace_issues = {}
    for col in str_cols:
        # Object columns may hold NaN or non-string values; test real strings only so
        # the .str accessor cannot fail or yield NaN results.
        is_text = df[col].map(lambda value: isinstance(value, str)).astype(bool)
        text = df[col][is_text]
        padded = text.str.contains(r'^\s+|\s+$', regex=True)
        if padded.any():
            # Take the examples from the same mask used for detection, so values padded
            # with tabs or newlines (not only spaces) are actually shown.
            whitespace_issues[col] = text[padded].unique()
    if whitespace_issues:
        print("\nColumns with leading/trailing whitespace detected:")
        for col, examples in whitespace_issues.items():
            print(f" - {col}: {examples[:3]}")  # show up to 3 examples
    else:
        print("\nNo leading/trailing whitespace detected in string columns.")
    print()

    # Check for invalid dates (e.g., future dates) if 'month' column exists
    if 'month' in df.columns:
        months = df['month']
        # A 'month' column that is still text cannot be compared with a Timestamp;
        # parse it first, turning unparseable values into NaT (never counted as future).
        if pd.api.types.is_object_dtype(months) or pd.api.types.is_string_dtype(months):
            months = pd.to_datetime(months, errors='coerce')
        today = pd.Timestamp.today()
        future_count = int((months > today).sum())
        print(f"Future dates in 'month' column: {future_count}")
    print("\n=== End of Report ===")

# Run the verification report on the working dataframe.
# The report is printed to the cell output; the function returns nothing.
cleaning_report(df)

=== Data Cleaning Verification Report ===

Missing values per column:
None found

Duplicate rows: 4275

No leading/trailing whitespace detected in string columns.

Future dates in 'month' column: 0

=== End of Report ===
