## Exploratory Data Analysis

### Import libraries and load data

In [None]:
# import libraries
import pandas as pd
import numpy as np
# The plotting API conventionally aliased as `plt` lives in matplotlib.pyplot;
# aliasing the top-level matplotlib package would make calls such as
# plt.figure() or plt.subplots() fail.
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the cleaned dataset from disk into `df`, then preview its first five rows
DATA_PATH = '../dataset/cleaned_data.csv'
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,Date,Measure,Borough,Crime Section,Crime group,Outcome,Positive Outcome,Outcome Count,Refresh Date,Year,Month,Weekday
0,2023-06-01,Business Crime Outcomes,Barking and Dagenham,POSSESSION OF WEAPONS,POSSESSION OF WEAPONS,Investigation Complete; No Suspect Identified....,N,1,2025-06-05,2023,6,Thursday
1,2023-06-01,Business Crime Outcomes,Barking and Dagenham,THEFT,OTHER THEFT,Investigation Complete; No Suspect Identified....,N,2,2025-06-05,2023,6,Thursday
2,2023-06-01,Business Crime Outcomes,Barking and Dagenham,THEFT,SHOPLIFTING,Named Suspect Identified: Victim Supports Poli...,N,2,2025-06-05,2023,6,Thursday
3,2023-06-01,Business Crime Outcomes,Barking and Dagenham,VIOLENCE AGAINST THE PERSON,VIOLENCE WITH INJURY,Named Suspect Identified: Evidential Difficult...,N,2,2025-06-05,2023,6,Thursday
4,2023-06-01,Business Crime Outcomes,Barnet,ARSON AND CRIMINAL DAMAGE,CRIMINAL DAMAGE,Named Suspect Identified: Victim Supports Poli...,N,1,2025-06-05,2023,6,Thursday


### Check data structure

In [None]:
# Inspect the dtype pandas assigned to each column of df
df.dtypes

Date                object
Measure             object
Borough             object
Crime Section       object
Crime group         object
Outcome             object
Positive Outcome    object
Outcome Count        int64
Refresh Date        object
Year                 int64
Month                int64
Weekday             object
dtype: object

I will convert the `Date` and `Refresh Date` columns to the datetime dtype.

In [10]:
# Parse the two date columns into datetime64 values, then re-display the dtypes
# to confirm the conversion took effect
for date_col in ('Date', 'Refresh Date'):
    df[date_col] = pd.to_datetime(df[date_col])
df.dtypes

Date                datetime64[ns]
Measure                     object
Borough                     object
Crime Section               object
Crime group                 object
Outcome                     object
Positive Outcome            object
Outcome Count                int64
Refresh Date        datetime64[ns]
Year                         int64
Month                        int64
Weekday                     object
dtype: object

In [None]:
# Summary statistics restricted to object and int64 columns
# (so the datetime columns are left out of the table)
summary_dtypes = ['object', 'int64']
df.describe(include=summary_dtypes)

Unnamed: 0,Measure,Borough,Crime Section,Crime group,Outcome,Positive Outcome,Outcome Count,Year,Month,Weekday
count,395120,395120,395120,395120,395120,395120,395120.0,395120.0,395120.0,395120
unique,1,32,13,31,31,2,,,,7
top,Business Crime Outcomes,Westminster,THEFT,VIOLENCE WITHOUT INJURY,Investigation Complete; No Suspect Identified....,N,,,,Wednesday
freq,395120,22041,112790,54621,186169,346447,,,,62311
mean,,,,,,,2.150878,2023.936465,6.673484,
std,,,,,,,3.372016,0.696979,3.495496,
min,,,,,,,1.0,2023.0,1.0,
25%,,,,,,,1.0,2023.0,4.0,
50%,,,,,,,1.0,2024.0,7.0,
75%,,,,,,,2.0,2024.0,10.0,


In [None]:
# For every object-dtype column, print each category's relative frequency
# as a percentage (two decimal places), most frequent first
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    print(f"\n Normalised value counts for: '{col}'")
    shares = df[col].value_counts(normalize=True).mul(100)
    for category, percentage in shares.items():
        print(f"  {category}: {percentage:.2f}%")
    


 Normalised value counts for: 'Measure'
  Business Crime Outcomes: 100.00%

 Normalised value counts for: 'Borough'
  Westminster: 5.58%
  Camden: 4.15%
  Tower Hamlets: 4.04%
  Newham: 3.95%
  Lambeth: 3.92%
  Southwark: 3.91%
  Hackney: 3.72%
  Brent: 3.63%
  Islington: 3.61%
  Hillingdon: 3.49%
  Croydon: 3.46%
  Ealing: 3.41%
  Barnet: 3.28%
  Wandsworth: 3.27%
  Greenwich: 3.26%
  Lewisham: 3.23%
  Haringey: 3.15%
  Hounslow: 3.14%
  Enfield: 3.04%
  Hammersmith and Fulham: 2.98%
  Redbridge: 2.90%
  Bromley: 2.87%
  Kensington and Chelsea: 2.69%
  Waltham Forest: 2.69%
  Havering: 2.63%
  Barking and Dagenham: 2.52%
  Harrow: 2.07%
  Kingston upon Thames: 2.02%
  Merton: 1.99%
  Bexley: 1.98%
  Sutton: 1.72%
  Richmond upon Thames: 1.68%

 Normalised value counts for: 'Crime Section'
  THEFT: 28.55%
  VIOLENCE AGAINST THE PERSON: 22.75%
  PUBLIC ORDER OFFENCES: 10.17%
  VEHICLE OFFENCES: 7.94%
  ARSON AND CRIMINAL DAMAGE: 7.64%
  BURGLARY: 7.27%
  ROBBERY: 5.91%
  DRUG OFFENCES:

# check 