<a href="https://colab.research.google.com/github/Mr-Dipayan-Dey/CrimeDataAnalysis/blob/main/p1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
# --- Setup --- import os from pathlib import Path import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

In [69]:
# --- Setup ---

import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [70]:
from pathlib import Path

# Dataset location, kept relative to the notebook's working directory so the
# notebook is not tied to one machine's absolute Windows path.
# NOTE(review): the load cell reads this same file name through a string
# literal rather than through INPUT_PATH - consider routing it through here.
INPUT_PATH = "crime_data_cleaned.csv"

# Folder that receives every exported table (.csv) and figure (.png).
OUT_DIR = Path('./outputs')
# parents=True keeps this working if OUT_DIR is later pointed at a nested path.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [71]:
# --- Utility functions ---
def save_df(df, name):
    """Export ``df`` to ``OUT_DIR/<name>.csv`` without the index and print the path."""
    target = OUT_DIR.joinpath("{}.csv".format(name))
    df.to_csv(target, index=False)
    print("Saved table: {}".format(target))

In [72]:
def save_fig(fig, name, tight=True, close=False):
    """Save ``fig`` as ``<name>.png`` under ``OUT_DIR`` at 150 dpi and print the path.

    Args:
        fig: Figure to export (callers pass a matplotlib figure).
        name: Output file name without the ``.png`` extension.
        tight: When true, call ``fig.tight_layout()`` before saving.
        close: When true, close the figure after saving. Defaults to False
            because every plotting cell calls ``plt.show()`` right after
            ``save_fig``; closing the figure first leaves ``plt.show()`` with
            no open figure, so the chart reaches disk but is never rendered
            in the notebook. Pass ``close=True`` when saving figures that
            should not be displayed (e.g. many figures in a loop).
    """
    path = OUT_DIR / f"{name}.png"
    if tight:
        fig.tight_layout()
    fig.savefig(path, dpi=150)
    if close:
        plt.close(fig)
    print(f"Saved figure: {path}")

In [73]:

# Read the cleaned crime CSV from the working directory, report its
# dimensions, and preview the first rows.
print('Loading dataset...')
df = pd.read_csv("crime_data_cleaned.csv")
n_rows, n_cols = df.shape
print('Rows, cols:', (n_rows, n_cols))
df.head()

Loading dataset...
Rows, cols: (40160, 13)


Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,City,Crime Code,Crime Description,Victim Age,Victim Gender,Weapon Used,Crime Domain,Police Deployed,Case Closed
0,1,02-01-2020 00:00,01-01-2020 00:00,01-01-2020 01:11,Ahmedabad,576,IDENTITY THEFT,16,M,Blunt Object,Violent Crime,13,No
1,2,01-01-2020 19:00,01-01-2020 01:00,01-01-2020 06:26,Chennai,128,HOMICIDE,37,M,Poison,Other Crime,9,No
2,3,02-01-2020 05:00,01-01-2020 02:00,01-01-2020 14:30,Ludhiana,271,KIDNAPPING,48,F,Blunt Object,Other Crime,15,No
3,4,01-01-2020 05:00,01-01-2020 03:00,01-01-2020 14:46,Pune,170,BURGLARY,49,F,Firearm,Other Crime,1,Yes
4,5,01-01-2020 21:00,01-01-2020 04:00,01-01-2020 16:51,Pune,421,VANDALISM,30,F,Other,Other Crime,18,Yes


In [74]:
# --- Data Cleaning / Preprocessing ---
# Remove leading/trailing whitespace from every column label
df.columns = df.columns.map(str.strip)

In [75]:
# Convert whichever of the known date columns are present to datetimes.
# Values are read day-first; anything unparseable is coerced rather than raising.
date_columns = [
    name
    for name in ('Date Reported', 'Date of Occurrence', 'Date Case Closed')
    if name in df.columns
]
for name in date_columns:
    df[name] = pd.to_datetime(df[name], dayfirst=True, errors='coerce')

In [76]:
# Coerce 'Police Deployed' to integers; non-numeric or missing entries become 0
if 'Police Deployed' in df.columns:
    deployed_numeric = pd.to_numeric(df['Police Deployed'], errors='coerce')
    df['Police Deployed'] = deployed_numeric.fillna(0).astype(int)

In [77]:
# Trim surrounding whitespace in the text columns that are present.
# Missing values are kept as NaN: casting the whole column with astype(str)
# turns them into the literal string 'nan', which value_counts()/groupby()
# would then report as if it were a real category.
for col in ['Crime Domain', 'Weapon Used', 'Crime Description', 'Victim Gender', 'City']:
    if col in df.columns:
        missing = df[col].isna()
        # where(~missing) restores NaN on the rows that were missing before the cast
        df[col] = df[col].astype(str).str.strip().where(~missing)

In [78]:
# Case closure flag: 1 when 'Case Closed' reads "yes" (case-insensitive),
# 0 for "no" and for any other or missing value.
# The text is trimmed first because 'Case Closed' is not among the columns
# stripped earlier; without it a padded value such as "Yes " fails the lookup
# and is silently counted as an open case.
if 'Case Closed' in df.columns:
    closed_text = df['Case Closed'].str.strip().str.lower()
    df['Case_Closed_Flag'] = closed_text.map({'yes': 1, 'no': 0}).fillna(0).astype(int)


In [79]:
# Elapsed time between report and closure, expressed in fractional days.
# Only computed when both date columns exist.
if {'Date Reported', 'Date Case Closed'}.issubset(df.columns):
    elapsed = df['Date Case Closed'] - df['Date Reported']
    df['Time_to_Close_days'] = elapsed.dt.total_seconds() / (24*3600)

In [80]:
## 1. Crime Trends by City


In [81]:
# Number of records per city, exported as a two-column table
if 'City' in df.columns:
    city_counts = (
        df['City']
        .value_counts()
        .rename_axis('City')
        .reset_index(name='Crime_Count')
    )
    save_df(city_counts, 'city_crime_counts')


Saved table: outputs/city_crime_counts.csv


In [82]:
# Bar chart of the first 15 rows of city_counts (the most frequent cities)
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=city_counts.head(15), x='Crime_Count', y='City', ax=ax)
ax.set_title('Top 15 Cities by Crime Count')
save_fig(fig, 'top15_cities_crime_count')
plt.show()

Saved figure: outputs/top15_cities_crime_count.png


In [83]:
 # Distribution of crime domains across top 10 cities
if 'City' in df.columns and 'Crime Domain' in df.columns:
    # Ten most frequent cities, then a City x Crime Domain count table for them
    top_cities = df['City'].value_counts().head(10).index
    in_top_cities = df['City'].isin(top_cities)
    domain_by_city = (
        df[in_top_cities]
        .groupby(['City','Crime Domain'])
        .size()
        .unstack(fill_value=0)
    )

In [84]:
# Stacked bars: one bar per row of domain_by_city, segmented by crime domain
fig, ax = plt.subplots(figsize=(12,6))
domain_by_city.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Crime Domain Distribution across Top 10 Cities')
ax.set_ylabel('Count')
save_fig(fig, 'domain_distribution_top10_cities')
plt.show()

Saved figure: outputs/domain_distribution_top10_cities.png


In [85]:
## 2. Crime Type Analysis


In [86]:
# Frequency of each crime description, exported as a two-column table
if 'Crime Description' in df.columns:
    top_descriptions = (
        df['Crime Description']
        .value_counts()
        .rename_axis('Crime Description')
        .reset_index(name='Count')
    )
    save_df(top_descriptions, 'top_crime_descriptions')

Saved table: outputs/top_crime_descriptions.csv


In [87]:
# Bar chart of the first 15 rows of top_descriptions
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top_descriptions.head(15), x='Count', y='Crime Description', ax=ax)
ax.set_title('Top 15 Crime Descriptions')
save_fig(fig, 'top15_crime_descriptions')
plt.show()

Saved figure: outputs/top15_crime_descriptions.png


In [88]:
# Number of records per crime domain, exported as a two-column table
if 'Crime Domain' in df.columns:
    domain_counts = (
        df['Crime Domain']
        .value_counts()
        .rename_axis('Crime Domain')
        .reset_index(name='Count')
    )
    save_df(domain_counts, 'crime_domain_counts')


Saved table: outputs/crime_domain_counts.csv


In [90]:
# Bar chart of every crime domain and its record count
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(data=domain_counts, x='Count', y='Crime Domain', ax=ax)
ax.set_title('Crime Domain Counts')
save_fig(fig, 'crime_domain_counts')
plt.show()

Saved figure: outputs/crime_domain_counts.png


In [91]:
# Number of records per weapon, exported as a two-column table
if 'Weapon Used' in df.columns:
    weapon_counts = (
        df['Weapon Used']
        .value_counts()
        .rename_axis('Weapon Used')
        .reset_index(name='Count')
    )
    save_df(weapon_counts, 'weapon_counts')

Saved table: outputs/weapon_counts.csv


In [92]:
# Weapon frequency table (recomputed and re-saved).
# NOTE(review): this cell repeats the preceding weapon-count cell verbatim;
# it rebuilds the same table and overwrites the same CSV.
if 'Weapon Used' in df.columns:
    weapon_tally = df['Weapon Used'].value_counts()
    weapon_counts = weapon_tally.rename_axis('Weapon Used').reset_index(name='Count')
    save_df(weapon_counts, 'weapon_counts')

Saved table: outputs/weapon_counts.csv


In [94]:
# Bar chart of the first 15 rows of weapon_counts
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=weapon_counts.head(15), x='Count', y='Weapon Used', ax=ax)
ax.set_title('Top 15 Weapons Used')
save_fig(fig, 'top15_weapons')
plt.show()

Saved figure: outputs/top15_weapons.png


In [95]:
## 3. Victim Demographics

In [96]:
# Summary statistics of victim age: printed, then exported as a table
if 'Victim Age' in df.columns:
    age_stats = df['Victim Age'].describe()
    print(age_stats)
    age_stats_table = age_stats.reset_index()
    save_df(age_stats_table, 'victim_age_stats')


count    40160.00000
mean        44.49126
std         20.22555
min         10.00000
25%         27.00000
50%         44.00000
75%         62.00000
max         79.00000
Name: Victim Age, dtype: float64
Saved table: outputs/victim_age_stats.csv


In [99]:
# Histogram (30 bins) of victim age with a KDE overlay
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(df['Victim Age'], kde=True, bins=30, ax=ax)
ax.set_title('Victim Age Distribution')
ax.set_xlabel('Age')
save_fig(fig, 'victim_age_distribution')
plt.show()

Saved figure: outputs/victim_age_distribution.png


In [100]:
# Victim counts per gender: export the table, then chart it
if 'Victim Gender' in df.columns:
    gender_counts = (
        df['Victim Gender']
        .value_counts()
        .rename_axis('Victim Gender')
        .reset_index(name='Count')
    )
    save_df(gender_counts, 'victim_gender_counts')

    fig, ax = plt.subplots(figsize=(6,4))
    sns.barplot(data=gender_counts, x='Victim Gender', y='Count', ax=ax)
    ax.set_title('Victim Gender Distribution')
    save_fig(fig, 'victim_gender_distribution')
    plt.show()


Saved table: outputs/victim_gender_counts.csv
Saved figure: outputs/victim_gender_distribution.png


In [101]:
# Victim gender split within the ten most frequent crime descriptions
if 'Victim Gender' in df.columns and 'Crime Description' in df.columns:
    top10_desc = df['Crime Description'].value_counts().head(10).index
    in_top10 = df['Crime Description'].isin(top10_desc)
    gender_by_desc = (
        df[in_top10]
        .groupby(['Crime Description','Victim Gender'])
        .size()
        .unstack(fill_value=0)
    )

    # One stacked bar per crime description, segmented by gender
    fig, ax = plt.subplots(figsize=(12,6))
    gender_by_desc.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('Gender Distribution across Top 10 Crime Descriptions')
    save_fig(fig, 'gender_by_top10_crimes')
    plt.show()


Saved figure: outputs/gender_by_top10_crimes.png


In [102]:
## 4. Police Effectiveness


In [104]:
# Mean officers per case overall, plus a per-city mean / median / case count table
if 'Police Deployed' in df.columns:
    avg_police_overall = df['Police Deployed'].mean()
    print('Average police deployed per case:', avg_police_overall)

    deployed_by_city = df.groupby('City')['Police Deployed']
    police_by_city = deployed_by_city.agg(['mean','median','count']).reset_index()
    save_df(police_by_city, 'police_deployed_by_city')


Average police deployed per case: 10.00625
Saved table: outputs/police_deployed_by_city.csv


In [103]:
# Correlation between officers deployed and the closure flag,
# followed by a box plot of deployment for each closure status
if 'Police Deployed' in df.columns and 'Case_Closed_Flag' in df.columns:
    corr_matrix = df[['Police Deployed','Case_Closed_Flag']].corr()
    corr = corr_matrix.loc['Police Deployed','Case_Closed_Flag']
    print('Correlation Police vs Case Closure:', corr)

    fig, ax = plt.subplots(figsize=(8,5))
    sns.boxplot(data=df, x='Case_Closed_Flag', y='Police Deployed', ax=ax)
    ax.set_xlabel('Case Closed (0=No, 1=Yes)')
    ax.set_title('Police Deployed by Case Closure')
    save_fig(fig, 'police_by_case_closed')
    plt.show()


Correlation Police vs Case Closure: -0.0025355196560186977
Saved figure: outputs/police_by_case_closed.png


In [105]:
## 5. Case Closure


In [106]:
# Count and percentage of closed vs open cases, exported and shown as a pie chart
if 'Case_Closed_Flag' in df.columns:
    status_tally = df['Case_Closed_Flag'].value_counts().rename(index={1:'Closed',0:'Open'})
    closure_counts = status_tally.rename_axis('Status').reset_index(name='Count')
    total_cases = closure_counts['Count'].sum()
    closure_counts['Pct'] = closure_counts['Count'] / total_cases * 100
    print(closure_counts)
    save_df(closure_counts, 'case_closure_counts')

    fig, ax = plt.subplots(figsize=(6,6))
    ax.pie(closure_counts['Count'], labels=closure_counts['Status'], autopct='%1.1f%%')
    ax.set_title('Case Closure: Closed vs Open')
    save_fig(fig, 'case_closure_pie')
    plt.show()


   Status  Count        Pct
0    Open  20098  50.044821
1  Closed  20062  49.955179
Saved table: outputs/case_closure_counts.csv
Saved figure: outputs/case_closure_pie.png


In [107]:
# Summary statistics and histogram of days-to-close, restricted to closed cases
if 'Time_to_Close_days' in df.columns and 'Case_Closed_Flag' in df.columns:
    closed_durations = df.loc[df['Case_Closed_Flag']==1, 'Time_to_Close_days']
    time_stats = closed_durations.describe()
    print(time_stats)
    save_df(time_stats.reset_index(), 'time_to_close_stats')

    fig, ax = plt.subplots(figsize=(10,5))
    sns.histplot(closed_durations, bins=50, kde=True, ax=ax)
    ax.set_title('Time to Close Distribution (days)')
    save_fig(fig, 'time_to_close_distribution')
    plt.show()


In [108]:
# Police deployed vs time to close, closed cases only.
# 'Case_Closed_Flag' is part of the guard because the plot filters on it; the
# flag only exists when the dataset has a 'Case Closed' column, and checking
# just the other two columns would raise a KeyError on the filter otherwise.
required_columns = ('Police Deployed', 'Time_to_Close_days', 'Case_Closed_Flag')
if all(name in df.columns for name in required_columns):
    closed_cases = df[df['Case_Closed_Flag']==1]
    fig, ax = plt.subplots(figsize=(8,6))
    sns.scatterplot(data=closed_cases, x='Police Deployed', y='Time_to_Close_days', alpha=0.4, ax=ax)
    ax.set_title('Police Deployed vs Time to Close (Closed Cases)')
    save_fig(fig, 'police_vs_time_to_close')
    plt.show()
