# Crime Data - Exploratory Data Analysis

This notebook provides an exploratory analysis of the Los Angeles crime dataset to understand patterns, relationships, and insights that will inform our machine learning approach.

## Analysis Sections:
1. **Data Loading & Overview**
2. **Temporal Analysis** - Crime patterns over time
3. **Spatial Analysis** - Geographic crime distribution
4. **Crime Type Analysis** - Understanding different crime categories
5. **Victim Demographics** - Age, gender, descent patterns
6. **Location & Context** - Premises and environmental factors
7. **Correlation Analysis** - Feature relationships
8. **Missing Data Patterns** - Data quality assessment
9. **Key Insights & Recommendations**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from scipy import stats
from scipy.stats import chi2_contingency


# Plot appearance: select the matplotlib style first, then apply the seaborn
# palette and the figure-size / font-size overrides used by every chart below.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10
# IPython magic (not plain Python): draw figures inline in the notebook output.
%matplotlib inline

# pandas display options: no cap on the number of columns shown, no fixed
# display width, and cell text limited to 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)


In [None]:
print("Loading Crime Dataset...")


# Raw crime records; later cells read from `df` and add derived columns to it.
df = pd.read_csv('./data/crime_data.csv')


# Basic dataset information
print("\nDataset Overview:")
print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# 'DATE OCC' is still raw text here, so min()/max() on the column would compare
# strings lexicographically rather than chronologically. Parse a temporary copy
# for the overview only; `df` itself is converted in the temporal-analysis cell.
# errors='coerce' keeps the overview from failing on an unparseable entry.
occurrence_dates = pd.to_datetime(df['DATE OCC'], errors='coerce')
print(f"   Date Range: {occurrence_dates.min()} to {occurrence_dates.max()}")

# Display first few rows
print("\nFirst 3 rows:")
df.head(3)


### 2.1 Comprehensive Data Information Analysis

This analysis provides detailed information about each column in the dataset, including:
- Data types for each feature
- Missing value counts and percentages
- Number of unique values per feature
- Overall data quality assessment


In [None]:
# Comprehensive data information
print("COMPREHENSIVE DATA INFORMATION")
print("=" * 50)

total_rows = len(df)

# Per-column profile: dtype, null count / share, number of distinct non-null values
print("\nColumn Information:")
print("-" * 30)
for position, column in enumerate(df.columns, start=1):
    values = df[column]
    n_null = values.isnull().sum()
    profile_line = (
        f"{position:2d}. {column:<20} | {str(values.dtype):<10} | "
        f"{n_null:>6} nulls ({(n_null / total_rows) * 100:>5.1f}%) | "
        f"{values.nunique():>6} unique"
    )
    print(profile_line)

# How many columns there are of each dtype
print("\nData Types Summary:")
print("-" * 25)
print(df.dtypes.value_counts())

# Columns that contain at least one null, worst first
print("\nMissing Values Summary:")
print("-" * 28)
missing_summary = df.isnull().sum().sort_values(ascending=False)
missing_summary = missing_summary.loc[missing_summary > 0]
if missing_summary.empty:
    print("No missing values found!")
else:
    for column, n_missing in missing_summary.items():
        print(f"{column:<20}: {n_missing:>6} ({(n_missing / total_rows) * 100:>5.1f}%)")

print("\nData Information Analysis Complete!")


## 3. Temporal Analysis - Crime Patterns Over Time

Understanding temporal patterns is crucial for crime prediction and resource allocation. This section explores:
- **Daily patterns** - Hour of day when crimes occur
- **Weekly patterns** - Day of week variations
- **Monthly patterns** - Seasonal trends
- **Yearly trends** - Long-term changes
- **Time-based correlations** - Relationships between time and crime types

### 3.1 Feature Engineering for Temporal Analysis

We create time-based features from the raw date and time columns to facilitate temporal analysis.


In [None]:
# Temporal feature engineering
print("Temporal Feature Engineering")

# Parse both date columns into pandas datetimes
for date_column in ('DATE OCC', 'Date Rptd'):
    df[date_column] = pd.to_datetime(df[date_column])

# Calendar components of the occurrence date
occurred = df['DATE OCC'].dt
df['year'] = occurred.year
df['month'] = occurred.month
df['day'] = occurred.day
df['day_of_week'] = occurred.dayofweek
df['day_name'] = occurred.day_name()
df['month_name'] = occurred.month_name()

# TIME OCC is split arithmetically: the hundreds part becomes the hour,
# the remainder becomes the minute.
occurrence_time = df['TIME OCC']
df['hour'] = occurrence_time // 100
df['minute'] = occurrence_time % 100

# Create time categories
def categorize_time(hour):
    """Map an hour of day to a coarse part-of-day label.

    Returns 'Morning' for 6 <= hour < 12, 'Afternoon' for 12 <= hour < 18,
    'Evening' for 18 <= hour < 24, and 'Night' for every other value
    (0-5 as well as anything outside the 0-23 range).
    """
    # Anything outside the daytime/evening window is night
    if not (6 <= hour < 24):
        return 'Night'
    if hour < 12:
        return 'Morning'
    return 'Afternoon' if hour < 18 else 'Evening'

# Label every record with its part of day, one hour value at a time
df['time_category'] = df['hour'].map(categorize_time)

# Create season from month
def get_season(month):
    """Return the season label for a calendar month number.

    Args:
        month: Month as an integer, 1 (January) through 12 (December).

    Returns:
        'Winter' for 12, 1, 2; 'Spring' for 3, 4, 5; 'Summer' for 6, 7, 8;
        'Autumn' for any other value (9, 10, 11 for valid months).
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    # Previously misspelled 'Automn', which surfaced in chart labels and in the
    # values of the derived 'season' column.
    return 'Autumn'

# Derive the season label from the month number
df['season'] = df['month'].map(get_season)

first_occurrence = df['DATE OCC'].min()
last_occurrence = df['DATE OCC'].max()
earliest_hour, latest_hour = df['hour'].min(), df['hour'].max()
years = df['year']

print("Temporal features created:")
print(f"    Date range: {first_occurrence.strftime('%Y-%m-%d')} to {last_occurrence.strftime('%Y-%m-%d')}")
print(f"    Time range: {earliest_hour}:00 to {latest_hour}:00")
print(f"    Years covered: {years.nunique()} years ({years.min()}-{years.max()})")
print(f"    Total days: {(last_occurrence - first_occurrence).days} days")

# Display sample of temporal features
print("\nSample of temporal features:")
temporal_cols = ['DATE OCC', 'TIME OCC', 'year', 'month', 'day', 'hour', 'day_name', 'time_category', 'season']
df[temporal_cols].head(10)


In [None]:
# Temporal Patterns Visualization
print("\nTEMPORAL PATTERNS VISUALIZATION")
print("="*50)

# Calendar orderings so weekday/month bars read chronologically, not by frequency
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Crime counts per temporal bucket; shared by the charts and the printed summary.
# fill_value=0 keeps a weekday/month with no records as a zero count (and the
# counts as integers) instead of introducing NaN into the plots and summary.
hourly_crimes = df['hour'].value_counts().sort_index()
day_crimes = df['day_name'].value_counts().reindex(day_order, fill_value=0)
month_crimes = df['month_name'].value_counts().reindex(month_order, fill_value=0)
time_crimes = df['time_category'].value_counts()
season_crimes = df['season'].value_counts()
yearly_crimes = df['year'].value_counts().sort_index()

# Create a comprehensive temporal analysis plot
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Crime Temporal Patterns Analysis', fontsize=16, y=1.02)

# Top row: bar charts by hour, weekday and month
bar_panels = [
    # (axis, counts, title, x-label, colour, rotate x tick labels?)
    (axes[0, 0], hourly_crimes, 'Crimes by Hour of Day', 'Hour', 'skyblue', False),
    (axes[0, 1], day_crimes, 'Crimes by Day of Week', 'Day of Week', 'lightcoral', True),
    (axes[0, 2], month_crimes, 'Crimes by Month', 'Month', 'lightgreen', True),
]
for ax, counts, title, xlabel, color, rotate_ticks in bar_panels:
    ax.bar(counts.index, counts.values, color=color, alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Crimes')
    if rotate_ticks:
        ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Bottom row, left and middle: share of crimes by part of day and by season
pie_panels = [
    (axes[1, 0], time_crimes, 'Crime Distribution by Time Category'),
    (axes[1, 1], season_crimes, 'Crime Distribution by Season'),
]
for ax, counts, title in pie_panels:
    ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(title)

# Bottom row, right: yearly trend
axes[1, 2].plot(yearly_crimes.index, yearly_crimes.values, marker='o', linewidth=2, markersize=8)
axes[1, 2].set_title('Crime Trend Over Years')
axes[1, 2].set_xlabel('Year')
axes[1, 2].set_ylabel('Number of Crimes')
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print key insights
print("\nKEY TEMPORAL INSIGHTS:")
print("-" * 30)
peak_hour = hourly_crimes.idxmax()
peak_day = day_crimes.idxmax()
peak_month = month_crimes.idxmax()
peak_season = season_crimes.idxmax()
peak_time_category = time_crimes.idxmax()

print(f"Top crime hour: {peak_hour}:00 ({hourly_crimes[peak_hour]:,} crimes)")
print(f"Top crime day: {peak_day} ({day_crimes[peak_day]:,} crimes)")
print(f"Top crime month: {peak_month} ({month_crimes[peak_month]:,} crimes)")
print(f"Top crime season: {peak_season} ({season_crimes[peak_season]:,} crimes)")
print(f"Top time category: {peak_time_category} ({time_crimes[peak_time_category]:,} crimes)")

# Crime intensity analysis.
# These figures divide the total record count by the number of buckets (24 hours
# of the day, 7 weekdays, 12 month names, 4 seasons) over the whole dataset, so
# they are averages per bucket, not per elapsed hour/day/month. The labels say so.
total_crimes = len(df)
print("\n Crime Intensity Analysis:")
print(f"    Average crimes per hour-of-day bucket: {total_crimes / 24:.1f}")
print(f"    Average crimes per weekday bucket: {total_crimes / 7:.1f}")
print(f"    Average crimes per calendar-month bucket: {total_crimes / 12:.1f}")
print(f"    Average crimes per season bucket: {total_crimes / 4:.1f}")


## 4. Spatial Analysis - Geographic Crime Distribution

Understanding the spatial distribution of crimes helps identify:
- **Hot spots** - Areas with high crime concentration
- **Safe zones** - Areas with low crime rates
- **Geographical patterns** - Urban vs suburban crime patterns
- **Area-specific trends** - Different crime types in different areas
- **Coordinate analysis** - Latitude/longitude insights

### 4.1 Geographic Distribution Analysis

This analysis examines crime distribution across different areas of Los Angeles and provides spatial insights.


In [None]:
# Spatial Analysis - Geographic Crime Distribution
print("SPATIAL ANALYSIS - GEOGRAPHIC PATTERNS")
print("="*50)

# Basic spatial statistics
# NOTE(review): if missing locations are stored as placeholder coordinates rather
# than NaN, they will stretch these ranges and the plots below - confirm against
# the dataset's data dictionary.
print("Geographic Coverage:")
print(f"   Latitude range: {df['LAT'].min():.4f} to {df['LAT'].max():.4f}")
print(f"   Longitude range: {df['LON'].min():.4f} to {df['LON'].max():.4f}")
print(f"   Unique coordinates: {df[['LAT', 'LON']].drop_duplicates().shape[0]:,}")

# Crimes per area, most frequent first; computed once and sliced for every use below
area_counts = df['AREA NAME'].value_counts()
total_crimes = len(df)

# Area analysis
print("\nArea Analysis:")
area_crimes = area_counts.head(10)
print("Top 10 Crime Areas:")
for i, (area, count) in enumerate(area_crimes.items(), 1):
    percentage = (count / total_crimes) * 100
    print(f"{i:2d}. {area:<25}: {count:>6,} crimes ({percentage:>5.1f}%)")

# Create spatial visualizations
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Crime Spatial Distribution Analysis', fontsize=16, y=1.02)

# 1. Area distribution (top 15)
top_15_areas = area_counts.head(15)
axes[0, 0].barh(range(len(top_15_areas)), top_15_areas.values, color='lightblue', alpha=0.8)
axes[0, 0].set_yticks(range(len(top_15_areas)))
axes[0, 0].set_yticklabels(top_15_areas.index)
axes[0, 0].set_title('Top 15 Crime Areas')
axes[0, 0].set_xlabel('Number of Crimes')
axes[0, 0].grid(True, alpha=0.3)

# 2. Scatter of crime locations on a reproducible random sample (keeps plotting fast)
sample_size = min(200000, len(df))
df_sample_viz = df.sample(n=sample_size, random_state=42)
# Rows without a coordinate cannot be drawn, and the 2D histogram in panel 4
# raises on NaN input because its bin range cannot be auto-detected, so both
# coordinate plots use only the sampled rows that have LON and LAT.
sample_coords = df_sample_viz[['LON', 'LAT']].dropna()
scatter = axes[0, 1].scatter(sample_coords['LON'], sample_coords['LAT'],
                             alpha=0.6, s=1, c='red')
axes[0, 1].set_title(f'Crime Locations Scatter Plot (Sample: {sample_size:,})')
axes[0, 1].set_xlabel('Longitude')
axes[0, 1].set_ylabel('Latitude')
axes[0, 1].grid(True, alpha=0.3)

# 3. Area vs Crime Type heatmap (top areas and crime types)
top_areas = area_counts.head(10).index
top_crimes = df['Crm Cd Desc'].value_counts().head(10).index
area_crime_matrix = pd.crosstab(df['AREA NAME'], df['Crm Cd Desc'])
area_crime_subset = area_crime_matrix.loc[top_areas, top_crimes]

sns.heatmap(area_crime_subset, annot=True, fmt='d', cmap='YlOrRd',
            ax=axes[1, 0], cbar_kws={'shrink': 0.5})
axes[1, 0].set_title('Area vs Crime Type Heatmap (Top 10 each)')
axes[1, 0].set_xlabel('Crime Type')
axes[1, 0].set_ylabel('Area')
axes[1, 0].tick_params(axis='x', rotation=90)
axes[1, 0].tick_params(axis='y', rotation=0)

# 4. Crime density by coordinates (2D histogram)
axes[1, 1].hist2d(sample_coords['LON'], sample_coords['LAT'], bins=50, cmap='hot')
axes[1, 1].set_title('Crime Density Heatmap (2D Histogram)')
axes[1, 1].set_xlabel('Longitude')
axes[1, 1].set_ylabel('Latitude')

plt.tight_layout()
plt.show()

# Spatial insights
print("\nKEY SPATIAL INSIGHTS:")
print("-" * 30)
print(f"Highest crime area: {area_crimes.index[0]} ({area_crimes.iloc[0]:,} crimes)")
print(f"Lowest crime area: {area_counts.index[-1]} ({area_counts.iloc[-1]:,} crimes)")
print(f"Crime concentration: Top 5 areas account for {(area_crimes.head(5).sum()/total_crimes*100):.1f}% of all crimes")
print(f"Geographic spread: {df['AREA NAME'].nunique()} distinct areas covered")

# Area statistics: distribution of the per-area crime counts
area_stats = area_counts.describe()
print("\nArea Crime Statistics:")
print(f"   Mean crimes per area: {area_stats['mean']:.1f}")
print(f"   Median crimes per area: {area_stats['50%']:.1f}")
print(f"   Std deviation: {area_stats['std']:.1f}")
print(f"   Range: {area_stats['min']:.0f} - {area_stats['max']:.0f} crimes")


## 5. Crime Type Analysis - Understanding Criminal Activities

This section analyzes the different types of crimes and their characteristics:
- **Crime categories** - Most common vs rare crimes
- **Crime severity** - Part 1 vs Part 2 classifications
- **Crime patterns** - Which crimes occur together
- **Crime trends** - Changes in crime types over time
- **Crime-location relationships** - Where specific crimes happen

### 5.1 Crime Type Distribution and Severity Analysis

We examine the distribution of different crime types and analyze their severity classifications.


In [None]:
# Crime Type Analysis
print("CRIME TYPE ANALYSIS - UNDERSTANDING CRIMINAL ACTIVITIES")
print("="*60)


def _truncate_label(text, max_len):
    """Cut `text` to `max_len` characters plus '...' so axis tick labels stay readable."""
    return text[:max_len] + '...' if len(text) > max_len else text


# Cases per crime description, most frequent first; computed once and sliced below
crime_counts = df['Crm Cd Desc'].value_counts()
total_cases = len(df)

# Basic crime type statistics
print("Crime Type Overview:")
print(f"   Total unique crime types: {df['Crm Cd Desc'].nunique()}")
print(f"   Total unique crime codes: {df['Crm Cd'].nunique()}")
print("   Crime severity distribution:")
severity_dist = df['Part 1-2'].value_counts().sort_index()
for part, count in severity_dist.items():
    percentage = (count / total_cases) * 100
    severity_name = "Serious crimes (Part 1)" if part == 1 else "Less serious crimes (Part 2)"
    print(f"     Part {part} - {severity_name}: {count:,} ({percentage:.1f}%)")

# Top crime types analysis
print("\nTop 15 Crime Types:")
top_crimes = crime_counts.head(15)
for i, (crime, count) in enumerate(top_crimes.items(), 1):
    percentage = (count / total_cases) * 100
    print(f"{i:2d}. {crime:<50}: {count:>6,} ({percentage:>5.1f}%)")

# Crime type visualizations
fig, axes = plt.subplots(2, 2, figsize=(20, 14))
fig.suptitle('Crime Type Distribution Analysis', fontsize=16, y=1.02)

# 1. Top 15 crime types
top_15_crimes = crime_counts.head(15)
axes[0, 0].barh(range(len(top_15_crimes)), top_15_crimes.values, color='lightcoral', alpha=0.8)
axes[0, 0].set_yticks(range(len(top_15_crimes)))
axes[0, 0].set_yticklabels([_truncate_label(crime, 30) for crime in top_15_crimes.index])
axes[0, 0].set_title('Top 15 Crime Types')
axes[0, 0].set_xlabel('Number of Crimes')
axes[0, 0].grid(True, alpha=0.3)

# 2. Crime severity distribution
# .get(..., 0) avoids a KeyError when one severity class is absent from the data
severity_labels = ['Part 1 (Serious)', 'Part 2 (Less Serious)']
severity_counts = [severity_dist.get(1, 0), severity_dist.get(2, 0)]
axes[0, 1].pie(severity_counts, labels=severity_labels, autopct='%1.1f%%', startangle=90)
axes[0, 1].set_title('Crime Severity Distribution')

# 3. Crime type distribution (log scale for better visualization)
axes[1, 0].hist(crime_counts, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[1, 0].set_title('Crime Type Frequency Distribution')
axes[1, 0].set_xlabel('Number of Crimes per Type')
axes[1, 0].set_ylabel('Number of Crime Types')
axes[1, 0].set_yscale('log')
axes[1, 0].grid(True, alpha=0.3)

# 4. Crime types by severity (only the Part 1 ranking is plotted)
part1_crimes = df[df['Part 1-2'] == 1]['Crm Cd Desc'].value_counts().head(10)
# NOTE(review): part2_crimes is not used anywhere in this cell; kept because it is a
# notebook-level variable that other cells may read - confirm before removing.
part2_crimes = df[df['Part 1-2'] == 2]['Crm Cd Desc'].value_counts().head(10)

y_pos = np.arange(len(part1_crimes))
axes[1, 1].barh(y_pos, part1_crimes.values, alpha=0.8, label='Part 1 (Serious)', color='red')
axes[1, 1].set_yticks(y_pos)
axes[1, 1].set_yticklabels([_truncate_label(crime, 25) for crime in part1_crimes.index])
axes[1, 1].set_title('Top 10 Part 1 (Serious) Crimes')
axes[1, 1].set_xlabel('Number of Crimes')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Crime type insights
print("\nKEY CRIME TYPE INSIGHTS:")
print("-" * 35)
print(f"Most common crime: {top_crimes.index[0]} ({top_crimes.iloc[0]:,} cases)")
print(f"Least common crime: {crime_counts.index[-1]} ({crime_counts.iloc[-1]:,} cases)")
print(f"Crime concentration: Top 10 crimes account for {(top_crimes.head(10).sum()/total_cases*100):.1f}% of all crimes")
print(f"Crime diversity: {df['Crm Cd Desc'].nunique()} different crime types")

# Class imbalance analysis
print("\nClass Imbalance Analysis:")
crime_counts_stats = crime_counts
print(f"   Max crimes per type: {crime_counts_stats.max():,}")
print(f"   Min crimes per type: {crime_counts_stats.min():,}")
print(f"   Mean crimes per type: {crime_counts_stats.mean():.1f}")
print(f"   Median crimes per type: {crime_counts_stats.median():.1f}")
print(f"   Imbalance ratio: {crime_counts_stats.max() / crime_counts_stats.min():.1f}:1")

# Rare vs common crimes
rare_threshold = 100  # crimes with less than 100 occurrences
rare_crimes = crime_counts_stats[crime_counts_stats < rare_threshold]
common_crimes = crime_counts_stats[crime_counts_stats >= rare_threshold]
n_crime_types = len(crime_counts_stats)

print("\nRare vs Common Crimes Analysis:")
print(f"   Rare crimes (<{rare_threshold} cases): {len(rare_crimes)} types ({len(rare_crimes)/n_crime_types*100:.1f}%)")
# The '≥' below was previously emitted as mis-encoded bytes
print(f"   Common crimes (≥{rare_threshold} cases): {len(common_crimes)} types ({len(common_crimes)/n_crime_types*100:.1f}%)")
print(f"   Rare crimes total cases: {rare_crimes.sum():,} ({rare_crimes.sum()/total_cases*100:.1f}%)")
print(f"   Common crimes total cases: {common_crimes.sum():,} ({common_crimes.sum()/total_cases*100:.1f}%)")


## 6. Victim Demographics Analysis

Understanding victim demographics provides insights into:
- **Age patterns** - Which age groups are most vulnerable
- **Gender distribution** - Male vs female victim patterns
- **Descent analysis** - Ethnic and racial patterns
- **Demographic correlations** - How demographics relate to crime types
- **Vulnerability assessment** - Identifying at-risk populations

### 6.1 Basic Victim Demographics Overview

This section provides an overview of victim demographics across the dataset.


In [None]:
# Victim Demographics Analysis
print("VICTIM DEMOGRAPHICS ANALYSIS")
print("="*50)

# How many records carry each demographic field (non-null count)
print("Demographic Data Overview:")
print(f"   Total records: {len(df):,}")
for field_label, field_column in (('Age', 'Vict Age'), ('Gender', 'Vict Sex'), ('Descent', 'Vict Descent')):
    print(f"   {field_label} data: {df[field_column].count():,} records")

# Age analysis
# NOTE(review): the "> -5" cut-off still admits ages -4..-1, which categorize_age
# treats as special values - confirm this threshold is intentional.
print("\nAge Analysis:")
valid_ages = df.loc[df['Vict Age'] > -5, 'Vict Age']
youngest, oldest = valid_ages.min(), valid_ages.max()
print(f"   Valid age range: {youngest:.0f} - {oldest:.0f} years")
print(f"   Mean age: {valid_ages.mean():.1f} years")
print(f"   Median age: {valid_ages.median():.1f} years")
print(f"   Most common age: {valid_ages.mode().iloc[0]:.0f} years")

# Create age groups (handling special values)
def categorize_age(age):
    """Bucket a victim age into a labelled age group.

    Args:
        age: Numeric age. -1 and other negative values are treated as special
            markers rather than real ages; missing values (NaN/None) are accepted.

    Returns:
        'Not Applicable' for -1, 'Invalid/Missing' for any other negative value
        or a missing value, otherwise one of 'Minor (0-17)',
        'Young Adult (18-24)', 'Adult (25-34)', 'Middle Age (35-49)',
        'Mature (50-64)' or 'Senior (65+)'.
    """
    # A missing age fails every comparison below and would otherwise fall
    # through to the final branch and be counted as 'Senior (65+)'.
    if pd.isna(age):
        return 'Invalid/Missing'
    if age == -1:
        return 'Not Applicable'
    if age < 0:
        return 'Invalid/Missing'
    if age < 18:
        return 'Minor (0-17)'
    if age < 25:
        return 'Young Adult (18-24)'
    if age < 35:
        return 'Adult (25-34)'
    if age < 50:
        return 'Middle Age (35-49)'
    if age < 65:
        return 'Mature (50-64)'
    return 'Senior (65+)'

# Attach the age-group label to every record
df['age_group'] = df['Vict Age'].map(categorize_age)

record_total = len(df)

# Gender analysis (including all categories, missing values counted as their own row)
print("\nGender Analysis:")
gender_dist = df['Vict Sex'].value_counts(dropna=False)
# NOTE(review): the label for code 'H' is taken as-is from the notebook - verify it
# against the dataset's data dictionary.
gender_mapping = {'M': 'Male', 'F': 'Female', 'H': 'Hetero', 'Unknown': 'Unknown'}
for gender_code, n_victims in gender_dist.items():
    gender_label = gender_mapping.get(gender_code, str(gender_code))
    print(f"   {gender_label}: {n_victims:,} ({(n_victims / record_total) * 100:.1f}%)")

# Descent analysis (including all categories)
print("\nDescent Analysis:")
descent_dist = df['Vict Descent'].value_counts(dropna=False).head(10)
# Code -> display name; codes not listed here are printed unchanged
descent_mapping = {
    'H': 'Hispanic/Latino',
    'W': 'White',
    'B': 'Black',
    'O': 'Other',
    'A': 'Asian',
    'Unknown': 'Unknown',
    'K': 'Korean',
    'F': 'Filipino',
    'C': 'Chinese',
    'J': 'Japanese',
    'V': 'Vietnamese',
    'I': 'American Indian',
    'P': 'Pacific Islander',
    'G': 'Guamanian',
    'S': 'Samoan',
    'U': 'Hawaiian',
    'Z': 'Asian Indian',
    'L': 'Laotian',
    'T': 'Thai',
    'D': 'Cambodian'
}

print("Top 10 Victim Descent Groups:")
rank = 0
for descent_code, n_victims in descent_dist.items():
    rank += 1
    descent_label = descent_mapping.get(descent_code, str(descent_code))
    share = (n_victims / record_total) * 100
    print(f"{rank:2d}. {descent_label:<20}: {n_victims:>6,} ({share:>5.1f}%)")

# Demographic visualizations
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Victim Demographics Analysis', fontsize=16, y=1.02)

# 1. Age distribution
axes[0, 0].hist(valid_ages, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Age Distribution (Valid Ages)')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Number of Victims')
axes[0, 0].grid(True, alpha=0.3)

# 2. Age groups
age_group_counts = df['age_group'].value_counts()
axes[0, 1].pie(age_group_counts.values, labels=age_group_counts.index, autopct='%1.1f%%', startangle=90)
axes[0, 1].set_title('Age Groups Distribution')

# 3. Gender distribution (including all categories)
gender_counts = df['Vict Sex'].value_counts(dropna=False)
gender_labels = [gender_mapping.get(x, str(x)) for x in gender_counts.index]
axes[0, 2].pie(gender_counts.values, labels=gender_labels, autopct='%1.1f%%', startangle=90)
axes[0, 2].set_title('Gender Distribution')

# 4. Top 10 descent groups
top_10_descent = df['Vict Descent'].value_counts(dropna=False).head(10)
descent_labels = [descent_mapping.get(d, str(d)) for d in top_10_descent.index]
axes[1, 0].barh(range(len(top_10_descent)), top_10_descent.values, color='lightcoral', alpha=0.8)
axes[1, 0].set_yticks(range(len(top_10_descent)))
axes[1, 0].set_yticklabels(descent_labels)
axes[1, 0].set_title('Top 10 Victim Descent Groups')
axes[1, 0].set_xlabel('Number of Victims')
axes[1, 0].grid(True, alpha=0.3)

# 5. Age vs Gender
df_valid = df[(df['Vict Sex'].notna()) & (df['age_group'] != 'Invalid/Missing')]
age_gender_crosstab = pd.crosstab(df_valid['age_group'], df_valid['Vict Sex'])
age_gender_crosstab.plot(kind='bar', ax=axes[1, 1], alpha=0.8)
axes[1, 1].set_title('Age Groups by Gender')
axes[1, 1].set_xlabel('Age Group')
axes[1, 1].set_ylabel('Number of Victims')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# 6. Age distribution by gender
# The previous filter AND-ed the sex mask with the raw 'Vict Age' column itself
# rather than with a boolean condition, so which rows were kept depended on the
# age values' bits instead of on validity. Use the same validity rule that
# defines `valid_ages` (age > -5) so panels 1 and 6 describe the same population.
valid_age_mask = df['Vict Age'] > -5
df_male = df[(df['Vict Sex'] == 'M') & valid_age_mask]
df_female = df[(df['Vict Sex'] == 'F') & valid_age_mask]
if len(df_male) > 0:
    df_male['Vict Age'].hist(bins=30, alpha=0.7, label='Male', ax=axes[1, 2])
if len(df_female) > 0:
    df_female['Vict Age'].hist(bins=30, alpha=0.7, label='Female', ax=axes[1, 2])
axes[1, 2].set_title('Age Distribution by Gender')
axes[1, 2].set_xlabel('Age')
axes[1, 2].set_ylabel('Number of Victims')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Demographic insights
print("\nKEY DEMOGRAPHIC INSIGHTS:")
print("-" * 35)

# Headline values: the first entry of each frequency table is the most common one
most_common_age = valid_ages.mode().iloc[0]
most_common_gender_code = gender_dist.index[0]
most_common_gender = gender_mapping.get(most_common_gender_code, str(most_common_gender_code))
top_descent_code = descent_dist.index[0]
most_common_descent = descent_mapping.get(top_descent_code, str(top_descent_code))
most_common_age_group = age_group_counts.index[0]

headline_rows = (
    f"Most common victim age: {most_common_age:.0f} years",
    f"Most common victim gender: {most_common_gender} ({gender_dist.iloc[0]:,} cases)",
    f"Most common victim descent: {most_common_descent} ({descent_dist.iloc[0]:,} cases)",
    f"Most common age group: {most_common_age_group} ({age_group_counts.iloc[0]:,} cases)",
)
for headline in headline_rows:
    print(headline)

# Statistical analysis
print("\nStatistical Analysis:")
has_both_sexes = 'M' in gender_dist.index and 'F' in gender_dist.index
if has_both_sexes:
    print(f"   Gender ratio (M:F): {gender_dist['M']/gender_dist['F']:.2f}:1")
upper_quartile = valid_ages.quantile(0.75)
lower_quartile = valid_ages.quantile(0.25)
print(f"   Age standard deviation: {valid_ages.std():.1f} years")
print(f"   Age IQR: {upper_quartile - lower_quartile:.1f} years")
print(f"   Descent diversity: {df['Vict Descent'].nunique()} different groups")

# Age group statistics: share of all records per age group
print("\nAge Group Statistics:")
record_total = len(df)
for group_label, group_size in age_group_counts.items():
    print(f"   {group_label:<20}: {group_size:>6,} ({(group_size / record_total) * 100:>5.1f}%)")


## 7. Correlation Analysis & Feature Relationships

This section explores relationships between different features to understand:
- **Numerical correlations** - How numerical features relate to each other
- **Categorical associations** - Relationships between categorical variables
- **Feature importance** - Which features are most predictive
- **Interaction effects** - How features work together
- **Multicollinearity** - Redundant features that might affect modeling

### 7.1 Statistical Correlation Analysis

We analyze correlations between numerical features and associations between categorical variables.


In [None]:
# Correlation Analysis & Feature Relationships
print("CORRELATION ANALYSIS & FEATURE RELATIONSHIPS")
print("="*55)

# Candidate numeric columns; only those present in the dataframe are analysed
numerical_cols = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd',
                  'Vict Age', 'Premis Cd', 'Weapon Used Cd', 'LAT', 'LON',
                  'year', 'month', 'day', 'hour', 'minute']
available_numerical_cols = [name for name in numerical_cols if name in df.columns]
print(f"Analyzing {len(available_numerical_cols)} numerical features")

# Pairwise correlation matrix and its "strong only" view (|r| > 0.3, rest blanked)
correlation_matrix = df[available_numerical_cols].corr()
strong_corr_mask = np.abs(correlation_matrix) > 0.3
strong_corr = correlation_matrix.where(strong_corr_mask)

fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Feature Correlation Analysis', fontsize=16, y=1.02)

# Panels 1 and 2: full heatmap and strong-correlation heatmap, drawn identically
heatmap_panels = (
    (correlation_matrix, axes[0, 0], 'Complete Correlation Matrix'),
    (strong_corr, axes[0, 1], 'Strong Correlations (|r| > 0.3)'),
)
for matrix, panel, panel_title in heatmap_panels:
    sns.heatmap(matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": .8}, ax=panel)
    panel.set_title(panel_title)
    panel.tick_params(axis='x', rotation=45)
    panel.tick_params(axis='y', rotation=0)

# Panel 3: absolute correlation of every feature with 'Part 1-2' (self excluded)
if 'Part 1-2' in df.columns:
    target_corr = (
        correlation_matrix['Part 1-2']
        .abs()
        .sort_values(ascending=False)
        .drop('Part 1-2')
    )
    bar_positions = range(len(target_corr))
    target_panel = axes[1, 0]
    target_panel.barh(bar_positions, target_corr.values, color='lightgreen', alpha=0.8)
    target_panel.set_yticks(bar_positions)
    target_panel.set_yticklabels(target_corr.index)
    target_panel.set_title('Correlation with Crime Severity (Part 1-2)')
    target_panel.set_xlabel('Absolute Correlation')
    target_panel.grid(True, alpha=0.3)

# Panel 4: distribution of the coefficients, taking each pair once (upper triangle)
corr_values = correlation_matrix.values
upper_rows, upper_cols = np.triu_indices_from(corr_values, k=1)
corr_values = corr_values[upper_rows, upper_cols]
distribution_panel = axes[1, 1]
distribution_panel.hist(corr_values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
distribution_panel.set_title('Distribution of Correlation Values')
distribution_panel.set_xlabel('Correlation Coefficient')
distribution_panel.set_ylabel('Frequency')
distribution_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find and display highly / moderately correlated pairs
print("\nHIGHLY CORRELATED FEATURE PAIRS:")
print("-" * 40)

# Walk the upper triangle once and bucket each pair by |r|:
#   high     : |r| > 0.7
#   moderate : 0.3 < |r| <= 0.7
# An undefined (NaN) correlation fails both comparisons and lands in neither list.
high_corr_pairs = []
moderate_corr_pairs = []
feature_names = correlation_matrix.columns
for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        corr_value = correlation_matrix.iloc[i, j]
        pair = (feature_names[i], feature_names[j], corr_value)
        if abs(corr_value) > 0.7:
            high_corr_pairs.append(pair)
        elif abs(corr_value) > 0.3:
            moderate_corr_pairs.append(pair)


def _by_strength(pair):
    """Sort key: absolute correlation of a (feature, feature, r) tuple."""
    return abs(pair[2])


# The arrow and comparison symbols below were previously emitted as mis-encoded bytes
if high_corr_pairs:
    print("Strong correlations (|r| > 0.7):")
    for col1, col2, corr in sorted(high_corr_pairs, key=_by_strength, reverse=True):
        print(f"  {col1} ↔ {col2}: {corr:.3f}")
else:
    print("No highly correlated pairs found (|r| > 0.7)")

# Moderate correlations (ten strongest only)
if moderate_corr_pairs:
    print("\nModerate correlations (0.3 < |r| ≤ 0.7):")
    for col1, col2, corr in sorted(moderate_corr_pairs, key=_by_strength, reverse=True)[:10]:
        print(f"  {col1} ↔ {col2}: {corr:.3f}")

# Categorical associations analysis
print("\n CATEGORICAL ASSOCIATIONS:")
print("-" * 30)

# Chi-square test of independence between each feature and the crime type
categorical_pairs = [
    ('AREA NAME', 'Crm Cd Desc'),
    ('time_category', 'Crm Cd Desc'),
    ('Vict Sex', 'Crm Cd Desc'),
    ('season', 'Crm Cd Desc'),
    ('day_name', 'Crm Cd Desc')
]

for cat1, cat2 in categorical_pairs:
    # Pairs whose columns are not in the dataframe are skipped silently
    if cat1 in df.columns and cat2 in df.columns:
        # Create contingency table (rows: cat1 values, columns: cat2 values)
        contingency_table = pd.crosstab(df[cat1], df[cat2])

        # Perform chi-square test
        try:
            chi2, p_value, dof, expected = chi2_contingency(contingency_table)

            # Calculate Cramér's V (effect size): sqrt(chi2 / (n * (min(rows, cols) - 1)))
            n = contingency_table.sum().sum()
            cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))

            if cramers_v > 0.3:
                strength = 'Strong'
            elif cramers_v > 0.1:
                strength = 'Moderate'
            else:
                strength = 'Weak'

            print(f"{cat1} vs {cat2}:")
            print(f"  Chi-square: {chi2:.2f}, p-value: {p_value:.2e}")
            print(f"  Cramér's V: {cramers_v:.3f} ({strength} association)")

        except Exception as e:
            # Best-effort: keep going with the remaining pairs, but report why
            # this one failed instead of discarding the error.
            print(f"{cat1} vs {cat2}: Error in chi-square test ({type(e).__name__}: {e})")

# Summary statistics
print("\nCORRELATION SUMMARY:")
print("-" * 25)
abs_corr_values = np.abs(corr_values)
print(f"  Total feature pairs analyzed: {len(corr_values)}")
print(f"  High correlations (|r| > 0.7): {len(high_corr_pairs)}")
# The '≤' below was previously emitted as mis-encoded bytes
print(f"  Moderate correlations (0.3 < |r| ≤ 0.7): {len(moderate_corr_pairs)}")
# nan-aware reductions: one undefined correlation would otherwise turn both
# summary figures into NaN
print(f"  Mean absolute correlation: {np.nanmean(abs_corr_values):.3f}")
print(f"  Max absolute correlation: {np.nanmax(abs_corr_values):.3f}")
# Counts features whose standard deviation exceeds their mean
high_variance_features = [col for col in available_numerical_cols if df[col].std() > df[col].mean()]
print(f"  Features with high variance: {len(high_variance_features)}")

print("\n MULTICOLLINEARITY WARNING:")
print("-" * 30)
if len(high_corr_pairs) > 0:
    print("  High correlations detected! Consider:")
    print("  - Removing redundant features")
    print("  - Using PCA or other dimensionality reduction")
    print("  - Regularization techniques in modeling")
else:
    print("  No severe multicollinearity detected")
    print("  Features appear to be relatively independent")


## 8. Data Quality Assessment and Imputation

This section addresses data quality issues and implements appropriate imputation strategies:
- **Missing value patterns** - Understanding what data is missing and why
- **Logical imputation** - Using domain knowledge for intelligent imputation
- **Weapon usage analysis** - Categorizing crimes by weapon involvement
- **Victim age cleaning** - Handling invalid or inconsistent age values




### 8.1 Weapon Usage Classification and Imputation

We categorize crimes based on whether they typically involve weapons and use this information for intelligent imputation of missing weapon data.

In [None]:
from crimes_lists import weapon_crimes, non_weapon_crimes

# Report how many crime descriptions each project-maintained list classifies,
# next to the number of distinct descriptions actually present in the data.
n_weapon_types = len(weapon_crimes)
n_non_weapon_types = len(non_weapon_crimes)
n_dataset_types = df['Crm Cd Desc'].nunique()

print(f"   Weapon-expected crimes: {n_weapon_types} types")
print(f"   Non-weapon crimes: {n_non_weapon_types} types")
print(f"   Total classified: {n_weapon_types + n_non_weapon_types} types")
print(f"   Total unique crimes in dataset: {n_dataset_types}")

In [None]:
print("WEAPON IMPUTATION")
print("=" * 80)
print("Implementing logic-based imputation for 'Weapon Used Cd' column")
print()

# Numeric sentinel codes written into 'Weapon Used Cd'. They are numeric on purpose:
# the column is transformed with np.cbrt later in the notebook.
WEAPON_UNSPECIFIED_CD = 999.9  # weapon-expected crime whose weapon was not recorded
NO_WEAPON_CD = 111.1           # non-weapon crime, or a crime in neither list

all_crimes = set(df['Crm Cd Desc'].unique())
categorized_crimes = set(non_weapon_crimes).union(weapon_crimes)
# Coverage must only count categorized crimes that actually occur in the data,
# otherwise list entries absent from the dataset push it above 100%.
categorized_in_data = all_crimes & categorized_crimes

print(f"Total unique crimes in dataset: {len(all_crimes)}")
print(f"Crimes categorized: {len(categorized_crimes)}")
print(f"Coverage: {len(categorized_in_data)/len(all_crimes)*100:.1f}%")

# Impute on a copy; the main frame is replaced only after the step completes.
df_imputed = df.copy()

missing_mask = df_imputed['Weapon Used Cd'].isna()
missing_before = missing_mask.sum()
print(f"Missing values before imputation: {missing_before:,} ({missing_before/len(df_imputed)*100:.1f}%)")

# Vectorised rule: recorded codes are kept; missing codes get a sentinel chosen by
# whether the crime description is in the weapon-expected list.
weapon_expected = df_imputed['Crm Cd Desc'].isin(weapon_crimes)
df_imputed.loc[missing_mask & weapon_expected, 'Weapon Used Cd'] = WEAPON_UNSPECIFIED_CD
df_imputed.loc[missing_mask & ~weapon_expected, 'Weapon Used Cd'] = NO_WEAPON_CD

missing_after = df_imputed['Weapon Used Cd'].isna().sum()
print(f"Missing values after imputation: {missing_after:,} ({missing_after/len(df_imputed)*100:.1f}%)")

# Report by sentinel code; the column holds numbers, so string labels never match.
imputation_results = df_imputed['Weapon Used Cd'].value_counts()
n_no_weapon = int((missing_mask & ~weapon_expected).sum())
n_unspecified = int((missing_mask & weapon_expected).sum())
n_original_codes = imputation_results.index.difference([NO_WEAPON_CD, WEAPON_UNSPECIFIED_CD]).size
print(f"\nImputation results:")
print(f"  â€¢ 'NO WEAPON USED' (code {NO_WEAPON_CD}): {n_no_weapon:,} cases")
print(f"  â€¢ 'UNSPECIFIED' (code {WEAPON_UNSPECIFIED_CD}): {n_unspecified:,} cases")
print(f"  â€¢ Original weapon codes: {n_original_codes:,} different codes")

print()

# Update the main dataframe
df = df_imputed.copy()

print(f"Imputed {missing_before:,} missing values using crime-type logic")
print("=" * 80)


In [None]:
# Compact weapon-code imputation.
# 'Weapon Used Cd' must stay numeric (it is transformed with np.cbrt later in the
# notebook), so missing codes are filled with numeric sentinels, never text labels.
# The cell is safe to re-run: once no codes are missing it changes nothing.
print("Weapon Imputation")

WEAPON_UNSPECIFIED_CD = 999.9  # weapon-expected crime whose weapon was not recorded
NO_WEAPON_CD = 111.1           # non-weapon crime, or a crime in neither list

df_imputed = df.copy()
missing_mask = df_imputed['Weapon Used Cd'].isna()
missing_before = missing_mask.sum()

# Recorded codes are kept; only missing ones receive a sentinel.
weapon_expected = df_imputed['Crm Cd Desc'].isin(weapon_crimes)
df_imputed.loc[missing_mask & weapon_expected, 'Weapon Used Cd'] = WEAPON_UNSPECIFIED_CD
df_imputed.loc[missing_mask & ~weapon_expected, 'Weapon Used Cd'] = NO_WEAPON_CD
missing_after = df_imputed['Weapon Used Cd'].isna().sum()

print(f"\nImputation results:")
print(f"Missing values before: {missing_before:,} ({missing_before/len(df_imputed)*100:.1f}%)")
print(f"Missing values after: {missing_after:,}")

# Totals per sentinel code currently in the column (includes earlier imputations).
imputation_results = df_imputed['Weapon Used Cd'].value_counts()
print(f"'NO WEAPON USED': {imputation_results.get(NO_WEAPON_CD, 0):,} cases")
print(f"'UNSPECIFIED': {imputation_results.get(WEAPON_UNSPECIFIED_CD, 0):,} cases")

# Update main dataframe
df = df_imputed.copy()
print("Weapon imputation completed")


In [None]:
print("Applying the same logic to 'Weapon Desc' column for consistency")


def impute_weapon_desc(row):
    """Return the weapon description for one record, inferring it when missing.

    A recorded description is returned unchanged. A missing one becomes
    'UNSPECIFIED' when the crime description is in ``weapon_crimes`` and
    'NO WEAPON USED' otherwise.
    """
    recorded = row['Weapon Desc']
    if not pd.isna(recorded):
        return recorded
    return 'UNSPECIFIED' if row['Crm Cd Desc'] in weapon_crimes else 'NO WEAPON USED'


# Fill the description column record by record
df['Weapon Desc'] = df.apply(impute_weapon_desc, axis=1)


In [None]:
# Binary flag: 1 when the weapon description is anything other than the
# 'NO WEAPON USED' placeholder, 0 otherwise.
weapon_involved = df['Weapon Desc'].ne('NO WEAPON USED')
df['is_used_weapon'] = weapon_involved.astype(int)

### 8.2 Victim Demographics Imputation

This section cleans victim age and implements intelligent imputation for victim sex and descent information:
- Normalizing invalid victim ages (zero or negative) and imputing them by crime type
- Converting coded missing values to standardized formats
- Using crime-type-based imputation for missing demographics
- Ensuring data consistency across victim characteristics


In [None]:
def clean_victim_age(df):
    """Collapse the victim ages -5 through 0 into the single value -1.

    The 'Vict Age' column of ``df`` is overwritten in place and the same
    frame is returned.
    """
    ages_to_collapse = list(range(-5, 1))
    df['Vict Age'] = df['Vict Age'].replace(ages_to_collapse, -1)
    return df

# Normalise the zero/negative victim ages on the working frame (the function edits df in place).
df = clean_victim_age(df)

In [None]:
from crimes_lists import human_victim_crimes, not_human_victim_crimes

def categorize_crime_victim_type(crime_desc, human_crimes=None, non_human_crimes=None):
    """Classify a crime description as having a human victim (1) or not (0).

    Matching is a case-insensitive substring test. The human-victim keywords
    are checked first, so a description matching both lists is classified 1.

    Parameters
    ----------
    crime_desc : str
        Crime description text. Non-string input (e.g. NaN) cannot be matched
        and is given the default class.
    human_crimes, non_human_crimes : iterable of str, optional
        Keyword lists to match against. They default to the module-level
        ``human_victim_crimes`` and ``not_human_victim_crimes``.

    Returns
    -------
    int
        1 for a human victim, 0 for a non-human victim. Descriptions that
        match neither list default to 1 so their ages are still imputed.
    """
    if human_crimes is None:
        human_crimes = human_victim_crimes
    if non_human_crimes is None:
        non_human_crimes = not_human_victim_crimes

    # Missing descriptions have no text to match; fall back to the default class.
    if not isinstance(crime_desc, str):
        return 1

    crime_desc_upper = crime_desc.upper()

    if any(keyword.upper() in crime_desc_upper for keyword in human_crimes):
        return 1
    if any(keyword.upper() in crime_desc_upper for keyword in non_human_crimes):
        return 0

    # Default to human victim if unclear. This must be the integer 1: callers
    # compare the result with == 1 / == 0.
    return 1

def fill_vict_age_outliers(df):
    """Re-code victim ages in the range -4..0 according to the crime's victim type.

    Works on a copy of ``df`` and returns it with an added 'is_human_victim'
    column (the result of ``categorize_crime_victim_type`` per crime description).
    Within the -4..0 range:
    - rows flagged 1 (human victim) get age -5, to be filled with a real age later
    - rows flagged 0 (non-human victim) get age -1 (not applicable)
    Rows with any other flag value keep their age.
    """
    result = df.copy()
    result['is_human_victim'] = result['Crm Cd Desc'].apply(categorize_crime_victim_type)

    # Evaluated once, before any re-coding, so both assignments use the same rows.
    in_placeholder_range = result['Vict Age'].between(-4, 0)

    for victim_flag, sentinel_age in ((1, -5), (0, -1)):
        selected = in_placeholder_range & (result['is_human_victim'] == victim_flag)
        result.loc[selected, 'Vict Age'] = sentinel_age

    return result

# Re-code the placeholder ages and continue with the returned copy
df = df_cleaned = fill_vict_age_outliers(df)

In [None]:
# Replace the -5 placeholder with the most frequent observed age for the same crime type
needs_age = df['Vict Age'] == -5
crimes_with_minus_five = df.loc[needs_age, 'Crm Cd Desc'].value_counts()
print(f"\nCrime types with -5 values:")
print(crimes_with_minus_five)

# Only strictly positive ages count as observed, which excludes both placeholders (-5, -1).
# The overall mode is taken before any replacement, so imputed values cannot influence it.
observed_ages = df.loc[df['Vict Age'] > 0, ['Crm Cd Desc', 'Vict Age']]
overall_mode = observed_ages['Vict Age'].mode()

# Pick one replacement age per affected crime type.
replacement_ages = {}
for crime_type in crimes_with_minus_five.index:
    crime_modes = observed_ages.loc[observed_ages['Crm Cd Desc'] == crime_type, 'Vict Age'].mode()
    if not crime_modes.empty:
        # Several ages can tie; mode() returns them sorted and the smallest is used.
        replacement_ages[crime_type] = crime_modes.iloc[0]
    elif not overall_mode.empty:
        # No observed age for this crime type: fall back to the dataset-wide mode.
        replacement_ages[crime_type] = overall_mode.iloc[0]
    # Otherwise there are no observed ages at all and the -5 placeholder is left in place.

# Apply every replacement exactly once, after all of them are known.
for crime_type, replacement_age in replacement_ages.items():
    df.loc[needs_age & (df['Crm Cd Desc'] == crime_type), 'Vict Age'] = replacement_age
     
    

In [None]:
# Comprehensive victim demographics imputation
def handle_victim_demographics(df):
    """Standardise and impute 'Vict Sex' and 'Vict Descent'.

    For each of the two columns:
    1. the coded value 'X' is replaced by 'Unknown';
    2. missing values are filled with the most common value among records of
       the same 'Crm Cd Desc' (ties resolve to the first sorted value);
    3. anything still missing (no usable value for that crime type, or a
       missing crime description) becomes 'Unknown'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Crm Cd Desc', 'Vict Sex' and 'Vict Descent'.

    Returns
    -------
    pandas.DataFrame
        A processed copy; the input frame is not modified.
    """
    def _most_common(values):
        # mode() ignores NaN, so an all-missing group yields an empty result.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else 'Unknown'

    df_processed = df.copy()

    for column in ('Vict Sex', 'Vict Descent'):
        # Step 1: Handle X values
        df_processed[column] = df_processed[column].replace('X', 'Unknown')

        # Step 2: Crime-type-based imputation, vectorised: look up each row's
        # typical value by crime type and use it only where the value is missing.
        typical_by_crime = df_processed.groupby('Crm Cd Desc')[column].agg(_most_common)
        typical_for_row = df_processed['Crm Cd Desc'].map(typical_by_crime)

        # Step 3: Final cleanup - any remaining NaN to 'Unknown'
        df_processed[column] = df_processed[column].fillna(typical_for_row).fillna('Unknown')

    return df_processed

# Impute the demographics and continue with the processed copy
df = df_processed = handle_victim_demographics(df)

### 8.3 Date and Time Formatting

Standardizing date and time formats for consistency and proper analysis.


In [None]:
# Parse both date columns; values that cannot be parsed become NaT.
df['Date Rptd'] = pd.to_datetime(df['Date Rptd'], errors='coerce')
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], errors='coerce')

# Keep the raw time as a zero-padded 4-character HHMM string in TIME_OCC_PAD.
# It is built only once: after this cell 'TIME OCC' holds 'HH:MM' text, and
# padding that again would make every value unparseable on a re-run.
if 'TIME_OCC_PAD' not in df.columns:
    df['TIME_OCC_PAD'] = df['TIME OCC'].astype(str).str.zfill(4)

# Convert to 24-hour 'HH:MM' text; values that are not valid HHMM become NaN.
df['TIME OCC'] = (
    pd.to_datetime(df['TIME_OCC_PAD'], format='%H%M', errors='coerce')
    .dt.strftime('%H:%M')
)

### 8.4 Premises Data Quality Assessment

Removing duplicate records (ignoring the `DR_NO` identifier), then analyzing and handling missing premises information to maintain data completeness.


In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# Same count with the DR_NO column removed, i.e. rows that are identical apart from DR_NO.
df_no_id = df.drop(columns=['DR_NO'])
df_no_id.duplicated().sum()

In [None]:
# Drop rows that are identical in every column except DR_NO, keeping the first occurrence.
dedup_columns = df.columns.difference(['DR_NO'])
df = df.drop_duplicates(subset=dedup_columns)

# Verify with the same column subset that was used for dropping; a full-row
# check would not test the rule that was actually applied.
df.duplicated(subset=dedup_columns).sum()

In [None]:
# Cross-tabulate missingness of the premises code against its description
missing_premis_cd = df['Premis Cd'].isna()
missing_premis_desc = df['Premis Desc'].isna()

combined_missing = pd.concat(
    [
        missing_premis_cd.rename('Premis Cd Missing'),
        missing_premis_desc.rename('Premis Desc Missing'),
    ],
    axis=1,
)

# One row per (code missing, description missing) combination with its frequency
summary = combined_missing.value_counts().reset_index(name='Count')
print(summary)

In [None]:
def _most_frequent_desc(descriptions):
    """Return the most frequent non-missing description, or None if there is none."""
    modes = descriptions.mode()
    return modes.iloc[0] if not modes.empty else None


# Most frequent description recorded for each premises code
premis_map = df.groupby('Premis Cd')['Premis Desc'].agg(_most_frequent_desc).to_dict()

# Fill only the missing descriptions, looked up by code. Codes without any known
# description (and rows without a code) stay missing.
df['Premis Desc'] = df['Premis Desc'].fillna(df['Premis Cd'].map(premis_map))

In [None]:
# Missingness of code vs. description, recomputed on the current frame.
# The two masks are reused further down in the notebook.
missing_premis_cd = df['Premis Cd'].isna()
missing_premis_desc = df['Premis Desc'].isna()

combined_missing = pd.DataFrame(
    {'Premis Cd Missing': missing_premis_cd, 'Premis Desc Missing': missing_premis_desc}
)

combination_counts = combined_missing.value_counts().rename('Count')
summary = combination_counts.reset_index()
print(summary)

In [None]:
# Codes for which the 'Premis Desc' is missing, derived from the data so the
# check stays valid if the dataset or the earlier cleaning steps change.
premis_cd_for_missing_desc = df.loc[
    df['Premis Desc'].isna() & df['Premis Cd'].notna(), 'Premis Cd'
].unique()

# Rows where 'Premis Cd' is one of those codes AND 'Premis Desc' is available.
# Any hit is a description that could still be used to fill the gaps.
matches = df[
    df['Premis Cd'].isin(premis_cd_for_missing_desc) &
    df['Premis Desc'].notna()
]

matches[['Premis Cd', 'Premis Desc']].drop_duplicates().sort_values('Premis Cd')


In [None]:
# Distinct premises codes on rows that have a code but no description
code_without_desc = df['Premis Desc'].isna() & df['Premis Cd'].notna()
premis_cd_for_missing_desc = df.loc[code_without_desc, 'Premis Cd'].unique()

pd.DataFrame({'Premis Cd (Missing Desc)': premis_cd_for_missing_desc})

In [None]:
# Drop every row whose premises description is still missing, whether or not it
# has a premises code. The mask is recomputed from the current frame so this cell
# does not depend on masks left over from an earlier cell.
mask_to_drop = df['Premis Desc'].isna()

df = df[~mask_to_drop].copy()

### 8.5 Modus Operandi (Mocodes) Analysis and Imputation

Analyzing the relationship between Mocodes and other features, and implementing intelligent imputation for missing values.


In [None]:
def _count_mocodes(raw_codes):
    """Number of whitespace-separated codes in one 'Mocodes' string ('' gives 0)."""
    return len(raw_codes.split())


# Count the codes per record; a missing 'Mocodes' value counts as zero codes
df['Mocodes_Count'] = df['Mocodes'].fillna('').map(_count_mocodes)

numeric_df = df.select_dtypes(include=['int64', 'float64']).copy()
numeric_df['Mocodes_Count'] = df['Mocodes_Count']

# Correlation of every numeric feature with the code count, strongest first,
# leaving out the count's correlation with itself
correlation_with_mocodes = (
    numeric_df.corr()['Mocodes_Count']
    .sort_values(ascending=False)
    .drop('Mocodes_Count')
)
print(correlation_with_mocodes)



In [None]:
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    Builds the contingency table of ``x`` against ``y``, takes the chi-square
    statistic without continuity correction, and returns
    sqrt((chi2 / n) / min(columns - 1, rows - 1)).
    """
    table = pd.crosstab(x, y)
    chi2_stat, *_ = chi2_contingency(table, correction=False)
    total_count = table.sum().sum()
    n_rows, n_cols = table.shape
    mean_square_contingency = chi2_stat / total_count
    return np.sqrt(mean_square_contingency / min(n_cols - 1, n_rows - 1))

categorical_columns = df.select_dtypes(include='object').columns.tolist()

# Exclude long text fields or date-related columns
excluded_columns = ['Date Rptd', 'DATE OCC', 'LOCATION', 'Cross Street', 'Mocodes']
categorical_columns = [col for col in categorical_columns if col not in excluded_columns]

# Association of each categorical column with the number of Mocodes.
# A failing column is skipped but recorded, so it is not silently missing from the result.
results = {}
skipped_columns = {}
for col in categorical_columns:
    try:
        results[col] = cramers_v(df['Mocodes_Count'], df[col])
    except Exception as exc:
        skipped_columns[col] = f"{type(exc).__name__}: {exc}"

if skipped_columns:
    print("Columns skipped (Cramér's V could not be computed):")
    for col, reason in skipped_columns.items():
        print(f"  {col}: {reason}")

pd.Series(results, dtype='float64').sort_values(ascending=False)


In [None]:
def _most_frequent_mocodes(codes):
    """Return the most frequent non-missing 'Mocodes' string, or NaN if there is none."""
    modes = codes.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Create a dictionary of the most frequent Mocodes for each crime type
mocode_map = df.groupby('Crm Cd Desc')['Mocodes'].agg(_most_frequent_mocodes).to_dict()

# Fill missing Mocodes values based on the crime type. map() yields NaN for a
# crime type that is missing or has no known Mocodes, so nothing raises here.
typical_mocodes = df['Crm Cd Desc'].map(mocode_map)

# Whatever is still missing gets the explicit 'noMocode' marker.
df['Mocodes'] = df['Mocodes'].fillna(typical_mocodes).fillna('noMocode')

In [None]:
# Pairwise correlation matrix over the int64/float64 columns of the current frame.
numeric_df = df.select_dtypes(include=['int64', 'float64'])

correlation_matrix = numeric_df.corr()

### 8.6 Outlier Detection and Analysis

Identifying and analyzing outliers in numerical features using statistical methods.


In [None]:
# Names of the int64/float64 columns; reused by the outlier, boxplot and histogram cells below.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# IQR
def detect_outliers_iqr(data, col):
    """Return the rows of ``data`` whose ``col`` value lies outside the 1.5 * IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of the column. Values exactly on a fence, and
    missing values, are not reported.
    """
    values = data[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    return data[too_low | too_high]

# Outlier count and share of rows for every numeric column
outlier_summary = {}
total_rows = df.shape[0]
for col in numeric_cols:
    n_outliers = detect_outliers_iqr(df, col).shape[0]
    outlier_summary[col] = {
        'outlier_count': n_outliers,
        'percentage': round((n_outliers / total_rows) * 100, 2),
    }

# One row per column, most outliers first
outlier_df = pd.DataFrame(outlier_summary).T.sort_values(by='outlier_count', ascending=False)
print("outliers value")
print(outlier_df)


In [None]:

# One boxplot per numeric column
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot for {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()


In [None]:
# Minimum and maximum of each coordinate column
for coord in ('LAT', 'LON'):
    print(f"{coord} range:", df[coord].min(), "-", df[coord].max())

In [None]:
# Keep only rows where both coordinates are non-zero. The explicit copy makes the
# result an independent frame, so later column assignments do not act on a slice
# (same pattern as the premises filter above).
df = df[
    (df['LAT'] != 0.0) &
    (df['LON'] != 0.0)
].copy()

In [None]:
# IQR outlier summary, recomputed after the rows with zero LAT/LON were removed.
# detect_outliers_iqr and numeric_cols are defined in the first outlier-detection
# cell of this section; they are reused here rather than defined a second time.
total_rows = df.shape[0]
outlier_summary = {}
for col in numeric_cols:
    n_outliers = detect_outliers_iqr(df, col).shape[0]
    outlier_summary[col] = {
        'outlier_count': n_outliers,
        'percentage': round((n_outliers / total_rows) * 100, 2)
    }

outlier_df = pd.DataFrame(outlier_summary).T.sort_values(by='outlier_count', ascending=False)
print("outliers value")
print(outlier_df)

In [None]:
# Coordinate ranges of the current frame
lat_low, lat_high = df['LAT'].min(), df['LAT'].max()
lon_low, lon_high = df['LON'].min(), df['LON'].max()
print("LAT range:", lat_low, "-", lat_high)
print("LON range:", lon_low, "-", lon_high)

### 8.7 Skewness

In [None]:
# One histogram (30 bins, with a KDE curve) per numeric column
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=col, bins=30, kde=True, ax=ax)
    ax.set_title(f'Histogram for {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()


In [None]:
# Skewness of every int64/float64 column, most right-skewed first
numeric_features = df.select_dtypes(include=['int64', 'float64'])
skew_values = numeric_features.skew().sort_values(ascending=False)

print(skew_values)

In [None]:
# Skew-reducing transform per column, applied in this order and overwriting the
# column. First three: high skewness (|skew| > 0.75); last two: moderately
# skewed (|skew| between 0.5 and 0.75).
skew_transforms = {
    'Mocodes_Count': np.log1p,   # right-skewed -> log1p
    'Weapon Used Cd': np.cbrt,   # left-skewed -> cbrt
    'LON': np.cbrt,              # close to left-skewed -> cbrt
    'Premis Cd': np.log1p,
    'Crm Cd': np.log1p,
}

for column, transform in skew_transforms.items():
    df[column] = transform(df[column])

In [None]:
# Skewness again, now on the transformed columns
skew_values = (
    df.select_dtypes(include=['int64', 'float64'])
    .skew()
    .sort_values(ascending=False)
)
print(skew_values)

In [None]:
def calculate_weighted_risk_score(df):
    """Compute a 0-100 weighted risk score for every 'AREA NAME'.

    Each crime is scored as recency weight * severity weight:
    - recency weight is exp(-days_ago / 365), with days counted back from the
      latest 'DATE OCC' in ``df``;
    - severity weight is 2.0 for 'Part 1-2' == 1 and 1.0 for 'Part 1-2' == 2.

    Side effect: the per-crime columns 'date_occurred', 'days_ago',
    'recency_weight', 'severity_weight' and 'crime_score' are added to ``df``.

    Returns a frame with one row per area and the columns 'AREA NAME',
    'crime_count', 'weighted_score', 'serious_crimes', 'avg_days_ago',
    'serious_crime_ratio' and 'risk_score'. The risk score is
    100 * (0.7 * weighted_score / max weighted_score + 0.3 * serious_crime_ratio).
    """
    # Per-crime recency: exponential decay over the days since the latest occurrence
    df['date_occurred'] = pd.to_datetime(df['DATE OCC'])
    latest_occurrence = df['date_occurred'].max()
    df['days_ago'] = (latest_occurrence - df['date_occurred']).dt.days
    df['recency_weight'] = np.exp(-df['days_ago'] / 365)

    # Per-crime severity: Part 1 crimes count double
    df['severity_weight'] = df['Part 1-2'].map({1: 2.0, 2: 1.0})
    df['crime_score'] = df['recency_weight'] * df['severity_weight']

    # Area-level totals
    per_area = df.groupby('AREA NAME').agg(
        crime_count=('DR_NO', 'count'),
        weighted_score=('crime_score', 'sum'),
        serious_crimes=('Part 1-2', lambda parts: (parts == 1).sum()),
        avg_days_ago=('days_ago', 'mean'),
    ).reset_index()

    per_area['serious_crime_ratio'] = per_area['serious_crimes'] / per_area['crime_count']

    # Blend relative weighted volume (70%) with the serious-crime share (30%), scaled to 0-100
    relative_volume = per_area['weighted_score'] / per_area['weighted_score'].max()
    per_area['risk_score'] = (0.7 * relative_volume + 0.3 * per_area['serious_crime_ratio']) * 100

    return per_area

def add_area_risk_levels(df):
    """Attach area-level risk columns to ``df`` and return them with the area table.

    Areas are scored with ``calculate_weighted_risk_score`` and split into
    quartiles of the risk score. Three columns are added to ``df`` by looking
    up each row's 'AREA NAME':
    - 'area_risk_level': 'Low Risk', 'Medium Risk', 'High Risk' or 'Very High Risk'
    - 'area_risk_level_numeric': the same quartile as 1-4
    - 'area_risk_score': the area's risk score

    Returns ``(df, area_risk_scores)``, where ``area_risk_scores`` has one row
    per area sorted by descending risk score. The per-crime helper columns
    created while scoring are removed from the returned frame.
    """
    # Score every area, highest risk first
    area_risk_scores = calculate_weighted_risk_score(df).sort_values('risk_score', ascending=False)

    # Quartile bins of the score, once with text labels and once with numbers
    area_risk_scores['risk_level'] = pd.qcut(
        area_risk_scores['risk_score'],
        q=4,
        labels=['Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk'],
    )
    area_risk_scores['risk_level_numeric'] = pd.qcut(
        area_risk_scores['risk_score'],
        q=4,
        labels=[1, 2, 3, 4],
    ).astype(int)

    # Broadcast each area-level value onto the crime rows through a name -> value lookup
    column_sources = (
        ('area_risk_level', 'risk_level'),
        ('area_risk_level_numeric', 'risk_level_numeric'),
        ('area_risk_score', 'risk_score'),
    )
    for target_column, source_column in column_sources:
        lookup = dict(zip(area_risk_scores['AREA NAME'], area_risk_scores[source_column]))
        df[target_column] = df['AREA NAME'].map(lookup)

    # Remove the scoring intermediates that are present on the frame
    scoring_columns = ('date_occurred', 'days_ago', 'recency_weight', 'severity_weight', 'crime_score')
    leftover_columns = [name for name in scoring_columns if name in df.columns]
    if leftover_columns:
        df = df.drop(columns=leftover_columns)

    return df, area_risk_scores

# Run the area risk analysis on a copy of the working frame
banner = "=" * 60
print("\n" + banner)
print("CALCULATING WEIGHTED AREA RISK LEVELS")
print(banner)

df, area_statistics = add_area_risk_levels(df.copy())

# Report the ten highest-scoring areas (the table is already sorted by risk score)
rule = "-" * 80
print("\nðŸ”¥ Top 10 Highest Risk Areas:")
print(rule)
print(f"{'Area':<20} {'Risk Score':<12} {'Crimes':<10} {'Serious %':<12} {'Risk Level':<15}")
print(rule)

for row in area_statistics.head(10).to_dict('records'):
    print(f"{row['AREA NAME']:<20} {row['risk_score']:>10.2f} {row['crime_count']:>10,} "
          f"{row['serious_crime_ratio']*100:>10.1f}% {row['risk_level']:<15}")




### 8.8 Final Data Export

Saving the cleaned and processed dataset for further analysis and modeling.


In [None]:
# Output locations
output_filename = 'data/cleaned_crime_data.csv'
area_stats_filename = 'area_weighted_risk_statistics.csv'

# Write the cleaned crime records
df.to_csv(output_filename, index=False)
print(f"\nâœ… Enhanced dataset saved as: {output_filename}")

# Write the per-area risk table
area_statistics.to_csv(area_stats_filename, index=False)
print(f"âœ… Area statistics saved as: {area_stats_filename}")

print(f"\nðŸ“Š Process completed! Added 3 new columns:")
for added_column_note in (
    "- area_risk_level (categorical)",
    "- area_risk_level_numeric (numeric 1-4)",
    "- area_risk_score (raw score 0-100)",
):
    print(added_column_note)

In [None]:
# Column names of the final frame
list(df.columns)