## 1. Setup and Data Loading

Import necessary libraries and load the Los Angeles crime dataset.

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


In [16]:
# Load the dataset
df = pd.read_csv('Crime_Data_from_2020_to_Present.csv')

print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"\nColumn names:")
print(df.columns.tolist())

# A notebook cell only auto-renders its LAST expression. The head() preview sits
# in the middle of the cell, so it has to be displayed explicitly; otherwise the
# "First few rows" heading is followed by nothing.
print(f"\nFirst few rows:")
display(df.head())

# Fixed seed so the random preview is the same on every run
print(f"\n10 Random rows:")
df.sample(n=10, random_state=42)

Dataset loaded: 1,004,991 rows, 28 columns

Column names:
['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON']

First few rows:

10 Random rows:


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
129061,200216570,10/23/2020 12:00:00 AM,10/01/2020 12:00:00 AM,2000,2,Rampart,249,2,956,"LETTERS, LEWD - TELEPHONE CALLS, LEWD",...,AA,Adult Arrest,956.0,,,,600 ST PAUL AV,,34.0542,-118.2611
324085,211009787,07/04/2021 12:00:00 AM,07/03/2021 12:00:00 AM,2250,10,West Valley,1011,2,354,THEFT OF IDENTITY,...,AA,Adult Arrest,354.0,,,,ARMINTA,WOODLEY,34.2047,-118.5531
120194,201225286,11/26/2020 12:00:00 AM,11/26/2020 12:00:00 AM,1610,12,77th Street,1241,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,IC,Invest Cont,230.0,,,,6400 WEST BL,,33.981,-118.3352
738250,231614576,12/01/2023 12:00:00 AM,10/04/2023 12:00:00 AM,900,16,Foothill,1657,1,440,THEFT PLAIN - PETTY ($950 & UNDER),...,IC,Invest Cont,440.0,,,,10100 HILLHAVEN AV,,34.2523,-118.2898
494997,221007431,04/01/2022 12:00:00 AM,03/31/2022 12:00:00 AM,1830,10,West Valley,1004,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),...,IC,Invest Cont,420.0,,,,18300 ROSCOE BL,,34.2208,-118.5317
973179,240207085,02/29/2024 12:00:00 AM,02/29/2024 12:00:00 AM,2100,2,Rampart,275,2,626,INTIMATE PARTNER - SIMPLE ASSAULT,...,IC,Invest Cont,626.0,,,,1600 W 11TH ST,,34.0506,-118.2769
387172,210513939,09/21/2021 12:00:00 AM,09/21/2021 12:00:00 AM,1338,5,Harbor,521,2,930,CRIMINAL THREATS - NO WEAPON DISPLAYED,...,IC,Invest Cont,930.0,,,,1800 N TAPER AV,,33.7614,-118.2989
254314,210213404,08/18/2021 12:00:00 AM,08/18/2021 12:00:00 AM,1400,2,Rampart,233,2,626,INTIMATE PARTNER - SIMPLE ASSAULT,...,IC,Invest Cont,626.0,,,,2700 BEVERLY BL,,34.0699,-118.2777
755729,231013989,08/14/2023 12:00:00 AM,08/08/2023 12:00:00 AM,1400,10,West Valley,1005,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,...,IC,Invest Cont,331.0,,,,7600 HESPERIA AV,,34.2084,-118.5263
975464,240613742,12/03/2024 12:00:00 AM,11/29/2024 12:00:00 AM,1624,6,Hollywood,637,1,440,THEFT PLAIN - PETTY ($950 & UNDER),...,IC,Invest Cont,440.0,,,,6200 HOLLYWOOD BL,,34.103,-118.3225


## 2. Data Exploration and Cleaning

Explore the data structure, check for missing values, and perform basic data cleaning.

In [17]:
# Basic data information
print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"\nMissing values:")

# Null count per column and the same figure as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_summary = pd.concat(
    [missing_data.rename('Missing Count'), missing_percent.rename('Percentage')],
    axis=1,
)

# Display only the columns that have at least one gap, worst first
columns_with_gaps = missing_summary['Missing Count'] > 0
missing_summary.loc[columns_with_gaps].sort_values('Missing Count', ascending=False)

Dataset Info:
Shape: (1004991, 28)

Missing values:


Unnamed: 0,Missing Count,Percentage
Crm Cd 4,1004927,99.993632
Crm Cd 3,1002677,99.769749
Crm Cd 2,935831,93.118346
Cross Street,850755,84.652997
Weapon Desc,677744,67.437818
Weapon Used Cd,677744,67.437818
Mocodes,151619,15.086603
Vict Descent,144656,14.393761
Vict Sex,144644,14.392567
Premis Desc,588,0.058508


In [18]:
# SMART DATA CLEANING - Core Features Strategy
banner = "=" * 80
print(banner)
print("üìä SMART DATA CLEANING STRATEGY")
print(banner)
print(f"üóÇÔ∏è  ORIGINAL DATASET: {len(df):,} total records")

# Columns a record must carry to be usable: a district identifier, the
# occurrence date and the crime description.
core_features_required = dict(
    district=['AREA', 'AREA NAME'],    # At least one district identifier
    time=['DATE OCC'],                 # Crime occurrence date
    crime_type=['Crm Cd Desc'],        # Crime description
)
district_columns = core_features_required['district']
time_column = core_features_required['time'][0]
crime_column = core_features_required['crime_type'][0]

print(f"\nüéØ CORE FEATURES REQUIRED:")
print(f"   District: Need at least one of {district_columns}")
print(f"   Time: {time_column} (when crime occurred)")
print(f"   Crime Type: {crime_column} (what happened)")

# Row-level masks for each core feature
print(f"\nüìâ MISSING DATA ANALYSIS:")
row_total = len(df)

# A district is missing only when BOTH identifiers are null
district_missing = df['AREA'].isnull() & df['AREA NAME'].isnull()
district_gap = district_missing.sum()
print(f"   District info missing: {district_gap:>8,} records ({district_gap/row_total*100:>5.1f}%)")

time_missing = df['DATE OCC'].isnull()
time_gap = time_missing.sum()
print(f"   Occurrence date missing: {time_gap:>6,} records ({time_gap/row_total*100:>5.1f}%)")

crime_type_missing = df['Crm Cd Desc'].isnull()
crime_gap = crime_type_missing.sum()
print(f"   Crime type missing: {crime_gap:>9,} records ({crime_gap/row_total*100:>5.1f}%)")

# A record is dropped later only if at least one core feature is absent
core_missing = district_missing | time_missing | crime_type_missing
core_gap = core_missing.sum()
print(f"   ANY core feature missing: {core_gap:>5,} records ({core_gap/row_total*100:>5.1f}%)")

# Optional features analysis (records are NOT removed for these)
print(f"\nüìç OPTIONAL FEATURES ANALYSIS:")
lat_values = df['LAT']
lon_values = df['LON']
coord_missing = lat_values.isnull() | lon_values.isnull()
# Invalid = zero placeholder, or outside the 33..35 / -119..-117 bounding box
coord_invalid = (
    lat_values.eq(0) | lon_values.eq(0) |
    lat_values.lt(33) | lat_values.gt(35) |
    lon_values.gt(-117) | lon_values.lt(-119)
)
time_occ_missing = df['TIME OCC'].isnull()

n_rows = len(df)
n_coord_missing = coord_missing.sum()
n_coord_invalid = coord_invalid.sum()
n_time_missing = time_occ_missing.sum()
print(f"   Coordinates missing: {n_coord_missing:>10,} records ({n_coord_missing/n_rows*100:>5.1f}%)")
print(f"   Coordinates invalid: {n_coord_invalid:>10,} records ({n_coord_invalid/n_rows*100:>5.1f}%)")
print(f"   Exact time missing: {n_time_missing:>11,} records ({n_time_missing/n_rows*100:>5.1f}%)")

# Compare dropping every record with bad coordinates ("old") against dropping
# only records that lack a core feature ("new")
old_strategy_removed = coord_missing | coord_invalid
n_old_removed = old_strategy_removed.sum()
n_core_removed = core_missing.sum()
old_kept = len(df) - n_old_removed
new_kept = len(df) - n_core_removed
records_gained = new_kept - old_kept

print(f"\nüìä STRATEGY COMPARISON:")
print(f"   Old Strategy (Remove missing coordinates):")
print(f"     Records kept: {old_kept:>12,} ({old_kept/n_rows*100:>5.1f}%)")
print(f"     Records lost: {n_old_removed:>12,} ({n_old_removed/n_rows*100:>5.1f}%)")
print(f"\n   New Strategy (Keep core features):")
print(f"     Records kept: {new_kept:>12,} ({new_kept/n_rows*100:>5.1f}%)")
print(f"     Records lost: {n_core_removed:>12,} ({n_core_removed/n_rows*100:>5.1f}%)")
print(f"\n   üìà IMPROVEMENT: {records_gained:>12,} additional records ({records_gained/n_rows*100:>5.1f}% more data)")

# Apply the smart cleaning strategy
print(f"\nüßπ APPLYING SMART CLEANING:")
print(f"   ‚úÖ Keeping records with missing coordinates but complete core features")
print(f"   ‚úÖ Keeping records with missing exact time but have date")
print(f"   ‚ùå Removing ONLY records missing essential crime information")

# Remove only records missing core features
df_clean = df[~core_missing].copy()
print(f"   Removed: {core_missing.sum():,} records missing core features")
print(f"   Retained: {len(df_clean):,} records ({len(df_clean)/len(df)*100:.1f}%)")

# Create missing data indicators as features (valuable information!)
print(f"\nüîß CREATING MISSING DATA INDICATORS:")

# Coordinate availability
df_clean['has_coordinates'] = df_clean['LAT'].notnull() & df_clean['LON'].notnull()

# Validity is derived from the same `coord_invalid` mask used in the analysis
# above instead of re-typing the bounding box. The two hand-written versions
# disagreed on the boundary values (a latitude of exactly 33 or 35 was counted
# as neither invalid nor valid); a single definition keeps the reported
# "invalid" count and this flag consistent.
df_clean['has_valid_coordinates'] = (
    df_clean['has_coordinates'] & ~coord_invalid.loc[df_clean.index]
)

# Time availability
df_clean['has_exact_time'] = df_clean['TIME OCC'].notnull()

# Data completeness score: fraction of the three optional signals
# (valid coordinates, exact time, victim age) that are present for a record
df_clean['data_completeness_score'] = (
    df_clean['has_valid_coordinates'].astype(int) +
    df_clean['has_exact_time'].astype(int) +
    df_clean['Vict Age'].notnull().astype(int)
) / 3

print(f"   ‚úÖ has_coordinates: {df_clean['has_coordinates'].sum():,} records")
print(f"   ‚úÖ has_valid_coordinates: {df_clean['has_valid_coordinates'].sum():,} records")
print(f"   ‚úÖ has_exact_time: {df_clean['has_exact_time'].sum():,} records")

# Remove exact duplicate rows (all columns equal)
initial_clean_count = len(df_clean)
df_clean = df_clean.drop_duplicates()
duplicates_removed = initial_clean_count - len(df_clean)
print(f"\nüîÑ DUPLICATE REMOVAL:")
print(f"   Removed: {duplicates_removed:,} duplicate records")
print(f"   Final dataset: {len(df_clean):,} records")

# Records that the coordinate-based filter would have dropped but that still
# carry every core feature
records_saved = ~core_missing & (coord_missing | coord_invalid)
saved_total = records_saved.sum()
if saved_total > 0:
    print(f"\nüíé VALUABLE RECORDS SAVED: {saved_total:,}")
    for summary_line in (
        "   These records have:",
        "   ‚úÖ District information ‚Üí Can do district-level analysis",
        "   ‚úÖ Crime occurrence date ‚Üí Can do temporal analysis",
        "   ‚úÖ Crime type ‚Üí Can do crime category analysis",
        "   ‚ùå Missing/invalid coordinates ‚Üí Will impute with district centroids",
    ):
        print(summary_line)

    # Print the first three such records as concrete examples
    saved_examples = df[records_saved].head(3)
    print(f"\n   üìù Examples of saved records:")
    for _, row in saved_examples.iterrows():
        crime_text = str(row.get('Crm Cd Desc', 'Unknown'))[:40]
        district_text = row.get('AREA NAME', 'Unknown')
        date_text = row.get('DATE OCC', 'Unknown')
        lat_text = row.get('LAT', 'Missing')
        lon_text = row.get('LON', 'Missing')
        print(f"      Crime: {crime_text}")
        print(f"      District: {district_text}")
        print(f"      Date: {date_text}")
        print(f"      Coords: LAT={lat_text}, LON={lon_text}")
        print(f"      ---")

# Update df to cleaned version.
# The pre-cleaning row count is captured BEFORE rebinding `df`: once `df` points
# at the cleaned frame, len(df) == len(df_clean) and the retention figure would
# always print 100.0% regardless of how many rows were dropped.
original_record_count = len(df)
df = df_clean.copy()

print(f"\n‚úÖ SMART CLEANING COMPLETE!")
print(f"üìä FINAL DATASET: {len(df):,} records")
print(f"üìà DATA RETENTION: {len(df)/original_record_count*100:.1f}% of original data")
print(f"üéØ STRATEGY: Focus on core features, handle missing data intelligently")
print(f"üí° NEXT: Will impute missing coordinates using district averages")

# Summary statistics for the cleaned frame
valid_coord_flags = df['has_valid_coordinates']
exact_time_flags = df['has_exact_time']
completeness = df['data_completeness_score']
fully_complete = completeness == 1  # all three optional signals present

print(f"\nüìã CLEANED DATASET SUMMARY:")
print(f"   Records with valid coordinates: {valid_coord_flags.sum():,} ({valid_coord_flags.mean()*100:.1f}%)")
print(f"   Records with exact time: {exact_time_flags.sum():,} ({exact_time_flags.mean()*100:.1f}%)")
print(f"   Records with complete data: {fully_complete.sum():,} ({fully_complete.mean()*100:.1f}%)")
print(f"   Average data completeness: {completeness.mean():.3f}")

üìä SMART DATA CLEANING STRATEGY
üóÇÔ∏è  ORIGINAL DATASET: 1,004,991 total records

üéØ CORE FEATURES REQUIRED:
   District: Need at least one of ['AREA', 'AREA NAME']
   Time: DATE OCC (when crime occurred)
   Crime Type: Crm Cd Desc (what happened)

üìâ MISSING DATA ANALYSIS:
   District info missing:        0 records (  0.0%)
   Occurrence date missing:      0 records (  0.0%)
   Crime type missing:         0 records (  0.0%)
   ANY core feature missing:     0 records (  0.0%)

üìç OPTIONAL FEATURES ANALYSIS:
   Coordinates missing:          0 records (  0.0%)
   Coordinates invalid:      2,240 records (  0.2%)
   Exact time missing:           0 records (  0.0%)

üìä STRATEGY COMPARISON:
   Old Strategy (Remove missing coordinates):
     Records kept:    1,002,751 ( 99.8%)
     Records lost:        2,240 (  0.2%)

   New Strategy (Keep core features):
     Records kept:    1,004,991 (100.0%)
     Records lost:            0 (  0.0%)

   üìà IMPROVEMENT:        2,240 additional

## 3. Feature Engineering

Create time-based and categorical features that will be available before a crime occurs (no data leakage).

In [19]:
# Parse date and time information.
# The export writes timestamps like "10/23/2020 12:00:00 AM" (see the sample
# rows above). Passing the format explicitly avoids per-element format guessing
# on ~1M rows and removes any month/day ambiguity; values that do not match
# still become NaT because of errors='coerce'.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
for date_column in ('Date Rptd', 'DATE OCC'):
    df[date_column] = pd.to_datetime(df[date_column], format=DATE_FORMAT, errors='coerce')

# Extract calendar features from the occurrence date
occurrence = df['DATE OCC'].dt
df['year'] = occurrence.year
df['month'] = occurrence.month
df['day_of_week'] = occurrence.dayofweek  # Monday=0 ... Sunday=6

# CORRECTED: Extract hour from TIME OCC (format is HHMM, e.g., 1230 = 12:30)
# Non-numeric values become NaN here instead of raising
df['TIME_OCC_numeric'] = pd.to_numeric(df['TIME OCC'], errors='coerce')

# Validate time ranges (0000 to 2359 with minutes < 60); NaN compares False,
# so missing times are automatically treated as invalid
valid_time_mask = (
    df['TIME_OCC_numeric'].between(0, 2359) &
    ((df['TIME_OCC_numeric'] % 100) < 60)  # Minutes must be < 60
)

# Hour = HHMM // 100, kept only where the time is valid.
# The previous version called .astype(int) on the whole column before masking,
# which raises as soon as a single TIME OCC value is missing/non-numeric and so
# defeated the errors='coerce' handling above. Series.where masks first and
# leaves NaN for invalid rows; the float cast keeps the dtype stable whether or
# not any row is invalid.
df['hour'] = (df['TIME_OCC_numeric'] // 100).astype(float).where(valid_time_mask)

# Create missing time indicator (useful feature - knowing time is missing is information)
df['time_unknown'] = df['hour'].isnull().astype(int)

# Create additional time features
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# Night = 18:00 onwards or up to and including the 06:00 hour. Comparisons
# against NaN are False, so rows with an unknown hour get is_night = 0 directly
# (conservative: assume not night) and no separate fillna step is needed.
df['is_night'] = ((df['hour'] >= 18) | (df['hour'] <= 6)).astype(int)

# Report how many TIME OCC values passed validation
row_count = len(df)
valid_time_count = valid_time_mask.sum()
invalid_time_count = (~valid_time_mask).sum()

print("Time features created successfully (CORRECTED)")
print(f"Date range: {df['DATE OCC'].min()} to {df['DATE OCC'].max()}")
print(f"Valid times: {valid_time_count:,} ({valid_time_count/row_count*100:.1f}%)")
print(f"Missing/invalid times: {invalid_time_count:,} ({invalid_time_count/row_count*100:.1f}%)")

# Show hour distribution to verify it makes sense
hour_dist = df['hour'].value_counts().sort_index()
busiest_hours = hour_dist.nlargest(10)
print(f"\nTop 10 crime hours:")
for rank, (hour_value, crime_total) in enumerate(zip(busiest_hours.index, busiest_hours.to_numpy()), start=1):
    print(f"  {rank:2d}. {int(hour_value):2d}:00 - {crime_total:>6,} crimes")

Time features created successfully (CORRECTED)
Date range: 2020-01-01 00:00:00 to 2025-05-29 00:00:00
Valid times: 1,004,991 (100.0%)
Missing/invalid times: 0 (0.0%)

Top 10 crime hours:
   1. 12:00 - 67,813 crimes
   2. 18:00 - 59,958 crimes
   3. 17:00 - 58,811 crimes
   4. 20:00 - 56,350 crimes
   5. 19:00 - 55,597 crimes
   6. 16:00 - 52,976 crimes
   7. 15:00 - 52,824 crimes
   8. 21:00 - 50,793 crimes
   9. 14:00 - 49,301 crimes
  10. 22:00 - 49,103 crimes


In [15]:
# Label-encode the categorical columns; the fitted encoder for each column is
# kept in `label_encoders` so the integer codes can be mapped back to labels
categorical_features = ['Vict Sex', 'Vict Descent', 'AREA NAME']
label_encoders = {}

for feature in (name for name in categorical_features if name in df.columns):
    # Missing categories become an explicit 'Unknown' level
    df[feature] = df[feature].fillna('Unknown')

    le = LabelEncoder().fit(df[feature])
    df[f'{feature}_encoded'] = le.transform(df[feature])
    label_encoders[feature] = le

    print(f"Encoded {feature}: {len(le.classes_)} unique values")

# Victim age: coerce to numeric, then fill gaps with the column median
vict_age_numeric = pd.to_numeric(df['Vict Age'], errors='coerce')
df['Vict Age'] = vict_age_numeric.fillna(vict_age_numeric.median())

print("\nCategorical encoding completed")

Encoded Vict Sex: 6 unique values
Encoded Vict Descent: 21 unique values
Encoded AREA NAME: 21 unique values

Categorical encoding completed


## 4. Target Variable Creation

Create a multi-class target variable for detailed crime type prediction based on the top 20 most common crime types.

In [None]:
# Analyze crime type distribution for multi-class classification
print("=== CRIME TYPE ANALYSIS FOR CLASSIFICATION ===")

if 'Crm Cd Desc' in df.columns:
    # Get crime type counts (value_counts is sorted most-frequent first)
    crime_counts = df['Crm Cd Desc'].value_counts()
    print(f"Total unique crime types: {len(crime_counts)}")
    
    # Show top 20 crime types
    print(f"\nTop 20 Crime Types:")
    for i, (crime, count) in enumerate(crime_counts.head(20).items(), 1):
        print(f"{i:2d}. {crime:<50} {count:>8,} ({count/len(df)*100:5.2f}%)")
    
    # Create multi-class target variable
    top_20_crimes = crime_counts.head(20).index.tolist()
    
    # Keep the description when it is one of the 20 most common, otherwise
    # collapse it to 'OTHER'. Done with a vectorized isin/where instead of a
    # per-row Python lambda with list membership, which is needlessly slow on
    # ~1M rows. Missing descriptions also fall into 'OTHER', as before.
    df['crime_category'] = df['Crm Cd Desc'].where(
        df['Crm Cd Desc'].isin(top_20_crimes), 'OTHER'
    )
    
    # Display new target distribution
    category_dist = df['crime_category'].value_counts()
    print(f"\n=== NEW TARGET VARIABLE DISTRIBUTION ===")
    print(f"Total Categories: {len(category_dist)} (Top 20 + OTHER)")
    print(f"\nCategory Distribution:")
    for i, (category, count) in enumerate(category_dist.items(), 1):
        print(f"{i:2d}. {category:<50} {count:>8,} ({count/len(df)*100:5.2f}%)")
    
    # Visualize the distribution in a 2x2 grid
    plt.figure(figsize=(20, 12))
    
    # Panel 1: the named categories (everything except 'OTHER'), most common on top
    plt.subplot(2, 2, 1)
    top_20_dist = category_dist[category_dist.index != 'OTHER'].head(20)
    plt.barh(range(len(top_20_dist)), top_20_dist.values)
    plt.yticks(range(len(top_20_dist)), [crime[:35] + '...' if len(crime) > 35 else crime for crime in top_20_dist.index])
    plt.xlabel('Count')
    plt.title('Top 20 Crime Categories (Detailed)')
    plt.gca().invert_yaxis()
    
    # Panel 2: share of records covered by the named categories vs 'OTHER'
    plt.subplot(2, 2, 2)
    other_count = category_dist.get('OTHER', 0)
    top_20_total = category_dist.drop('OTHER', errors='ignore').sum()
    
    plt.pie([top_20_total, other_count], 
            labels=[f'Top 20 Categories\n({top_20_total:,})', f'OTHER\n({other_count:,})'], 
            autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightgray'])
    plt.title('Crime Distribution: Top 20 vs Others')
    
    # Panel 3: ten largest target classes (this ranking can include 'OTHER')
    plt.subplot(2, 2, 3)
    top_10_dist = category_dist.head(10)
    plt.bar(range(len(top_10_dist)), top_10_dist.values, color='steelblue', alpha=0.8)
    plt.xticks(range(len(top_10_dist)), [crime[:15] + '...' if len(crime) > 15 else crime for crime in top_10_dist.index], rotation=45, ha='right')
    plt.ylabel('Count')
    plt.title('Top 10 Crime Categories')
    plt.grid(True, alpha=0.3)
    
    # Panel 4: text summary of class balance (largest vs smallest class)
    plt.subplot(2, 2, 4)
    plt.axis('off')
    
    classification_summary = f"""
MULTI-CLASS CLASSIFICATION SUMMARY

Total Records: {len(df):,}
Number of Classes: {len(category_dist)}

Class Distribution:
  Most Common: {category_dist.index[0]}
  Count: {category_dist.iloc[0]:,} ({category_dist.iloc[0]/len(df)*100:.1f}%)
  
  Least Common: {category_dist.index[-1]}
  Count: {category_dist.iloc[-1]:,} ({category_dist.iloc[-1]/len(df)*100:.1f}%)
  
Class Imbalance Ratio: {category_dist.iloc[0] / category_dist.iloc[-1]:.1f}:1

Benefits of Multi-Class Approach:
‚Ä¢ Specific crime type predictions
‚Ä¢ Targeted prevention strategies  
‚Ä¢ Resource allocation per crime type
‚Ä¢ Actionable insights for police
‚Ä¢ Better understanding of crime patterns
"""
    
    plt.text(0.1, 0.9, classification_summary, transform=plt.gca().transAxes, 
             fontsize=11, verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen", alpha=0.8))
    
    plt.tight_layout()
    plt.show()
    
    print(f"\nAdvantages of Multi-Class Crime Classification:")
    print(f"‚úì More actionable insights for specific crime prevention")
    print(f"‚úì Better resource allocation per crime type")
    print(f"‚úì Targeted patrol strategies for different crime categories")
    print(f"‚úì More nuanced understanding of crime patterns")
    print(f"‚úì Crime-specific temporal and geographic analysis")

else:
    # Fallback when no description column exists: 1 if 'Part 1-2' == 1, else 0
    print("Crime description column not found. Using binary classification.")
    df['crime_category'] = (df['Part 1-2'] == 1).astype(int)

## 5. Feature Selection

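Select only features that would be available BEFORE a crime occurs to avoid data leakage. Note that the victim attributes in the list below (`Vict Age`, `Vict Sex`, `Vict Descent`) are recorded only once an incident has been reported, so they do not strictly meet this criterion; treat results that rely on them as optimistic until the feature set is revisited.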

In [None]:
# Select predictive features (intended: available before crime occurs)
# NOTE(review): 'Vict Age', 'Vict Sex_encoded' and 'Vict Descent_encoded'
# describe the victim of the incident being classified; confirm these are
# really known at prediction time, otherwise they leak target information.
predictive_features = [
    'hour',                    # Time of day
    'day_of_week',            # Day of week
    'month',                  # Month
    'year',                   # Year
    'is_weekend',             # Weekend indicator
    'is_night',               # Night time indicator
    'LAT',                    # Latitude
    'LON',                    # Longitude
    'AREA',                   # Area code
    'Rpt Dist No',           # Reporting district
    'Vict Age',              # Victim age
    'Vict Sex_encoded',      # Victim sex
    'Vict Descent_encoded',  # Victim ethnicity
    'AREA NAME_encoded'      # Area name
]

# Keep only the features that exist in the frame and report the rest
available_features = [feature for feature in predictive_features if feature in df.columns]
for feature in predictive_features:
    if feature not in df.columns:
        print(f"Feature '{feature}' not found in dataset")

print(f"\nUsing {len(available_features)} predictive features:")
for i, feature in enumerate(available_features, 1):
    print(f"{i:2d}. {feature}")

# Create final dataset with multi-class target
X = df[available_features].copy()
y = df['crime_category'].copy()

# The cleaning step keeps records whose coordinates are placeholders (0, 0) or
# outside the LA bounding box and announces that they will be imputed from
# district averages, but nothing did so: the bogus values went straight into
# the model. Blank them out and fill from the mean of the valid coordinates in
# the same AREA. (Like the median fill below, the statistic is computed over
# the whole dataset, not the training split only.)
coordinate_columns = [col for col in ('LAT', 'LON') if col in X.columns]
if coordinate_columns and 'has_valid_coordinates' in df.columns:
    invalid_coordinates = ~df['has_valid_coordinates'].astype(bool)
    X.loc[invalid_coordinates, coordinate_columns] = np.nan
    if 'AREA' in df.columns:
        district_means = X[coordinate_columns].groupby(df['AREA']).transform('mean')
        X[coordinate_columns] = X[coordinate_columns].fillna(district_means)

# Handle any remaining missing values with the per-column median
X = X.fillna(X.median())

# Encode the multi-class target variable as integers 0..K-1
# (LabelEncoder is already imported in the setup cell)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nFinal dataset: {X.shape[0]:,} samples, {X.shape[1]} features")
print(f"Target classes: {len(label_encoder.classes_)} crime categories")
print(f"Class distribution:")
# One pass over the labels instead of one full comparison per class
class_counts = np.bincount(y_encoded, minlength=len(label_encoder.classes_))
for i, (class_name, count) in enumerate(zip(label_encoder.classes_, class_counts)):
    print(f"  {i:2d}. {class_name:<40} {count:>8,} ({count/len(y_encoded)*100:5.2f}%)")

## 6. Data Splitting

Split the data into training, validation, and test sets using temporal ordering for realistic evaluation.

In [None]:
# Create temporal splits: oldest records train, newest records test.
df_combined = X.copy()
df_combined['target'] = y_encoded  # Use encoded target for multi-class

# Sorting on 'year' alone leaves the order inside each year arbitrary (and the
# default quicksort is not stable), so the split boundary that falls inside a
# year mixed later months into the earlier split. Sort on year then month when
# available, with a stable algorithm so ties keep their original order and the
# split is reproducible.
sort_keys = ['year'] + (['month'] if 'month' in df_combined.columns else [])
df_combined = df_combined.sort_values(sort_keys, kind='mergesort')

# Split ratios: 60% train, 20% validation, remaining ~20% test
n_total = len(df_combined)
n_train = int(0.6 * n_total)
n_val = int(0.2 * n_total)

# Create splits as contiguous slices of the time-ordered frame
train_rows = df_combined.iloc[:n_train]
val_rows = df_combined.iloc[n_train:n_train + n_val]
test_rows = df_combined.iloc[n_train + n_val:]

X_train, y_train = train_rows[available_features], train_rows['target']
X_val, y_val = val_rows[available_features], val_rows['target']
X_test, y_test = test_rows[available_features], test_rows['target']

print(f"Training set: {len(X_train):,} samples")
print(f"Validation set: {len(X_val):,} samples")
print(f"Test set: {len(X_test):,} samples")

# Check class distribution in each set
print(f"\nClass distribution across splits:")
print(f"Number of classes: {len(label_encoder.classes_)}")

# Show the five MOST FREQUENT classes per split. value_counts() is already
# ordered by count; the earlier sort_index() made this print the first five
# class ids instead, which did not match the "Top 5" heading.
for split_name, y_split in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
    top_classes = pd.Series(y_split).value_counts().head(5)
    print(f"\n{split_name} set - Top 5 classes:")
    for class_idx, count in top_classes.items():
        class_name = label_encoder.classes_[class_idx]
        print(f"  {class_name:<40} {count:>6,} ({count/len(y_split)*100:4.1f}%)")

## 7. Multi-Class Model Training

Train an XGBoost classifier with multi-class objective for detailed crime type prediction.

In [20]:
# Configure XGBoost model for multi-class classification
num_classes = len(label_encoder.classes_)
print(f"Configuring XGBoost for {num_classes} crime categories...")

# Hyperparameters collected in one place so they are easy to read and tweak
xgb_params = dict(
    objective='multi:softprob',   # per-class probabilities
    num_class=num_classes,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=150,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',       # multi-class log loss
)
model = XGBClassifier(**xgb_params)

print("Training XGBoost multi-class model...")
# Both the training and validation splits are passed as evaluation sets
evaluation_sets = [(X_train, y_train), (X_val, y_val)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=False)
print("Multi-class model training completed")

# Display the integer-code -> crime-category mapping for reference
print(f"\nClass Mapping:")
for class_code, class_label in enumerate(label_encoder.classes_):
    print(f"  {class_code:2d}: {class_label}")

NameError: name 'label_encoder' is not defined

## 8. Model Evaluation

Evaluate the model performance on validation and test sets.

In [None]:
# Make multi-class predictions (hard labels) for every split
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Get prediction probabilities for detailed analysis
y_train_proba = model.predict_proba(X_train)
y_val_proba = model.predict_proba(X_val)
y_test_proba = model.predict_proba(X_test)

print("Multi-class predictions completed")
print(f"Prediction shape: {y_test_pred.shape}")
print(f"Probability shape: {y_test_proba.shape}")

# Show sample predictions with confidence.
# A seeded generator makes the printed sample identical on every run, and the
# sample size is capped so a test set with fewer than 5 rows does not raise.
print(f"\nSample predictions with confidence:")
sample_rng = np.random.default_rng(42)
sample_size = min(5, len(y_test))
sample_indices = sample_rng.choice(len(y_test), size=sample_size, replace=False)
for idx in sample_indices:
    true_class = label_encoder.classes_[y_test.iloc[idx]]
    pred_class = label_encoder.classes_[y_test_pred[idx]]
    confidence = y_test_proba[idx].max()  # probability of the predicted class
    print(f"  True: {true_class:<35} | Pred: {pred_class:<35} | Conf: {confidence:.3f}")

In [None]:
# Evaluate multi-class model performance
def evaluate_multiclass_model(y_true, y_pred, y_proba, dataset_name):
    """Print and return headline multi-class metrics for one data split.

    Parameters
    ----------
    y_true : array-like of int, shape (n_samples,)
        True class labels. They are compared directly against column indices
        of ``y_proba``, so label ``k`` must correspond to probability column ``k``.
    y_pred : array-like of int, shape (n_samples,)
        Predicted class labels.
    y_proba : array-like of float, shape (n_samples, n_classes)
        Predicted class probabilities.
    dataset_name : str
        Name used in the printed header (e.g. "Training").

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` (the last three
        weighted by class support) and ``top3_accuracy``.

    Raises
    ------
    ValueError
        If no samples are supplied.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    proba = np.asarray(y_proba)
    if true_labels.size == 0:
        raise ValueError(f"Cannot evaluate '{dataset_name}': no samples provided")

    accuracy = accuracy_score(true_labels, pred_labels)
    # zero_division=0 gives the same value as the default for classes that are
    # never predicted, without emitting a warning per call
    precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)

    print(f"\n{dataset_name} Performance:")
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Weighted Precision: {precision:.3f}")
    print(f"  Weighted Recall: {recall:.3f}")
    print(f"  Weighted F1-Score: {f1:.3f}")

    # Top-3 accuracy: the true class is among the three highest-probability
    # columns. Computed for all rows at once instead of a Python loop per sample.
    top3_pred = np.argsort(proba, axis=1)[:, -3:]
    top3_hits = (top3_pred == true_labels[:, None]).any(axis=1)
    top3_accuracy = float(top3_hits.mean())

    print(f"  Top-3 Accuracy: {top3_accuracy:.3f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'top3_accuracy': top3_accuracy
    }

# Evaluate on all datasets
train_metrics = evaluate_multiclass_model(y_train, y_train_pred, y_train_proba, "Training")
val_metrics = evaluate_multiclass_model(y_val, y_val_pred, y_val_proba, "Validation")
test_metrics = evaluate_multiclass_model(y_test, y_test_pred, y_test_proba, "Test")

# Detailed classification report for test set
print(f"\n" + "="*80)
print(f"DETAILED CLASSIFICATION REPORT (Test Set)")
print(f"="*80)

# Truncate long category names so the report columns stay aligned
# (str() keeps this working when the classes are not strings)
class_names = [str(name)[:30] + '...' if len(str(name)) > 30 else str(name)
               for name in label_encoder.classes_]

# Pass the full label list explicitly. Without it the report only covers the
# labels that occur in y_test / y_test_pred, and if any category is absent from
# the test period the length of `target_names` no longer matches and
# classification_report raises.
all_class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(y_test, y_test_pred,
                             labels=all_class_ids,
                             target_names=class_names,
                             digits=3,
                             zero_division=0)
print(report)

## 9. Model Visualization

Create visualizations to understand model performance and feature importance.

In [None]:
# Multi-class model visualization: 2x3 grid of diagnostic panels
plt.figure(figsize=(20, 12))

# 1. Feature Importance (least important at the bottom of the sorted frame)
plt.subplot(2, 3, 1)
importance_df = (
    pd.DataFrame({'feature': available_features,
                  'importance': model.feature_importances_})
    .sort_values('importance', ascending=True)
)
bar_positions = range(len(importance_df))
plt.barh(bar_positions, importance_df['importance'])
plt.yticks(bar_positions, importance_df['feature'])
plt.xlabel('Feature Importance')
plt.title('Feature Importance (Multi-Class)')
plt.grid(True, alpha=0.3)

# 2. Model Performance Comparison: one bar group per metric, one bar per split
plt.subplot(2, 3, 2)
metric_keys = ['accuracy', 'f1', 'top3_accuracy']
metrics_comparison = pd.DataFrame(
    {
        'Train': [train_metrics[key] for key in metric_keys],
        'Validation': [val_metrics[key] for key in metric_keys],
        'Test': [test_metrics[key] for key in metric_keys],
    },
    index=['Accuracy', 'F1-Score', 'Top-3 Accuracy'],
)
metrics_comparison.plot(kind='bar', ax=plt.gca())
plt.title('Model Performance Across Splits')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)

# 3. Class Distribution in Predictions
plt.subplot(2, 3, 3)
pred_dist = pd.Series(y_test_pred).value_counts().sort_index()
true_dist = pd.Series(y_test).value_counts().sort_index()

# Show the 10 MOST FREQUENT true classes for clarity. `true_dist` is ordered by
# class id, so head(10) would just pick class ids 0-9 regardless of how common
# they are; nlargest selects by count.
top_10_classes = true_dist.nlargest(10).index
pred_counts = [pred_dist.get(i, 0) for i in top_10_classes]  # 0 if never predicted
true_counts = [true_dist.get(i, 0) for i in top_10_classes]

x = np.arange(len(top_10_classes))
width = 0.35

# Side-by-side bars: true count on the left, predicted count on the right
plt.bar(x - width/2, true_counts, width, label='True', alpha=0.8)
plt.bar(x + width/2, pred_counts, width, label='Predicted', alpha=0.8)
plt.xlabel('Crime Class (Top 10)')
plt.ylabel('Count')
plt.title('True vs Predicted Class Distribution')
plt.xticks(x, [f'C{i}' for i in top_10_classes], rotation=45)
plt.legend()

# 4. Top-K Accuracy Analysis
plt.subplot(2, 3, 4)
k_values = range(1, min(11, len(label_encoder.classes_)))

# Rank the classes of every test row once (ascending probability), then for
# each k check whether the true class is among the last k columns. This gives
# the same numbers as looping over every sample for every k, without running
# k * n_samples Python-level iterations.
y_test_labels = np.asarray(y_test)
ranked_classes = np.argsort(y_test_proba, axis=1)
top_k_accuracies = [
    float((ranked_classes[:, -k:] == y_test_labels[:, None]).any(axis=1).mean())
    for k in k_values
]

plt.plot(k_values, top_k_accuracies, marker='o', linewidth=2)
plt.xlabel('K (Top-K Predictions)')
plt.ylabel('Accuracy')
plt.title('Top-K Accuracy Analysis')
plt.grid(True, alpha=0.3)

# 5. Per-Class Performance Heatmap
plt.subplot(2, 3, 5)
from sklearn.metrics import precision_recall_fscore_support

# Request metrics for every encoded class explicitly. Without `labels`, the
# returned arrays only cover labels present in y_test / y_test_pred, so position
# i is not guaranteed to be class id i and the names looked up from
# label_encoder.classes_ below could be attached to the wrong columns.
all_class_ids = np.arange(len(label_encoder.classes_))
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    y_test, y_test_pred, labels=all_class_ids, average=None, zero_division=0
)

# Show the 15 classes with most support (ascending, largest last)
top_15_indices = np.argsort(support)[-15:]
metrics_matrix = np.array([
    precision_per_class[top_15_indices],
    recall_per_class[top_15_indices],
    f1_per_class[top_15_indices]
])

# Truncate long names so the tick labels stay readable
class_labels = [str(label_encoder.classes_[i])[:20] + '...' if len(str(label_encoder.classes_[i])) > 20
                else str(label_encoder.classes_[i]) for i in top_15_indices]

sns.heatmap(metrics_matrix,
            xticklabels=class_labels,
            yticklabels=['Precision', 'Recall', 'F1-Score'],
            annot=True, fmt='.2f', cmap='RdYlBu_r')
plt.title('Per-Class Performance (Top 15)')
plt.xticks(rotation=45, ha='right')

# 6. Confusion Matrix restricted to the classes in `top_10_classes`
plt.subplot(2, 3, 6)
from sklearn.metrics import confusion_matrix

# Keep only rows where BOTH the true and the predicted label are in the subset,
# so the matrix stays small enough to read
true_in_subset = np.isin(y_test, top_10_classes)
pred_in_subset = np.isin(y_test_pred, top_10_classes)
mask = true_in_subset & pred_in_subset
y_test_filtered = y_test[mask]
y_pred_filtered = y_test_pred[mask]

if len(y_test_filtered) > 0:
    tick_names = [f'C{i}' for i in top_10_classes]
    cm = confusion_matrix(y_test_filtered, y_pred_filtered, labels=top_10_classes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_names,
                yticklabels=tick_names)
    plt.title('Confusion Matrix (Top 10 Classes)')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

plt.tight_layout()
plt.show()

# Headline test-set numbers
n_categories = len(label_encoder.classes_)
print(f"\nMulti-Class Model Summary:")
print(f"‚úì Successfully trained model for {n_categories} crime categories")
print(f"‚úì Test Accuracy: {test_metrics['accuracy']:.3f}")
print(f"‚úì Test Top-3 Accuracy: {test_metrics['top3_accuracy']:.3f}")
print(f"‚úì Weighted F1-Score: {test_metrics['f1']:.3f}")

In [None]:
# Detailed Multi-Class Analysis and Crime Type Insights
plt.figure(figsize=(20, 15))

# 1. Crime Category Predictions Analysis
plt.subplot(3, 3, 1)
# Per-class accuracy = share of a class's test samples predicted as that class,
# shown for the 10 most frequent classes in the test set.
class_accuracies = []
class_names_short = []
# value_counts() is indexed by the encoded class label and sorted by frequency,
# so the class labels must be read from its index. (argsort over `.values`
# returns positions inside that sorted result -- always 0..9 -- not labels.)
# Reversed so the most frequent class is drawn last, i.e. at the top of the chart.
top_10_indices = pd.Series(y_test).value_counts().head(10).index[::-1]

for class_idx in top_10_indices:
    mask = (y_test == class_idx)
    if np.sum(mask) > 0:
        accuracy = np.mean(y_test_pred[mask] == class_idx)
        class_accuracies.append(accuracy)
        full_name = label_encoder.classes_[class_idx]
        # Truncate long crime descriptions so the tick labels stay readable.
        class_names_short.append(full_name[:25] + '...' if len(full_name) > 25 else full_name)

plt.barh(range(len(class_accuracies)), class_accuracies, color='steelblue', alpha=0.8)
plt.yticks(range(len(class_accuracies)), class_names_short)
plt.xlabel('Per-Class Accuracy')
plt.title('Accuracy by Crime Type (Top 10)')
plt.grid(True, alpha=0.3)

# 2. Prediction Confidence Distribution
# Confidence of a prediction = highest class probability in its row.
ax = plt.subplot(3, 3, 2)
max_probs = np.max(y_test_proba, axis=1)
mean_confidence = np.mean(max_probs)

ax.hist(max_probs, bins=30, alpha=0.7, color='lightcoral', edgecolor='black')
ax.axvline(mean_confidence, color='red', linestyle='--',
           label=f'Mean: {mean_confidence:.3f}')
ax.set_xlabel('Prediction Confidence')
ax.set_ylabel('Frequency')
ax.set_title('Model Confidence Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# 3. Most Confused Crime Pairs
plt.subplot(3, 3, 3)
from sklearn.metrics import confusion_matrix
# Pin the matrix to every encoded class so that row/column i is class label i.
# Without `labels=`, classes absent from both y_test and y_test_pred are dropped
# and the matrix positions no longer line up with label_encoder.classes_.
all_class_ids = np.arange(len(label_encoder.classes_))
cm_full = confusion_matrix(y_test, y_test_pred, labels=all_class_ids)

# Collect every non-zero off-diagonal cell as (true class, predicted class, count).
# np.nonzero walks the matrix row by row, the same order as a nested i/j loop.
confusion_pairs = [
    (int(i), int(j), int(cm_full[i, j]))
    for i, j in zip(*np.nonzero(cm_full))
    if i != j
]

# Sort by confusion count and show top pairs
confusion_pairs.sort(key=lambda x: x[2], reverse=True)
top_confusions = confusion_pairs[:8]

if top_confusions:
    pairs_labels = []
    confusion_counts = []
    for true_idx, pred_idx, count in top_confusions:
        true_name = label_encoder.classes_[true_idx]
        pred_name = label_encoder.classes_[pred_idx]
        # Truncate long descriptions to keep the two-line tick labels compact.
        true_name = true_name[:15] + '...' if len(true_name) > 15 else true_name
        pred_name = pred_name[:15] + '...' if len(pred_name) > 15 else pred_name
        # The arrow was mis-encoded ("‚Üí"); restored to the intended character.
        pairs_labels.append(f"{true_name}\n→{pred_name}")
        confusion_counts.append(count)

    plt.barh(range(len(confusion_counts)), confusion_counts, color='salmon', alpha=0.8)
    plt.yticks(range(len(confusion_counts)), pairs_labels)
    plt.xlabel('Confusion Count')
    plt.title('Most Confused Crime Pairs')
    # Largest confusion at the top.
    plt.gca().invert_yaxis()

# 4. Feature Importance for Multi-Class
# Plots the last 10 rows of importance_df.
# NOTE(review): this is the "top 10" only if importance_df is sorted ascending by importance -- confirm.
ax = plt.subplot(3, 3, 4)
top_features = importance_df.tail(10)
bar_positions = range(len(top_features))

ax.bar(bar_positions, top_features['importance'], color='darkgreen', alpha=0.8)
ax.set_xticks(bar_positions)
ax.set_xticklabels(top_features['feature'], rotation=45, ha='right')
ax.set_ylabel('Importance')
ax.set_title('Top 10 Most Important Features')
ax.grid(True, alpha=0.3)

# 5. Crime Type Complexity Analysis
plt.subplot(3, 3, 5)
from sklearn.metrics import f1_score

class_support = pd.Series(y_test).value_counts().sort_index()
# Compute per-class F1 here with an explicit label list so that entry i is the
# score of class label i. Reusing an array from another cell is only
# position-aligned and shifts whenever a class is missing from the test data.
all_class_ids = np.arange(len(label_encoder.classes_))
class_f1_scores = f1_score(y_test, y_test_pred, labels=all_class_ids,
                           average=None, zero_division=0)
valid_indices = class_support.index[class_support > 10]  # Only classes with >10 samples

if len(valid_indices) > 0:
    support_vals = [class_support[i] for i in valid_indices]
    f1_vals = [class_f1_scores[i] for i in valid_indices]

    plt.scatter(support_vals, f1_vals, alpha=0.6, s=50)
    plt.xlabel('Class Support (Test Set)')
    plt.ylabel('F1-Score')
    plt.title('F1-Score vs Class Support')
    plt.grid(True, alpha=0.3)

    # Add trend line (a straight-line fit needs at least two distinct x values)
    if len(set(support_vals)) > 1:
        z = np.polyfit(support_vals, f1_vals, 1)
        p = np.poly1d(z)
        plt.plot(sorted(support_vals), p(sorted(support_vals)), "r--", alpha=0.8)

# 6. Temporal Patterns in Predictions
# Overlays the hour-of-day density of the three most frequently predicted classes.
ax = plt.subplot(3, 3, 6)
if 'hour' in X_test.columns:
    top_3_classes = pd.Series(y_test_pred).value_counts().head(3).index

    for i, class_idx in enumerate(top_3_classes):
        mask = (y_test_pred == class_idx)
        if np.sum(mask) == 0:
            continue
        hours = X_test.loc[mask, 'hour'].values
        ax.hist(hours, bins=24, alpha=0.5, label=f"Class {class_idx}", density=True)

    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Density')
    ax.set_title('Predicted Crime Types by Hour')
    ax.legend()
    ax.grid(True, alpha=0.3)

# 7. Geographic Distribution of Predictions
plt.subplot(3, 3, 7)
if 'LAT' in X_test.columns and 'LON' in X_test.columns:
    # Plot a random subset of at most 2000 test rows to keep the scatter readable.
    # A seeded generator makes the figure identical on every Restart & Run All
    # (the seed matches the random_state=42 used elsewhere in this notebook).
    sample_size = min(2000, len(X_test))
    geo_rng = np.random.default_rng(42)
    sample_indices = geo_rng.choice(len(X_test), size=sample_size, replace=False)

    lat_sample = X_test.iloc[sample_indices]['LAT']
    lon_sample = X_test.iloc[sample_indices]['LON']
    pred_sample = y_test_pred[sample_indices]

    # Color by top 5 most common predicted classes within the sample
    top_5_classes = pd.Series(pred_sample).value_counts().head(5).index
    colors = plt.cm.Set1(np.linspace(0, 1, len(top_5_classes)))

    for i, class_idx in enumerate(top_5_classes):
        mask = (pred_sample == class_idx)
        if np.sum(mask) > 0:
            plt.scatter(lon_sample[mask], lat_sample[mask],
                        c=[colors[i]], alpha=0.6, s=10,
                        label=f"Class {class_idx}")

    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Geographic Distribution of Predictions')
    # Legend is anchored outside the axes so it does not cover the points.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 8. Model Calibration Analysis
# One-vs-rest calibration check for the most frequent test class:
# compares its predicted probability with the observed frequency of that class.
ax = plt.subplot(3, 3, 8)
from sklearn.calibration import calibration_curve

most_common_class = pd.Series(y_test).value_counts().index[0]
y_binary = (y_test == most_common_class).astype(int)
y_prob_binary = y_test_proba[:, most_common_class]

# A calibration curve needs both positives and negatives.
if np.unique(y_binary).size > 1:
    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_binary, y_prob_binary, n_bins=10)

    ax.plot(mean_predicted_value, fraction_of_positives, "s-",
            label="Model", color='blue')
    ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title('Calibration Curve (Most Common Class)')
    ax.legend()
    ax.grid(True, alpha=0.3)

# 9. Performance Summary
# Text-only panel: the axes are hidden and the summary is drawn as a boxed label.
ax = plt.subplot(3, 3, 9)
ax.axis('off')

performance_summary = f"""
MULTI-CLASS CRIME PREDICTION RESULTS

Total Classes: {len(label_encoder.classes_)}
Test Samples: {len(y_test):,}

Performance Metrics:
  Accuracy: {test_metrics['accuracy']:.3f}
  Top-3 Accuracy: {test_metrics['top3_accuracy']:.3f}
  Weighted F1: {test_metrics['f1']:.3f}
  Weighted Precision: {test_metrics['precision']:.3f}
  Weighted Recall: {test_metrics['recall']:.3f}

Best Performing Features:
  1. {importance_df['feature'].iloc[-1]}
  2. {importance_df['feature'].iloc[-2]}
  3. {importance_df['feature'].iloc[-3]}

Average Confidence: {np.mean(max_probs):.3f}
"""

summary_box = dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.8)
ax.text(0.1, 0.9, performance_summary, transform=ax.transAxes,
        fontsize=11, verticalalignment='top', fontfamily='monospace',
        bbox=summary_box)

plt.tight_layout()
plt.show()

n_crime_types = len(label_encoder.classes_)
print("\nDetailed Multi-Class Analysis Complete!")
print(f"Model successfully predicts {n_crime_types} different crime types")
print("This enables targeted crime prevention strategies for each specific crime category")

## 10. Feature Analysis

Analyze which features are most important for crime prediction.

In [None]:
# Feature importance ranking
# Prints every feature from most to least important with a short plain-language note.
interpretations = {
    'hour': 'Time of day patterns',
    'LAT': 'Geographic latitude hotspots',
    'LON': 'Geographic longitude hotspots',
    'day_of_week': 'Weekly crime patterns',
    'month': 'Seasonal variations',
    'AREA': 'Police area characteristics',
    'Rpt Dist No': 'District-level patterns',
    'Vict Age': 'Age demographic factors',
    'year': 'Long-term trends',
    'is_weekend': 'Weekend vs weekday',
    'is_night': 'Night vs day patterns',
    'Vict Sex_encoded': 'Gender patterns',
    'Vict Descent_encoded': 'Demographic patterns',
    'AREA NAME_encoded': 'Neighborhood factors'
}

feature_ranking = importance_df.sort_values('importance', ascending=False)

print("Feature Importance Ranking:")
print("Rank  Feature                   Importance  Interpretation")
print("-" * 65)

ranked_pairs = zip(feature_ranking['feature'], feature_ranking['importance'])
for i, (feature, importance) in enumerate(ranked_pairs, start=1):
    # Features without a dedicated note fall back to a generic description.
    interp = interpretations.get(feature, 'Crime factor')
    print(f"{i:2d}.   {feature:<22} {importance:8.4f}    {interp}")

## 11. Model Insights and Conclusions

Summary of findings and practical applications for law enforcement.

In [None]:
# Final summary, part 1: configuration, headline metrics, most-predicted classes, top features.
print("Multi-Class Crime Prediction Model - Final Summary")
print("=" * 60)

print("\nModel Configuration:")
print("  Model Type: XGBoost Multi-Class Classifier")
print(f"  Number of Crime Categories: {len(label_encoder.classes_)}")
print("  Training Objective: Multi-class softmax probability")
print(f"  Features Used: {len(available_features)}")

print("\nModel Performance:")
# (printed label, key in test_metrics, format spec)
performance_rows = [
    ("Test Accuracy", 'accuracy', '.1%'),
    ("Test Top-3 Accuracy", 'top3_accuracy', '.1%'),
    ("Weighted F1-Score", 'f1', '.3f'),
    ("Weighted Precision", 'precision', '.3f'),
    ("Weighted Recall", 'recall', '.3f'),
]
for metric_label, metric_key, metric_format in performance_rows:
    print(f"  {metric_label}: {test_metrics[metric_key]:{metric_format}}")

print("\nTop Crime Categories Predicted:")
pred_distribution = pd.Series(y_test_pred).value_counts().head(5)
n_predictions = len(y_test_pred)
for i, (class_idx, count) in enumerate(pred_distribution.items(), 1):
    class_name = label_encoder.classes_[class_idx]
    percentage = count / n_predictions * 100
    print(f"  {i}. {class_name:<45} {count:>5,} ({percentage:4.1f}%)")

print("\nMost Important Predictive Factors:")
# Last five rows of importance_df, printed in reverse row order.
top_5_features = importance_df.tail(5)
for i, (_, row) in enumerate(top_5_features.iloc[::-1].iterrows(), 1):
    print(f"  {i}. {row['feature']:<25} (importance: {row['importance']:.3f})")

# Final summary, part 2: static narrative text.
# The list markers were mis-encoded ("‚úì", "‚Ä¢", "‚ö†" are UTF-8 bytes decoded as
# Mac Roman); they are restored to the intended check mark, bullet and warning sign.
print("\nAdvantages of Multi-Class Approach:")
print("  ✓ Specific crime type predictions enable targeted interventions")
print("  ✓ Resource allocation can be optimized per crime category")
print("  ✓ Different patrol strategies for different crime types")
print("  ✓ Crime-specific temporal and geographic insights")
print("  ✓ Better understanding of crime patterns and relationships")

print("\nPractical Law Enforcement Applications:")
print("  • Burglary Prevention: Deploy resources during predicted high-risk times/areas")
print("  • Vehicle Theft: Focus patrols in parking areas during peak prediction periods")
print("  • Assault Cases: Increase presence in nightlife areas during high-risk hours")
print("  • Drug Offenses: Target known hotspots with specialized units")
print("  • Property Crimes: Coordinate with community watch programs")

print("\nModel Limitations & Considerations:")
print("  ⚠ Predictions are probabilistic estimates, not certainties")
print("  ⚠ Model performance varies by crime type frequency")
print("  ⚠ Requires regular retraining with new crime data")
print("  ⚠ Should complement, not replace, officer judgment")
print("  ⚠ Ethical considerations regarding bias and fairness")

print("\nNext Steps for Deployment:")
print("  1. Validate model with domain experts (law enforcement)")
print("  2. Implement real-time prediction pipeline")
print("  3. Create interactive dashboard for police departments")
print("  4. Set up monitoring and alert systems")
print("  5. Establish feedback loop for continuous improvement")
print("  6. Conduct field trials and validation studies")

# Create a summary of class performance
banner = "=" * 80
print("\n" + banner)
print("CRIME CATEGORY PERFORMANCE SUMMARY")
print(banner)

# Precision / recall / support for every class that occurs in the test set.
class_metrics = []
for i, class_name in enumerate(label_encoder.classes_):
    mask_true = (y_test == i)
    mask_pred = (y_test_pred == i)
    support = np.sum(mask_true)

    if not support > 0:
        continue  # class absent from the test set

    true_positives = np.sum(mask_true & mask_pred)
    predicted_count = np.sum(mask_pred)
    # A class that was never predicted gets precision 0 rather than a division by zero.
    precision = true_positives / predicted_count if predicted_count > 0 else 0
    recall = true_positives / support

    display_name = class_name[:40] + '...' if len(class_name) > 40 else class_name
    class_metrics.append({
        'class': display_name,
        'precision': precision,
        'recall': recall,
        'support': support
    })

# Most frequent classes first; only the first 15 are printed.
class_metrics.sort(key=lambda x: x['support'], reverse=True)
print(f"{'Crime Type':<45} {'Precision':<10} {'Recall':<10} {'Support':<10}")
print(f"{'-'*45} {'-'*10} {'-'*10} {'-'*10}")

for metric in class_metrics[:15]:
    print(f"{metric['class']:<45} {metric['precision']:<10.3f} {metric['recall']:<10.3f} {metric['support']:<10,}")

print(f"\nThis multi-class model provides actionable insights for {len(label_encoder.classes_)} specific crime types,")
print("enabling law enforcement to develop targeted prevention and response strategies.")

## 2.5. Comprehensive Exploratory Data Analysis (EDA)

This section provides in-depth analysis of crime patterns, distributions, and relationships in the Los Angeles crime data to guide our modeling approach.

In [None]:
# Dataset Overview and Summary Statistics
n_records, n_features = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("=== DATASET OVERVIEW ===")
print(f"Total Records: {n_records:,}")
print(f"Total Features: {n_features}")
print(f"Memory Usage: {memory_mb:.1f} MB")

# How many columns there are of each dtype
print("\nData Types:")
print(df.dtypes.value_counts())

# describe() over the numeric columns only
print("\nNumerical Features Summary:")
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(df[numerical_cols].describe())

# Counts for the two 'Part 1-2' values (1 and 2)
print("\nTarget Variable Analysis:")
if 'Part 1-2' in df.columns:
    part_dist = df['Part 1-2'].value_counts()
    print(f"Part 1 (Serious Crimes): {part_dist.get(1, 0):,}")
    print(f"Part 2 (Less Serious): {part_dist.get(2, 0):,}")

print("\nMost Common Crime Types:")
if 'Crm Cd Desc' in df.columns:
    print(df['Crm Cd Desc'].value_counts().head(10))

In [None]:
# Temporal Analysis - Crime Patterns Over Time
print("=== TEMPORAL ANALYSIS ===")

# Work on a copy so the raw frame `df` stays untouched for the other sections.
df_temp = df.copy()
df_temp['Date Rptd'] = pd.to_datetime(df_temp['Date Rptd'], errors='coerce')
df_temp['DATE OCC'] = pd.to_datetime(df_temp['DATE OCC'], errors='coerce')

# Extract time components (rows whose date failed to parse get NaN here)
df_temp['year'] = df_temp['DATE OCC'].dt.year
df_temp['month'] = df_temp['DATE OCC'].dt.month
df_temp['day_of_week'] = df_temp['DATE OCC'].dt.dayofweek

# TIME OCC is a 24h clock value stored as a number (e.g. 2250 -> 22:50), so
# early times lose their leading zeros: 845 is 08:45 and 30 is 00:30.
# Integer-dividing by 100 yields the hour for every width; slicing the first
# two characters of the string would turn 845 into hour 84 and 30 into hour 30.
time_occ = pd.to_numeric(df_temp['TIME OCC'], errors='coerce')
df_temp['hour'] = time_occ // 100

# Fill missing hours
df_temp['hour'] = df_temp['hour'].fillna(df_temp['hour'].median())

plt.figure(figsize=(18, 12))

# 1. Crimes by Hour
# Number of records per hour of day, drawn as a line.
ax = plt.subplot(3, 3, 1)
hourly_crimes = df_temp.groupby('hour').size()
hourly_crimes.plot(kind='line', marker='o', ax=ax)
ax.set_title('Crime Distribution by Hour of Day')
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Crimes')
ax.grid(True, alpha=0.3)

# 2. Crimes by Day of Week
plt.subplot(3, 3, 2)
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Reindex onto 0..6 (Monday..Sunday) so there is exactly one bar per name even
# if a weekday never occurs, and so the index is integer-typed even when
# unparsable dates made 'day_of_week' a float column.
daily_crimes = df_temp.groupby('day_of_week').size().reindex(range(7), fill_value=0)
plt.bar(range(7), daily_crimes.values)
plt.title('Crime Distribution by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Crimes')
plt.xticks(range(7), day_names)

# 3. Crimes by Month
# One bar per calendar month, counted over the whole dataset.
ax = plt.subplot(3, 3, 3)
monthly_crimes = df_temp.groupby('month').size()
monthly_crimes.plot(kind='bar', ax=ax)
ax.set_title('Crime Distribution by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=0)

# 4. Crimes by Year
# Only drawn when the 'year' column holds more than one distinct value.
ax = plt.subplot(3, 3, 4)
distinct_years = df_temp['year'].unique()
if len(distinct_years) > 1:
    yearly_crimes = df_temp.groupby('year').size()
    yearly_crimes.plot(kind='line', marker='o', ax=ax)
    ax.set_title('Crime Trends by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Crimes')
    plt.xticks(rotation=45)

# 5. Weekend vs Weekday
# dayofweek 5 and 6 are Saturday and Sunday; every other row counts as weekday.
ax = plt.subplot(3, 3, 5)
is_weekend = df_temp['day_of_week'].isin([5, 6])
weekend_crimes = int(is_weekend.sum())
weekday_crimes = int((~is_weekend).sum())
ax.pie([weekday_crimes, weekend_crimes], labels=['Weekday', 'Weekend'], autopct='%1.1f%%')
ax.set_title('Weekday vs Weekend Crimes')

# 6. Peak Hours Analysis
# The six hours with the highest record counts.
ax = plt.subplot(3, 3, 6)
peak_hours = hourly_crimes.nlargest(6)
ax.bar(peak_hours.index, peak_hours.values, color='red', alpha=0.7)
ax.set_title('Top 6 Crime Peak Hours')
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Crimes')

# 7. Hourly Crime Heatmap by Day
plt.subplot(3, 3, 7)
if not df_temp.empty:
    # Rows are pinned to the seven weekdays (0=Mon .. 6=Sun) so each row lines up
    # with its label even if a weekday is missing from the data.
    heatmap_data = (
        df_temp.groupby(['day_of_week', 'hour']).size()
        .unstack(fill_value=0)
        .reindex(range(7), fill_value=0)
    )
    sns.heatmap(heatmap_data, cmap='YlOrRd', cbar_kws={'label': 'Number of Crimes'})
    plt.title('Crime Heatmap: Day vs Hour')
    plt.xlabel('Hour')
    plt.ylabel('Day of Week')
    # Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Ticks at
    # 0..6 would put the day names on the cell borders.
    plt.yticks(np.arange(7) + 0.5, ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)

# 8. Monthly trend with Part 1-2
plt.subplot(3, 3, 8)
if 'Part 1-2' in df_temp.columns:
    # Rows: month; columns: the 'Part 1-2' values in sorted order.
    monthly_part = df_temp.groupby(['month', 'Part 1-2']).size().unstack(fill_value=0)
    # DataFrame.plot opens a brand-new figure unless it is handed an axes, which
    # would leave this grid cell empty and send later subplots to the wrong figure.
    monthly_part.plot(kind='bar', stacked=True, ax=plt.gca())
    plt.title('Monthly Crimes by Severity (Part 1-2)')
    plt.xlabel('Month')
    plt.ylabel('Number of Crimes')
    plt.legend(['Part 1 (Serious)', 'Part 2 (Less Serious)'])
    plt.xticks(rotation=0)

# 9. Time series plot
# Daily record counts for the most recent 90 calendar dates present in the data.
ax = plt.subplot(3, 3, 9)
if len(df_temp) > 0:
    df_temp['date_only'] = df_temp['DATE OCC'].dt.date
    daily_counts = df_temp.groupby('date_only').size()
    has_enough_days = len(daily_counts) > 30  # Only if we have enough data
    if has_enough_days:
        daily_counts.tail(90).plot(ax=ax)  # Last 90 days
        ax.set_title('Recent Daily Crime Counts (Last 90 days)')
        ax.set_xlabel('Date')
        ax.set_ylabel('Daily Crime Count')
        plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Print temporal insights
# The group keys can be floats (e.g. 12.0) when missing values were present in
# the source columns. They are cast to int so the hour prints as "12:00" rather
# than "12.0:00" and the weekday key is a valid list index for day_names.
peak_hour = int(hourly_crimes.idxmax())
lowest_hour = int(hourly_crimes.idxmin())
peak_day_name = day_names[int(daily_crimes.idxmax())]
peak_month = int(monthly_crimes.idxmax())

print(f"\nTemporal Insights:")
print(f"Peak Crime Hour: {peak_hour}:00 ({hourly_crimes.max():,} crimes)")
print(f"Lowest Crime Hour: {lowest_hour}:00 ({hourly_crimes.min():,} crimes)")
print(f"Peak Crime Day: {peak_day_name} ({daily_crimes.max():,} crimes)")
print(f"Peak Crime Month: {peak_month} ({monthly_crimes.max():,} crimes)")
print(f"Weekend Crime Percentage: {weekend_crimes/(weekend_crimes+weekday_crimes)*100:.1f}%")

In [None]:
# Geographic Analysis - Crime Distribution by Location
print("=== GEOGRAPHIC ANALYSIS ===")

# Keep rows whose coordinates are non-zero and inside the
# 33 < LAT < 35, -119 < LON < -117 box (all bounds exclusive).
has_coordinates = (df['LAT'] != 0) & (df['LON'] != 0)
lat_in_box = (df['LAT'] > 33) & (df['LAT'] < 35)
lon_in_box = (df['LON'] > -119) & (df['LON'] < -117)
df_geo = df[has_coordinates & lat_in_box & lon_in_box]

plt.figure(figsize=(20, 12))

# 1. Overall Crime Distribution Map
# One tiny, mostly transparent dot per record; dense areas show up as solid colour.
geo_lon, geo_lat = df_geo['LON'], df_geo['LAT']
ax = plt.subplot(2, 4, 1)
ax.scatter(geo_lon, geo_lat, alpha=0.1, s=0.1, c='red')
ax.set_title('Crime Geographic Distribution')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True, alpha=0.3)

# 2. Crime Density Heatmap
# Same points binned on a 50 x 50 grid; the colour bar gives the count per bin.
plt.subplot(2, 4, 2)
plt.hist2d(geo_lon, geo_lat, bins=50, cmap='YlOrRd')
plt.title('Crime Density Heatmap')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.colorbar(label='Crime Count')

# 3. Crimes by Area
# Ten area codes with the most records.
ax = plt.subplot(2, 4, 3)
if 'AREA' in df.columns:
    area_crimes = df['AREA'].value_counts().head(10)
    area_crimes.plot(kind='bar', ax=ax)
    ax.set_title('Top 10 Areas by Crime Count')
    ax.set_xlabel('Area Code')
    ax.set_ylabel('Crime Count')
    plt.xticks(rotation=45)

# 4. Crimes by Area Name
# Eight area names with the most records, as horizontal bars.
ax = plt.subplot(2, 4, 4)
if 'AREA NAME' in df.columns:
    area_name_crimes = df['AREA NAME'].value_counts().head(8)
    bar_rows = range(len(area_name_crimes))
    ax.barh(bar_rows, area_name_crimes.values)
    ax.set_yticks(bar_rows)
    ax.set_yticklabels(area_name_crimes.index)
    ax.set_title('Top 8 Areas by Name')
    ax.set_xlabel('Crime Count')

# 5. High Risk vs Low Risk Geographic Distribution
# Part 2 points are drawn first so the Part 1 points are layered on top of them.
ax = plt.subplot(2, 4, 5)
if 'Part 1-2' in df_geo.columns:
    high_risk = df_geo[df_geo['Part 1-2'] == 1]
    low_risk = df_geo[df_geo['Part 1-2'] == 2]

    risk_layers = [
        (low_risk, 'blue', 'Part 2 (Less Serious)'),
        (high_risk, 'red', 'Part 1 (Serious)'),
    ]
    for layer_points, layer_color, layer_label in risk_layers:
        ax.scatter(layer_points['LON'], layer_points['LAT'],
                   alpha=0.3, s=0.5, c=layer_color, label=layer_label)

    ax.set_title('Crime Risk Distribution')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.legend()

# 6. Reporting Districts
# Ten reporting districts with the most records.
ax = plt.subplot(2, 4, 6)
if 'Rpt Dist No' in df.columns:
    district_crimes = df['Rpt Dist No'].value_counts().head(10)
    district_crimes.plot(kind='bar', ax=ax)
    ax.set_title('Top 10 Reporting Districts')
    ax.set_xlabel('District Number')
    ax.set_ylabel('Crime Count')
    plt.xticks(rotation=45)

# 7. Premise Analysis
# Shares are relative to the eight most common premise types only.
ax = plt.subplot(2, 4, 7)
if 'Premis Desc' in df.columns:
    premise_crimes = df['Premis Desc'].value_counts().head(8)
    ax.pie(premise_crimes.values, labels=premise_crimes.index, autopct='%1.1f%%')
    ax.set_title('Crime Distribution by Premise Type')

# 8. Geographic Stats Summary
plt.subplot(2, 4, 8)
plt.axis('off')

# Count records per area once and guard every use of it. Previously only the
# area *names* were guarded, so a missing 'AREA NAME' column still raised a
# KeyError on the counts, and value_counts() ran four times over the full frame.
if 'AREA NAME' in df.columns:
    area_name_counts = df['AREA NAME'].value_counts()
else:
    area_name_counts = pd.Series(dtype='int64')

if len(area_name_counts) > 0:
    # value_counts() is sorted descending: first entry is the busiest area,
    # last entry the quietest.
    most_area_name = area_name_counts.index[0]
    most_area_count = f"{area_name_counts.iloc[0]:,}"
    least_area_name = area_name_counts.index[-1]
    least_area_count = f"{area_name_counts.iloc[-1]:,}"
else:
    most_area_name = least_area_name = 'N/A'
    most_area_count = least_area_count = 'N/A'

geographic_stats = f"""
GEOGRAPHIC STATISTICS

Total Valid Coordinates: {len(df_geo):,}

Latitude Range:
  Min: {df_geo['LAT'].min():.4f}
  Max: {df_geo['LAT'].max():.4f}
  
Longitude Range:
  Min: {df_geo['LON'].min():.4f}
  Max: {df_geo['LON'].max():.4f}

Most Crime-Prone Area:
  {most_area_name}
  ({most_area_count} crimes)

Least Crime Area:
  {least_area_name}
  ({least_area_count} crimes)
"""

plt.text(0.1, 0.9, geographic_stats, transform=plt.gca().transAxes, 
         fontsize=10, verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))

plt.tight_layout()
plt.show()

# Geographic insights
# Text recap: busiest area, number of areas, mean records per area, top-5 share.
print("\nGeographic Insights:")
if 'AREA NAME' in df.columns:
    area_stats = df['AREA NAME'].value_counts()
    busiest_area, busiest_count = area_stats.index[0], area_stats.iloc[0]
    top5_share = area_stats.head(5).sum()/area_stats.sum()*100
    print(f"Highest Crime Area: {busiest_area} ({busiest_count:,} crimes)")
    print(f"Total Areas: {len(area_stats)}")
    print(f"Average Crimes per Area: {area_stats.mean():.0f}")
    print(f"Crime Concentration: Top 5 areas account for {top5_share:.1f}% of all crimes")

In [None]:
# Demographic Analysis - Victim Profiles and Patterns
print("=== DEMOGRAPHIC ANALYSIS ===")

plt.figure(figsize=(18, 12))

# 1. Victim Age Distribution
# Histogram of ages strictly between 0 and 100, with the mean marked.
ax = plt.subplot(3, 4, 1)
if 'Vict Age' in df.columns:
    victim_ages = pd.to_numeric(df['Vict Age'], errors='coerce')
    victim_ages = victim_ages[(victim_ages > 0) & (victim_ages < 100)]  # Filter realistic ages
    mean_victim_age = victim_ages.mean()

    ax.hist(victim_ages, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title('Victim Age Distribution')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.axvline(mean_victim_age, color='red', linestyle='--', label=f'Mean: {mean_victim_age:.1f}')
    ax.legend()

# 2. Victim Sex Distribution
# Shares are relative to the (up to six) most common 'Vict Sex' codes.
ax = plt.subplot(3, 4, 2)
if 'Vict Sex' in df.columns:
    sex_counts = df['Vict Sex'].value_counts().head(6)
    ax.pie(sex_counts.values, labels=sex_counts.index, autopct='%1.1f%%')
    ax.set_title('Victim Gender Distribution')

# 3. Victim Descent Distribution
# Eight most common 'Vict Descent' codes.
ax = plt.subplot(3, 4, 3)
if 'Vict Descent' in df.columns:
    descent_counts = df['Vict Descent'].value_counts().head(8)
    descent_positions = range(len(descent_counts))
    ax.bar(descent_positions, descent_counts.values)
    ax.set_title('Victim Ethnicity Distribution')
    ax.set_xlabel('Ethnicity Code')
    ax.set_ylabel('Count')
    ax.set_xticks(descent_positions)
    ax.set_xticklabels(descent_counts.index, rotation=45)

# 4. Age vs Crime Type (Part 1-2)
# Side-by-side age histograms for 'Part 1-2' == 2 and == 1, ages strictly between 0 and 100.
ax = plt.subplot(3, 4, 4)
if 'Vict Age' in df.columns and 'Part 1-2' in df.columns:
    victim_ages = pd.to_numeric(df['Vict Age'], errors='coerce')
    valid_data = df[(victim_ages > 0) & (victim_ages < 100)]

    part1_ages = valid_data.loc[valid_data['Part 1-2'] == 1, 'Vict Age']
    part2_ages = valid_data.loc[valid_data['Part 1-2'] == 2, 'Vict Age']

    ax.hist([part2_ages, part1_ages], bins=20, alpha=0.7,
            label=['Part 2 (Less Serious)', 'Part 1 (Serious)'])
    ax.set_title('Age Distribution by Crime Severity')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.legend()

# 5. Gender vs Crime Type
plt.subplot(3, 4, 5)
if 'Vict Sex' in df.columns and 'Part 1-2' in df.columns:
    # Rows: victim sex code; columns: the 'Part 1-2' values in sorted order.
    gender_crime = pd.crosstab(df['Vict Sex'], df['Part 1-2'])
    # DataFrame.plot opens a brand-new figure unless it is handed an axes, which
    # would leave this grid cell empty and send later subplots to the wrong figure.
    gender_crime.plot(kind='bar', ax=plt.gca())
    plt.title('Crime Severity by Gender')
    plt.xlabel('Gender')
    plt.ylabel('Count')
    plt.legend(['Part 1 (Serious)', 'Part 2 (Less Serious)'])
    plt.xticks(rotation=45)

# 6. Age Groups Analysis
plt.subplot(3, 4, 6)
if 'Vict Age' in df.columns:
    victim_ages = pd.to_numeric(df['Vict Age'], errors='coerce')
    valid_ages = victim_ages[(victim_ages > 0) & (victim_ages < 100)]

    # right=False makes the bins left-closed: [0, 18), [18, 25), ... [65, 100),
    # which is what the labels describe. With the default right-closed bins an
    # 18-year-old lands in '0-17', a 25-year-old in '18-24', and so on.
    age_groups = pd.cut(valid_ages, bins=[0, 18, 25, 35, 50, 65, 100],
                        labels=['0-17', '18-24', '25-34', '35-49', '50-64', '65+'],
                        right=False)
    age_group_counts = age_groups.value_counts().sort_index()

    plt.bar(range(len(age_group_counts)), age_group_counts.values, color='lightcoral')
    plt.title('Victims by Age Groups')
    plt.xlabel('Age Group')
    plt.ylabel('Count')
    plt.xticks(range(len(age_group_counts)), age_group_counts.index, rotation=45)

# 7. Hour vs Gender
plt.subplot(3, 4, 7)
if 'TIME OCC' in df.columns and 'Vict Sex' in df.columns:
    # TIME OCC is a 24h clock value stored as a number (2250 -> 22:50), so early
    # times lose their leading zeros (845 is 08:45). Integer division by 100
    # gives the hour for every width; taking the first two characters of the
    # string would read 845 as hour 84.
    # NOTE: df_temp is created by the temporal-analysis cell, which must run first.
    df_temp['hour'] = pd.to_numeric(df_temp['TIME OCC'], errors='coerce') // 100
    df_temp['hour'] = df_temp['hour'].fillna(df_temp['hour'].median())

    # Rows: hour of day; columns: victim sex code.
    gender_hour = df_temp.groupby(['hour', 'Vict Sex']).size().unstack(fill_value=0)
    top_genders = df['Vict Sex'].value_counts().head(3).index

    for gender in top_genders:
        if gender in gender_hour.columns:
            plt.plot(gender_hour.index, gender_hour[gender], label=gender, marker='o', markersize=3)

    plt.title('Crime Patterns by Hour and Gender')
    plt.xlabel('Hour')
    plt.ylabel('Crime Count')
    plt.legend()
    plt.grid(True, alpha=0.3)

# 8. Demographic Summary Stats
# Text-only panel; the summary is assembled piece by piece and joined at the end.
ax = plt.subplot(3, 4, 8)
ax.axis('off')

stats_chunks = []
if 'Vict Age' in df.columns:
    ages = pd.to_numeric(df['Vict Age'], errors='coerce')
    valid_ages = ages[(ages > 0) & (ages < 100)]
    age_modes = valid_ages.mode()
    most_common_age = age_modes.iloc[0] if len(age_modes) > 0 else 'N/A'
    stats_chunks.append(
        f"Age Statistics:\n  Mean: {valid_ages.mean():.1f}\n  Median: {valid_ages.median():.1f}\n  Most Common: {most_common_age}\n\n"
    )

if 'Vict Sex' in df.columns:
    gender_stats = df['Vict Sex'].value_counts()
    stats_chunks.append("Gender Distribution:\n")
    # Percentages are relative to all rows in df.
    for gender, count in gender_stats.head(3).items():
        stats_chunks.append(f"  {gender}: {count:,} ({count/len(df)*100:.1f}%)\n")

stats_chunks.append(f"\nTotal Victims Analyzed: {len(df):,}")
demo_stats = "".join(stats_chunks)

stats_box = dict(boxstyle="round,pad=0.3", facecolor="lightyellow")
ax.text(0.1, 0.9, demo_stats, transform=ax.transAxes,
        fontsize=10, verticalalignment='top', fontfamily='monospace',
        bbox=stats_box)

# 9-12: Additional demographic plots
plt.subplot(3, 4, 9)
if 'Vict Sex' in df.columns and 'Vict Age' in df.columns:
    # Age boxplot by gender.
    # Use the same "realistic age" filter as the other panels (0 < age < 100).
    # An inclusive between(0, 100) would also keep age 0, which the other panels
    # deliberately exclude, and would drag every box towards zero.
    ages_numeric = pd.to_numeric(df['Vict Age'], errors='coerce')
    valid_df = df[(ages_numeric > 0) & (ages_numeric < 100)]
    top_genders = valid_df['Vict Sex'].value_counts().head(3).index

    age_by_gender = [pd.to_numeric(valid_df[valid_df['Vict Sex'] == gender]['Vict Age'], errors='coerce').dropna() 
                     for gender in top_genders]

    if len(age_by_gender) > 0:
        plt.boxplot(age_by_gender)
        # Boxes are drawn at positions 1..N. Labelling them through xticks
        # avoids boxplot's `labels=` keyword, which newer matplotlib deprecates.
        plt.xticks(range(1, len(top_genders) + 1), top_genders)
    plt.title('Age Distribution by Gender')
    plt.xlabel('Gender')
    plt.ylabel('Age')

plt.subplot(3, 4, 10)
if 'Vict Descent' in df.columns and 'Part 1-2' in df.columns:
    # Ethnicity vs Crime Severity
    # Rows: descent code (sorted alphabetically); columns: 'Part 1-2' values.
    descent_crime = pd.crosstab(df['Vict Descent'], df['Part 1-2'])
    # Pick the six codes with the most records. head(6) on the crosstab would
    # return the first six codes in alphabetical order, not the "Top 6".
    top_descents = descent_crime.sum(axis=1).nlargest(6).index
    # DataFrame.plot opens a brand-new figure unless it is handed an axes.
    descent_crime.loc[top_descents].plot(kind='bar', stacked=True, ax=plt.gca())
    plt.title('Crime Severity by Ethnicity (Top 6)')
    plt.xlabel('Ethnicity')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.legend(['Part 1', 'Part 2'])

ax = plt.subplot(3, 4, 11)
# Age vs Hour correlation
# One faint dot per record with an age strictly between 0 and 100.
# Reads the 'hour' column already present on df_temp.
if 'Vict Age' in df.columns and 'TIME OCC' in df.columns:
    df_temp['age'] = pd.to_numeric(df_temp['Vict Age'], errors='coerce')
    age_is_realistic = df_temp['age'].gt(0) & df_temp['age'].lt(100)
    valid_temp = df_temp[age_is_realistic]

    ax.scatter(valid_temp['hour'], valid_temp['age'], alpha=0.1, s=1)
    ax.set_title('Age vs Hour of Crime')
    ax.set_xlabel('Hour')
    ax.set_ylabel('Age')
    ax.grid(True, alpha=0.3)

ax = plt.subplot(3, 4, 12)
# Crime count by demographics
# Eight most frequent (sex, descent) combinations, labelled "sex-descent".
if 'Vict Sex' in df.columns and 'Vict Descent' in df.columns:
    demo_combo = df.groupby(['Vict Sex', 'Vict Descent']).size().reset_index(name='count')
    top_combos = demo_combo.nlargest(8, 'count')

    labels = [f"{sex}-{descent}"
              for sex, descent in zip(top_combos['Vict Sex'], top_combos['Vict Descent'])]
    combo_rows = range(len(top_combos))
    ax.barh(combo_rows, top_combos['count'])
    ax.set_yticks(combo_rows)
    ax.set_yticklabels(labels)
    ax.set_title('Top Victim Demographics')
    ax.set_xlabel('Crime Count')

plt.tight_layout()
plt.show()

# Print demographic insights
print(f"\nDemographic Insights:")
if 'Vict Age' in df.columns:
    ages = pd.to_numeric(df['Vict Age'], errors='coerce')
    valid_ages = ages[(ages > 0) & (ages < 100)]
    print(f"Average Victim Age: {valid_ages.mean():.1f} years")
    # Left-closed bins ([0, 18), [18, 25), ...) with the same labels as the
    # age-group bar chart. The default right-closed bins would count an
    # 18-year-old in the youngest group and report a raw interval like "(25, 35]".
    age_group_counts = pd.cut(
        valid_ages, bins=[0, 18, 25, 35, 50, 65, 100],
        labels=['0-17', '18-24', '25-34', '35-49', '50-64', '65+'],
        right=False,
    ).value_counts()
    print(f"Most Vulnerable Age Group: {age_group_counts.idxmax()}")

if 'Vict Sex' in df.columns:
    gender_stats = df['Vict Sex'].value_counts()
    print(f"Most Affected Gender: {gender_stats.index[0]} ({gender_stats.iloc[0]:,} cases, {gender_stats.iloc[0]/len(df)*100:.1f}%)")

In [None]:
# Crime Type and Severity Analysis
print("=== CRIME TYPE AND SEVERITY ANALYSIS ===")

plt.figure(figsize=(18, 10))

# 1. Crime Code Distribution
plt.subplot(2, 4, 1)
if 'Crm Cd' in df.columns:
    crime_codes = df['Crm Cd'].value_counts().head(10)
    plt.bar(range(len(crime_codes)), crime_codes.values)
    plt.title('Top 10 Crime Codes')
    plt.xlabel('Crime Code Rank')
    plt.ylabel('Count')
    plt.xticks(range(len(crime_codes)), [f'{int(code)}' for code in crime_codes.index], rotation=45)

# 2. Crime Description
plt.subplot(2, 4, 2)
if 'Crm Cd Desc' in df.columns:
    crime_desc = df['Crm Cd Desc'].value_counts().head(8)
    plt.barh(range(len(crime_desc)), crime_desc.values)
    plt.yticks(range(len(crime_desc)), [desc[:20] + '...' if len(desc) > 20 else desc for desc in crime_desc.index])
    plt.title('Top Crime Types')
    plt.xlabel('Count')

# 3. Part 1 vs Part 2 Distribution
plt.subplot(2, 4, 3)
if 'Part 1-2' in df.columns:
    # value_counts() orders by frequency, not by value, so the first slice is
    # whichever part is more common. Reindexing to [1, 2] pins the slice order
    # to the fixed labels/colours below. Otherwise the labels are swapped
    # whenever Part 2 outnumbers Part 1.
    part_dist = df['Part 1-2'].value_counts().reindex([1, 2], fill_value=0)
    labels = ['Part 1 (Serious)', 'Part 2 (Less Serious)']
    colors = ['red', 'orange']
    plt.pie(part_dist.values, labels=labels, colors=colors, autopct='%1.1f%%')
    plt.title('Crime Severity Distribution')

# Panel 4: case status frequencies (eight most common values)
ax = plt.subplot(2, 4, 4)
if 'Status Desc' in df.columns:
    status_counts = df['Status Desc'].value_counts().head(8)
    status_positions = range(len(status_counts))
    # Tick labels are clipped to ten characters.
    status_labels = [status[:10] for status in status_counts.index]
    ax.bar(status_positions, status_counts.values, color='lightblue')
    ax.set_title('Case Status Distribution')
    ax.set_xlabel('Status Type')
    ax.set_ylabel('Count')
    ax.set_xticks(status_positions)
    ax.set_xticklabels(status_labels, rotation=45)

# Panel 5: weapon description frequencies (eight most common values)
ax = plt.subplot(2, 4, 5)
if 'Weapon Desc' in df.columns:
    weapon_counts = df['Weapon Desc'].value_counts().head(8)
    weapon_positions = range(len(weapon_counts))
    # Tick labels are clipped to fifteen characters.
    weapon_labels = [weapon[:15] for weapon in weapon_counts.index]
    ax.barh(weapon_positions, weapon_counts.values, color='darkred')
    ax.set_yticks(weapon_positions)
    ax.set_yticklabels(weapon_labels)
    ax.set_title('Weapon Types Used')
    ax.set_xlabel('Count')

# 6. Crime Severity by Hour
plt.subplot(2, 4, 6)
if 'Part 1-2' in df.columns and 'TIME OCC' in df.columns:
    # Build the hour from df directly instead of relying on a `df_temp` frame
    # that this cell never creates (and previously mutated in place).
    # TIME OCC values look like HHMM numbers (2000, 2250, 1610), so the hour is
    # value // 100; slicing the first two characters of the string would turn
    # 845 (08:45) into hour 84.
    # NOTE(review): assumes TIME OCC is 24-hour HHMM - confirm against the data dictionary.
    time_occ = pd.to_numeric(df['TIME OCC'], errors='coerce')
    hour = time_occ // 100
    hour = hour.where(hour.between(0, 23))  # out-of-range values become missing
    hour = hour.fillna(hour.median())

    severity_by_hour = pd.DataFrame({'hour': hour, 'Part 1-2': df['Part 1-2']})
    hourly_severity = severity_by_hour.groupby(['hour', 'Part 1-2']).size().unstack(fill_value=0)
    # unstack() orders the columns by code, which matches the legend order below.
    hourly_severity.plot(kind='line', ax=plt.gca(), color=['red', 'orange'])
    plt.title('Crime Severity Patterns by Hour')
    plt.xlabel('Hour')
    plt.ylabel('Crime Count')
    plt.legend(['Part 1 (Serious)', 'Part 2 (Less Serious)'])
    plt.grid(True, alpha=0.3)

# Panel 7: most frequent MO (modus operandi) code strings
ax = plt.subplot(2, 4, 7)
if 'Mocodes' in df.columns:
    has_mo = df['Mocodes'].notna()
    mo_counts = df.loc[has_mo, 'Mocodes'].value_counts().head(8)
    if mo_counts.empty:
        # Nothing to plot: show a centred placeholder message instead.
        ax.text(0.5, 0.5, 'No MO Data Available', ha='center', va='center', transform=ax.transAxes)
    else:
        mo_positions = range(len(mo_counts))
        ax.bar(mo_positions, mo_counts.values, color='purple', alpha=0.7)
        ax.set_xlabel('MO Code')
        ax.set_ylabel('Count')
        ax.set_xticks(mo_positions)
        ax.set_xticklabels(mo_counts.index, rotation=45)
    # Both outcomes carry the same panel title.
    ax.set_title('Modus Operandi Patterns')

# 8. Crime Trends Summary
# Text-only panel: severity split, most common crime and case outcomes.
plt.subplot(2, 4, 8)
plt.axis('off')

crime_summary = f"""
CRIME TYPE SUMMARY

Total Crime Records: {len(df):,}
"""

if 'Part 1-2' in df.columns:
    part_counts = df['Part 1-2'].value_counts()
    crime_summary += f"""
Severity Breakdown:
  Part 1 (Serious): {part_counts.get(1, 0):,} ({part_counts.get(1, 0)/len(df)*100:.1f}%)
  Part 2 (Less Serious): {part_counts.get(2, 0):,} ({part_counts.get(2, 0)/len(df)*100:.1f}%)
"""

if 'Crm Cd Desc' in df.columns:
    crime_desc_counts = df['Crm Cd Desc'].value_counts()
    top_crime = crime_desc_counts.iloc[0]
    top_crime_name = crime_desc_counts.index[0]
    # Only mark the name with an ellipsis when it was actually shortened.
    if len(top_crime_name) > 25:
        top_crime_name = top_crime_name[:25] + '...'
    crime_summary += f"""
Most Common Crime:
  {top_crime_name}
  ({top_crime:,} cases)
"""

if 'Status Desc' in df.columns:
    status_stats = df['Status Desc'].value_counts()
    # The previous lookup used 'INVESTIGATED' and 'CLOSED', which are not values
    # of this column (it holds labels such as 'Invest Cont' and 'Adult Arrest'),
    # so the rate was always 0.0%. A case is counted as resolved when its status
    # is anything other than an open/unknown label.
    # NOTE(review): 'UNK' is assumed to be the unknown-status label - confirm against the data.
    open_statuses = ['Invest Cont', 'UNK']
    resolved_cases = status_stats.drop(labels=open_statuses, errors='ignore').sum()
    crime_summary += f"""
Case Resolution:
  {status_stats.index[0]}: {status_stats.iloc[0]:,} cases
  Resolution Rate: {resolved_cases/len(df)*100:.1f}%
"""

plt.text(0.1, 0.9, crime_summary, transform=plt.gca().transAxes, 
         fontsize=9, verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcyan"))

plt.tight_layout()
plt.show()

# Console recap of the crime-type figure.
print("\nCrime Type Insights:")
if 'Crm Cd Desc' in df.columns:
    crime_types = df['Crm Cd Desc'].value_counts()
    leading_crime = crime_types.index[0]
    leading_crime_cases = crime_types.iloc[0]
    # Share of all described cases covered by the ten most common descriptions.
    top10_share = crime_types.head(10).sum() / crime_types.sum() * 100
    print(f"Most Common Crime: {leading_crime} ({leading_crime_cases:,} cases)")
    print(f"Total Crime Types: {len(crime_types)}")
    print(f"Crime Concentration: Top 10 crimes represent {top10_share:.1f}% of all cases")

if 'Part 1-2' in df.columns:
    part_stats = df['Part 1-2'].value_counts()
    part1_share = part_stats.get(1, 0) / len(df) * 100
    print(f"Serious Crime Rate (Part 1): {part1_share:.1f}%")

In [None]:
# Correlation Analysis and Feature Relationships
# Builds df_corr (df plus derived time/target columns) and df_numerical
# (numeric-only, median-imputed) for the correlation figure that follows.
print("=== CORRELATION ANALYSIS ===")

# Candidate numeric columns; only those present in the data are used below.
numerical_features = ['LAT', 'LON', 'AREA', 'Rpt Dist No', 'Vict Age']

# Always work on a copy: df stays untouched and df_corr exists even when
# 'DATE OCC' is missing (previously that case raised NameError further down).
df_corr = df.copy()

# Calendar features from the occurrence date
if 'DATE OCC' in df_corr.columns:
    df_corr['DATE OCC'] = pd.to_datetime(df_corr['DATE OCC'], errors='coerce')
    df_corr['year'] = df_corr['DATE OCC'].dt.year
    df_corr['month'] = df_corr['DATE OCC'].dt.month
    df_corr['day_of_week'] = df_corr['DATE OCC'].dt.dayofweek
    numerical_features.extend(['year', 'month', 'day_of_week'])

# Hour of day from the occurrence time
if 'TIME OCC' in df_corr.columns:
    # TIME OCC values look like HHMM numbers (2000, 2250, 1610), so the hour is
    # value // 100; taking the first two characters of the string would turn
    # 845 (08:45) into hour 84.
    # NOTE(review): assumes TIME OCC is 24-hour HHMM - confirm against the data dictionary.
    time_occ = pd.to_numeric(df_corr['TIME OCC'], errors='coerce')
    hour = time_occ // 100
    hour = hour.where(hour.between(0, 23))  # out-of-range values become missing
    df_corr['hour'] = hour.fillna(hour.median())
    numerical_features.append('hour')

# Add target variable: 1 for Part 1 records, 0 otherwise
if 'Part 1-2' in df_corr.columns:
    df_corr['high_risk'] = (df_corr['Part 1-2'] == 1).astype(int)
    numerical_features.append('high_risk')

# Select available numerical features
available_numerical = [col for col in numerical_features if col in df_corr.columns]

# Clean numerical data: coerce every feature (but not the target) to numeric
df_numerical = df_corr[available_numerical].copy()
for col in df_numerical.columns:
    if col != 'high_risk':
        df_numerical[col] = pd.to_numeric(df_numerical[col], errors='coerce')

# Fill missing values with each column's median
df_numerical = df_numerical.fillna(df_numerical.median())

# Open the 2x3 correlation figure.
plt.figure(figsize=(16, 12))

# Panel 1: lower-triangle heatmap of pairwise correlations
ax = plt.subplot(2, 3, 1)
correlation_matrix = df_numerical.corr()
# Hide the upper triangle (diagonal included); the matrix is symmetric.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
heatmap_options = dict(
    annot=True, cmap='RdYlBu_r', center=0,
    square=True, fmt='.2f', cbar_kws={'shrink': 0.8},
)
sns.heatmap(correlation_matrix, mask=mask, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix')

# Panel 2: absolute correlation of every feature with the high_risk target
ax = plt.subplot(2, 3, 2)
if 'high_risk' in df_numerical.columns:
    target_column = correlation_matrix['high_risk'].drop('high_risk')
    risk_correlations = target_column.abs().sort_values(ascending=False)
    feature_positions = range(len(risk_correlations))
    ax.bar(feature_positions, risk_correlations.values, color='crimson', alpha=0.7)
    ax.set_title('Features Correlated with High Risk')
    ax.set_xlabel('Features')
    ax.set_ylabel('Absolute Correlation')
    ax.set_xticks(feature_positions)
    ax.set_xticklabels(risk_correlations.index, rotation=45)

# Panel 3: latitude and longitude plotted against hour on one shared axis
ax = plt.subplot(2, 3, 3)
if {'hour', 'LAT', 'LON'}.issubset(df_numerical.columns):
    hours = df_numerical['hour']
    ax.scatter(hours, df_numerical['LAT'], alpha=0.1, s=1, c='blue', label='Latitude')
    ax.scatter(hours, df_numerical['LON'], alpha=0.1, s=1, c='red', label='Longitude')
    ax.set_title('Time vs Geographic Distribution')
    ax.set_xlabel('Hour')
    ax.set_ylabel('Coordinate Value')
    ax.legend()

# Panel 4: victim age distribution split by the high_risk flag
ax = plt.subplot(2, 3, 4)
if {'Vict Age', 'high_risk'}.issubset(df_numerical.columns):
    victim_age = df_numerical['Vict Age']
    # Keep rows whose age lies strictly between 0 and 100.
    valid_ages = df_numerical[victim_age.gt(0) & victim_age.lt(100)]

    low_risk_ages = valid_ages.loc[valid_ages['high_risk'] == 0, 'Vict Age']
    high_risk_ages = valid_ages.loc[valid_ages['high_risk'] == 1, 'Vict Age']

    ax.boxplot([low_risk_ages, high_risk_ages], labels=['Low Risk', 'High Risk'])
    ax.set_title('Victim Age by Risk Level')
    ax.set_ylabel('Age')

# 5. Geographic Risk Patterns
plt.subplot(2, 3, 5)
if all(col in df_numerical.columns for col in ['LAT', 'LON', 'high_risk']):
    high_risk_crimes = df_numerical[df_numerical['high_risk'] == 1]
    low_risk_crimes = df_numerical[df_numerical['high_risk'] == 0]

    # Down-sample each group on its own. Previously both groups were sampled
    # whenever the high-risk group exceeded the cap, so a low-risk group with
    # fewer rows than the cap made DataFrame.sample raise ValueError.
    max_points = 5000
    if len(high_risk_crimes) > max_points:
        high_risk_sample = high_risk_crimes.sample(max_points, random_state=42)
    else:
        high_risk_sample = high_risk_crimes
    if len(low_risk_crimes) > max_points:
        low_risk_sample = low_risk_crimes.sample(max_points, random_state=42)
    else:
        low_risk_sample = low_risk_crimes

    # Low risk is drawn first so the high-risk points sit on top.
    plt.scatter(low_risk_sample['LON'], low_risk_sample['LAT'], 
               alpha=0.3, s=1, c='blue', label='Low Risk')
    plt.scatter(high_risk_sample['LON'], high_risk_sample['LAT'], 
               alpha=0.5, s=1, c='red', label='High Risk')
    plt.title('Geographic Risk Distribution')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()

# 6. Correlation Summary Stats
# Text-only panel listing the strongest/weakest target correlations and any
# strongly correlated feature pairs.
plt.subplot(2, 3, 6)
plt.axis('off')

corr_summary = "CORRELATION INSIGHTS\n\n"

if 'high_risk' in correlation_matrix.columns:
    # Rank features by absolute correlation with the target.
    risk_corr = correlation_matrix['high_risk'].drop('high_risk').abs().sort_values(ascending=False)
    corr_summary += "Strongest Risk Predictors:\n"
    for rank, (feature, corr_val) in enumerate(risk_corr.head(5).items(), start=1):
        corr_summary += f"  {rank}. {feature}: {corr_val:.3f}\n"

    corr_summary += "\nWeakest Risk Predictors:\n"
    for feature, corr_val in risk_corr.tail(3).items():
        corr_summary += f"  {feature}: {corr_val:.3f}\n"

# Find highly correlated feature pairs: each unordered pair is visited once
# (upper triangle only) and kept when |r| exceeds 0.7.
high_corr_pairs = []
corr_columns = list(correlation_matrix.columns)
for i, col1 in enumerate(corr_columns):
    for j in range(i + 1, len(corr_columns)):
        pair_corr = correlation_matrix.iloc[i, j]
        if abs(pair_corr) > 0.7:
            high_corr_pairs.append((col1, corr_columns[j], pair_corr))

if high_corr_pairs:
    corr_summary += "\nHighly Correlated Pairs (>0.7):\n"
    for col1, col2, corr_val in high_corr_pairs[:3]:
        # The separator is a real left-right arrow; it had been stored as
        # mis-decoded bytes and rendered as garbage characters.
        corr_summary += f"  {col1} ↔ {col2}: {corr_val:.3f}\n"

plt.text(0.1, 0.9, corr_summary, transform=plt.gca().transAxes, 
         fontsize=10, verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen", alpha=0.7))

plt.tight_layout()
plt.show()

# Print correlation insights
print("\nCorrelation Analysis Results:")
print(f"Total numerical features analyzed: {len(available_numerical)}")

if 'high_risk' in correlation_matrix.columns:
    risk_correlations = correlation_matrix['high_risk'].drop('high_risk').abs().sort_values(ascending=False)
    print("\nStrongest predictors of high-risk crimes:")
    for i, (feature, corr) in enumerate(risk_correlations.head(3).items(), 1):
        print(f"  {i}. {feature}: {corr:.3f} correlation")

    print("\nFeature pairs with high correlation (potential multicollinearity):")
    if high_corr_pairs:
        for col1, col2, corr_val in high_corr_pairs[:3]:
            # The separator is a real left-right arrow; it had been stored as
            # mis-decoded bytes and printed as garbage characters.
            print(f"  {col1} ↔ {col2}: {corr_val:.3f}")
    else:
        # Say so explicitly instead of leaving the heading with nothing under it.
        print("  None above the 0.7 threshold")

# Statistical significance tests (skipped with a notice when scipy is missing)
try:
    from scipy.stats import chi2_contingency
except ImportError:
    print("\nScipy not available for statistical significance tests")
else:
    # Chi-square test of independence between victim sex and the Part 1/2 class
    if {'Vict Sex', 'Part 1-2'}.issubset(df.columns):
        contingency_table = pd.crosstab(df['Vict Sex'], df['Part 1-2'])
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        verdict = 'Significant' if p_value < 0.05 else 'Not significant'
        print("\nChi-square test (Gender vs Crime Severity):")
        print(f"  p-value: {p_value:.4f} ({verdict})")

## EDA Summary and Key Insights

### Target Variable: High Risk Crime Classification
- **Dependent Variable**: `high_risk` (binary: 1 for Part I crimes, 0 for Part II crimes)
- **Independent Variables**: Temporal, Geographic, and Demographic features

### Key Findings from Exploratory Data Analysis:

**Temporal Patterns:**
- Peak crime hours: Evening and late night periods show highest activity
- Weekly patterns: Certain days show higher crime concentration  
- Seasonal trends: Monthly variations in crime frequency and types
- Weekend vs weekday differences in crime patterns

**Geographic Distribution:**  
- Crime hotspots concentrate in specific LAT/LON coordinates
- Area codes show distinct crime density patterns
- Spatial clustering of high-risk vs low-risk crimes
- Geographic correlation with demographic factors

**Demographic Insights:**
- Age distribution varies by crime severity level
- Gender patterns differ between crime types
- Victim demographics correlate with crime risk levels
- Age-risk relationships show distinct patterns

**Crime Type Analysis:**
- Part I crimes (high-risk) vs Part II crimes show different characteristics
- Weapon usage patterns vary by crime severity
- Crime status distribution indicates resolution patterns
- Modus operandi patterns provide behavioral insights

**Feature Relationships:**
- Strong correlations between temporal and geographic variables
- Age and location show interaction effects
- Multicollinearity considerations for model building
- Feature importance varies by crime type

### Data Quality Assessment:
- Missing value patterns identified and addressed
- Outlier detection completed for key variables
- Data distribution characteristics documented
- Feature engineering opportunities identified

This comprehensive EDA provides the foundation for building predictive models to classify high-risk crimes based on temporal, geographic, and demographic patterns in Los Angeles crime data.

## District/Area-Wise Crime Distribution Analysis

Comprehensive analysis of crime distribution across Los Angeles police districts and areas for strategic resource allocation and targeted law enforcement.

In [None]:
# === COMPREHENSIVE DISTRICT/AREA-WISE CRIME ANALYSIS ===
# Console report of record counts per police area name.
# The emoji markers in the printed lines had been stored as mis-decoded bytes
# (printing as garbage characters); they are restored to the intended symbols.
print("="*80)
print("         COMPLETE LOS ANGELES DISTRICT CRIME DISTRIBUTION")
print("="*80)

# Check available district/area columns
district_columns = ['AREA NAME', 'AREA', 'Rpt Dist No']
available_district_cols = [col for col in district_columns if col in df.columns]
print(f"Available district information: {available_district_cols}")

# 1. AREA NAME Analysis (Police Districts/Divisions)
if 'AREA NAME' in df.columns:
    # Counts per area name, largest first.
    area_name_stats = df['AREA NAME'].value_counts().sort_values(ascending=False)
    total_records = len(df)

    print(f"\n📍 TOTAL LA POLICE DISTRICTS (AREA NAMES): {len(area_name_stats)}")
    print(f"📊 TOTAL CRIME RECORDS: {total_records:,}")
    print(f"📈 AVERAGE CRIMES PER DISTRICT: {area_name_stats.mean():.0f}")
    print(f"📉 MEDIAN CRIMES PER DISTRICT: {area_name_stats.median():.0f}")
    print(f"🔥 HIGHEST CRIME DISTRICT: {area_name_stats.index[0]} ({area_name_stats.iloc[0]:,} crimes)")
    print(f"🟢 LOWEST CRIME DISTRICT: {area_name_stats.index[-1]} ({area_name_stats.iloc[-1]:,} crimes)")

    print("\n🏆 ALL DISTRICTS RANKED BY CRIME COUNT:")
    print("-" * 70)
    for i, (district, count) in enumerate(area_name_stats.items(), 1):
        percentage = (count / total_records) * 100
        print(f"{i:2d}. {district:<35} {count:>8,} crimes ({percentage:>5.1f}%)")

    # Cumulative share of records held by the 5, 10 and 15 busiest districts
    print("\n📊 DISTRICT CRIME CONCENTRATION:")
    top_5_crimes = area_name_stats.head(5).sum()
    top_10_crimes = area_name_stats.head(10).sum()
    top_15_crimes = area_name_stats.head(15).sum()

    print(f"   Top 5 districts:  {top_5_crimes:>8,} crimes ({top_5_crimes/total_records*100:>5.1f}%)")
    print(f"   Top 10 districts: {top_10_crimes:>8,} crimes ({top_10_crimes/total_records*100:>5.1f}%)")
    print(f"   Top 15 districts: {top_15_crimes:>8,} crimes ({top_15_crimes/total_records*100:>5.1f}%)")

# 2. AREA Code Analysis (Numeric Area Codes)
# The emoji markers in the printed lines had been stored as mis-decoded bytes
# (printing as garbage characters); they are restored to the intended symbols.
if 'AREA' in df.columns:
    area_code_stats = df['AREA'].value_counts().sort_values(ascending=False)

    print(f"\n📍 TOTAL AREA CODES: {len(area_code_stats)}")
    print(f"📈 AVERAGE CRIMES PER AREA CODE: {area_code_stats.mean():.0f}")

    print("\n🔥 TOP 20 AREA CODES BY CRIME COUNT:")
    print("-" * 50)
    for i, (area_code, count) in enumerate(area_code_stats.head(20).items(), 1):
        percentage = (count / len(df)) * 100
        print(f"{i:2d}. Area Code {area_code:<8} {count:>8,} crimes ({percentage:>5.1f}%)")

# 3. Reporting Districts Analysis (More granular)
if 'Rpt Dist No' in df.columns:
    rpt_dist_stats = df['Rpt Dist No'].value_counts().sort_values(ascending=False)

    print(f"\n📍 TOTAL REPORTING DISTRICTS: {len(rpt_dist_stats)}")
    print(f"📈 AVERAGE CRIMES PER REPORTING DISTRICT: {rpt_dist_stats.mean():.0f}")

    print("\n🔥 TOP 20 REPORTING DISTRICTS BY CRIME COUNT:")
    print("-" * 60)
    for i, (district, count) in enumerate(rpt_dist_stats.head(20).items(), 1):
        percentage = (count / len(df)) * 100
        print(f"{i:2d}. Reporting District {district:<8} {count:>8,} crimes ({percentage:>5.1f}%)")

# Open the 4x4 district figure; every panel below draws into it.
plt.figure(figsize=(25, 20))

# Panel 1: record count for every area name, busiest at the top
ax = plt.subplot(4, 4, 1)
if 'AREA NAME' in df.columns:
    all_districts = df['AREA NAME'].value_counts()
    bar_positions = range(len(all_districts))
    bar_colors = plt.cm.RdYlBu_r(np.linspace(0.2, 0.8, len(all_districts)))
    # Names longer than 20 characters are clipped and marked with '...'.
    district_labels = [
        name[:20] + '...' if len(name) > 20 else name
        for name in all_districts.index
    ]
    ax.barh(bar_positions, all_districts.values, color=bar_colors)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(district_labels)
    ax.set_xlabel('Crime Count')
    ax.set_title(f'ALL {len(all_districts)} LA POLICE DISTRICTS\n(AREA NAMES)')
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3)

# Panel 2: the fifteen busiest area names with their counts written out
ax = plt.subplot(4, 4, 2)
if 'AREA NAME' in df.columns:
    top_15 = df['AREA NAME'].value_counts().head(15)
    bar_positions = range(len(top_15))
    bar_colors = plt.cm.Reds(np.linspace(0.3, 1, len(top_15)))
    bars = ax.barh(bar_positions, top_15.values, color=bar_colors)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([name[:18] for name in top_15.index])
    ax.set_xlabel('Crime Count')
    ax.set_title('TOP 15 DISTRICTS (Detailed)')
    ax.invert_yaxis()

    # Write each count just past the end of its bar, vertically centred.
    for bar in bars:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.text(bar_length + 100, bar_middle, f'{int(bar_length):,}',
                ha='left', va='center', fontsize=8)

# Panel 3: share of records for the ten busiest area names plus one "Others" wedge
ax = plt.subplot(4, 4, 3)
if 'AREA NAME' in df.columns:
    district_counts = df['AREA NAME'].value_counts()
    top_10 = district_counts.head(10)
    others = district_counts.iloc[10:].sum()

    plot_data = [*top_10.values, others]
    plot_labels = [*top_10.index, f'Others ({len(district_counts)-10} districts)']
    # Wedge labels longer than 12 characters are clipped and marked with '...'.
    wedge_labels = [
        label[:12] + '...' if len(label) > 12 else label
        for label in plot_labels
    ]
    wedge_colors = plt.cm.Set3(np.linspace(0, 1, len(plot_data)))

    wedges, texts, autotexts = ax.pie(
        plot_data, labels=wedge_labels,
        autopct='%1.1f%%', startangle=90, colors=wedge_colors,
    )
    ax.set_title('District Crime Share\n(Top 10 + Others)')

    # Percentage captions inside the wedges: white and bold.
    for caption in autotexts:
        caption.set_color('white')
        caption.set_weight('bold')

# Panel 4: record counts of the twenty busiest numeric area codes, by rank
ax = plt.subplot(4, 4, 4)
if 'AREA' in df.columns:
    top_area_codes = df['AREA'].value_counts().head(20)
    ax.bar(range(len(top_area_codes)), top_area_codes.values, color='orange', alpha=0.8)
    ax.set_xlabel('Area Code Rank')
    ax.set_ylabel('Crime Count')
    ax.set_title('TOP 20 AREA CODES')
    # Label every third bar with its 1-based rank.
    labelled_positions = range(0, len(top_area_codes), 3)
    ax.set_xticks(labelled_positions)
    ax.set_xticklabels([f'#{pos+1}' for pos in labelled_positions], rotation=45)
    ax.grid(True, alpha=0.3)

# Panel 5: histogram of per-district record counts with mean/median markers
ax = plt.subplot(4, 4, 5)
if 'AREA NAME' in df.columns:
    district_crimes = df['AREA NAME'].value_counts().values
    mean_crimes = np.mean(district_crimes)
    median_crimes = np.median(district_crimes)
    ax.hist(district_crimes, bins=15, color='skyblue', alpha=0.7, edgecolor='black')
    ax.set_xlabel('Crime Count per District')
    ax.set_ylabel('Number of Districts')
    ax.set_title('Crime Distribution Across Districts')
    ax.axvline(mean_crimes, color='red', linestyle='--',
               label=f'Mean: {mean_crimes:.0f}')
    ax.axvline(median_crimes, color='green', linestyle='--',
               label=f'Median: {median_crimes:.0f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Panel 6: heatmap of the ten busiest districts against the eight most common
# crime categories; drawn only when df has a 'crime_category' column.
ax = plt.subplot(4, 4, 6)
if 'AREA NAME' in df.columns and 'crime_category' in df.columns:
    top_10_districts = df['AREA NAME'].value_counts().head(10).index
    top_8_crimes = df['crime_category'].value_counts().head(8).index

    in_top_districts = df['AREA NAME'].isin(top_10_districts)
    district_crime_crosstab = pd.crosstab(
        df.loc[in_top_districts, 'AREA NAME'],
        df.loc[in_top_districts, 'crime_category'],
    )

    # Keep only the top categories that actually occur in those districts.
    available_crimes = [crime for crime in top_8_crimes if crime in district_crime_crosstab.columns]
    if available_crimes:
        district_crime_subset = district_crime_crosstab[available_crimes]

        sns.heatmap(district_crime_subset, annot=True, fmt='d', cmap='YlOrRd',
                    cbar_kws={'label': 'Crime Count'}, ax=ax)
        ax.set_title('Top 10 Districts vs Crime Types')
        ax.set_xlabel('Crime Category')
        ax.set_ylabel('District')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)

# Panel 7: record counts of the 25 busiest reporting districts, by rank
ax = plt.subplot(4, 4, 7)
if 'Rpt Dist No' in df.columns:
    top_rpt_districts = df['Rpt Dist No'].value_counts().head(25)
    rank_positions = range(len(top_rpt_districts))
    rpt_counts = top_rpt_districts.values
    # Drawn through pyplot so that plt.colorbar() below picks up this scatter.
    plt.scatter(rank_positions, rpt_counts,
                alpha=0.7, s=60, c=rpt_counts, cmap='viridis')
    ax.set_xlabel('Reporting District Rank')
    ax.set_ylabel('Crime Count')
    ax.set_title('TOP 25 REPORTING DISTRICTS')
    plt.colorbar(label='Crime Count')
    ax.grid(True, alpha=0.3)

# Panel 8: record count against district rank on a log-scaled y axis
ax = plt.subplot(4, 4, 8)
if 'AREA NAME' in df.columns:
    district_ranking = df['AREA NAME'].value_counts()
    rankings = range(1, len(district_ranking) + 1)

    ax.scatter(rankings, district_ranking.values, alpha=0.6, s=50, c='purple')
    ax.set_xlabel('District Rank')
    ax.set_ylabel('Crime Count (Log Scale)')
    ax.set_title('District Crime Count by Rank')
    ax.set_yscale('log')
    ax.grid(True, alpha=0.3)

    # Tag the three busiest districts with the first ten characters of their name.
    for rank, (district, count) in enumerate(district_ranking.head(3).items(), start=1):
        tag_box = dict(boxstyle="round,pad=0.2", facecolor="yellow", alpha=0.7)
        ax.annotate(f'{district[:10]}...', (rank, count), xytext=(5, 5),
                    textcoords='offset points', fontsize=8, bbox=tag_box)

# 9. Crime Temporal Patterns by District
plt.subplot(4, 4, 9)

# Use an existing df['hour'] column when there is one; otherwise derive the
# hour from TIME OCC without modifying df. Previously the panel stayed blank
# unless some other cell had already added an 'hour' column to df.
if 'hour' in df.columns:
    hour_of_day = df['hour']
elif 'TIME OCC' in df.columns:
    # NOTE(review): assumes TIME OCC is a 24-hour HHMM number (e.g. 2250) - confirm.
    time_occ = pd.to_numeric(df['TIME OCC'], errors='coerce')
    hour_of_day = time_occ // 100
    hour_of_day = hour_of_day.where(hour_of_day.between(0, 23))  # drop impossible hours
else:
    hour_of_day = None

if 'AREA NAME' in df.columns and hour_of_day is not None:
    top_5_districts = df['AREA NAME'].value_counts().head(5).index
    in_top_5 = df['AREA NAME'].isin(top_5_districts)
    # Rows: district, columns: hour, values: number of records.
    district_hourly = (
        pd.DataFrame({'AREA NAME': df.loc[in_top_5, 'AREA NAME'],
                      'hour': hour_of_day[in_top_5]})
        .groupby(['AREA NAME', 'hour']).size().unstack(fill_value=0)
    )

    colors = plt.cm.tab10(np.linspace(0, 1, len(top_5_districts)))
    for i, district in enumerate(top_5_districts):
        if district in district_hourly.index:
            # Only mark the legend entry with '...' when the name was shortened.
            legend_name = district[:12] + '...' if len(district) > 12 else district
            plt.plot(district_hourly.columns, district_hourly.loc[district], 
                    label=legend_name, marker='o', markersize=3, 
                    color=colors[i], linewidth=2)

    plt.xlabel('Hour of Day')
    plt.ylabel('Crime Count')
    plt.title('Hourly Patterns (Top 5 Districts)')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
else:
    # Make the reason for an empty panel visible instead of leaving blank axes.
    plt.text(0.5, 0.5, 'No district or hour data\nfor hourly patterns', 
             ha='center', va='center', transform=plt.gca().transAxes,
             bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow"))
    plt.title('Hourly Patterns (Top 5 Districts)')

# Panel 10: sampled record locations for the five busiest districts
ax = plt.subplot(4, 4, 10)
if {'LAT', 'LON', 'AREA NAME'}.issubset(df.columns):
    # At most 5000 rows are drawn (fixed seed) to keep the scatter light.
    sample_size = min(5000, len(df))
    sample_df = df.sample(sample_size, random_state=42)

    top_districts = df['AREA NAME'].value_counts().head(5).index
    district_colors = plt.cm.Set1(np.linspace(0, 1, len(top_districts)))

    for district_color, district in zip(district_colors, top_districts):
        district_data = sample_df[sample_df['AREA NAME'] == district]
        if district_data.empty:
            # The sample may contain no rows for a district; skip it.
            continue
        ax.scatter(district_data['LON'], district_data['LAT'],
                   c=[district_color], alpha=0.6, s=15,
                   label=district[:15] + '...')

    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title('Geographic Distribution by District')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

# 11. District Performance Metrics
plt.subplot(4, 4, 11)
if 'AREA NAME' in df.columns and 'Part 1-2' in df.columns:
    # Severity ratio = share of a district's records that are Part 1.
    # Averaging the raw 1/2 codes (as before) gives a number between 1 and 2
    # that falls as Part 1 crimes rise, which contradicts the axis label.
    part1_flag = df['Part 1-2'].eq(1).astype(int)
    district_severity = part1_flag.groupby(df['AREA NAME']).agg(['count', 'mean']).reset_index()
    district_severity.columns = ['District', 'Total_Crimes', 'Severity_Ratio']
    # Keep the fifteen districts with the most records.
    district_severity = district_severity.sort_values('Total_Crimes', ascending=False).head(15)

    plt.scatter(district_severity['Total_Crimes'], district_severity['Severity_Ratio'], 
               s=100, alpha=0.7, c='red')

    # Tag every point with the first eight characters of its district name.
    for _, row in district_severity.iterrows():
        plt.annotate(row['District'][:8] + '...', 
                    (row['Total_Crimes'], row['Severity_Ratio']), 
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.xlabel('Total Crime Count')
    plt.ylabel('Severity Ratio (Part 1 Crimes)')
    plt.title('District Crime Volume vs Severity')
    plt.grid(True, alpha=0.3)

# Panel 12: the fifteen largest (area code, area name) combinations
ax = plt.subplot(4, 4, 12)
if 'AREA' in df.columns and 'AREA NAME' in df.columns:
    area_district_mapping = df.groupby(['AREA', 'AREA NAME']).size().reset_index(name='count')
    top_area_district = area_district_mapping.nlargest(15, 'count')
    bar_positions = range(len(top_area_district))

    ax.barh(bar_positions, top_area_district['count'], color='green', alpha=0.7)
    # Tick label: area code followed by the first fifteen characters of the name.
    combo_labels = [
        f"Area {area_code}: {area_name[:15]}..."
        for area_code, area_name in zip(top_area_district['AREA'], top_area_district['AREA NAME'])
    ]
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(combo_labels)
    ax.set_xlabel('Crime Count')
    ax.set_title('Top 15 Area-District Combinations')
    ax.invert_yaxis()

# 13-16: Summary Statistics and Additional Analysis
# Panel 13 is text-only: headline statistics of the per-district record counts.
plt.subplot(4, 4, 13)
plt.axis('off')

if 'AREA NAME' in df.columns:
    district_stats = df['AREA NAME'].value_counts()
    n_districts = len(district_stats)
    total_crimes = len(df)

    # Shares of all records held by the top quarter, top half and bottom
    # quarter of districts (quarter/half sizes are rounded down).
    top_quarter_share = district_stats.head(n_districts // 4).sum() / total_crimes * 100
    top_half_share = district_stats.head(n_districts // 2).sum() / total_crimes * 100
    bottom_quarter_share = district_stats.tail(n_districts // 4).sum() / total_crimes * 100

    # The tree connectors below are real box-drawing characters; they had been
    # stored as mis-decoded bytes and rendered as garbage characters.
    stats_summary = f"""
LA POLICE DISTRICT SUMMARY

Total Districts: {n_districts}
Total Crimes: {total_crimes:,}

Crime Statistics:
├─ Highest: {district_stats.iloc[0]:,} crimes
├─ Lowest: {district_stats.iloc[-1]:,} crimes  
├─ Average: {district_stats.mean():.0f} crimes
├─ Median: {district_stats.median():.0f} crimes
└─ Std Dev: {district_stats.std():.0f} crimes

District Workload Distribution:
├─ Top 25% districts handle:
│   {top_quarter_share:.1f}% of crimes
├─ Top 50% districts handle:
│   {top_half_share:.1f}% of crimes
└─ Bottom 25% districts handle:
    {bottom_quarter_share:.1f}% of crimes

Workload Inequality:
└─ Ratio (Highest/Lowest): 
    {district_stats.iloc[0]/district_stats.iloc[-1]:.1f}x difference
"""

    plt.text(0.05, 0.95, stats_summary, transform=plt.gca().transAxes, 
             fontsize=9, verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle="round,pad=0.4", facecolor="lightblue", alpha=0.8))

# Panel 14: how many districts fall into each crime-volume tier
ax = plt.subplot(4, 4, 14)
if 'AREA NAME' in df.columns:
    district_counts = df['AREA NAME'].value_counts()

    # Tier boundaries are the 40th and 80th percentiles of the per-district counts.
    lower_cut = district_counts.quantile(0.4)
    upper_cut = district_counts.quantile(0.8)
    high_priority = district_counts[district_counts > upper_cut]
    medium_priority = district_counts[(district_counts >= lower_cut) & (district_counts <= upper_cut)]
    low_priority = district_counts[district_counts < lower_cut]

    tiers = {
        'High Priority\n(Top 20%)': ('red', high_priority),
        'Medium Priority\n(Middle 40%)': ('orange', medium_priority),
        'Low Priority\n(Bottom 40%)': ('green', low_priority),
    }
    categories = list(tiers)
    colors = [tier_color for tier_color, _ in tiers.values()]
    counts = [len(tier_members) for _, tier_members in tiers.values()]

    ax.pie(counts, labels=categories, autopct='%1.0f%%', colors=colors, startangle=90)
    ax.set_title('Resource Allocation Priority\n(District Categories)')

# 15. Crime Trend Analysis
plt.subplot(4, 4, 15)

# Use an existing df['year'] column when there is one; otherwise derive the
# year from DATE OCC without modifying df. Previously the panel reported
# "insufficient temporal data" whenever no other cell had added a 'year'
# column, even though the occurrence date was available.
if 'year' in df.columns:
    crime_year = df['year']
elif 'DATE OCC' in df.columns:
    crime_year = pd.to_datetime(df['DATE OCC'], errors='coerce').dt.year
else:
    crime_year = None

# A trend needs at least two distinct (non-missing) years.
if 'AREA NAME' in df.columns and crime_year is not None and crime_year.nunique() > 1:
    top_5_districts = df['AREA NAME'].value_counts().head(5).index
    in_top_5 = df['AREA NAME'].isin(top_5_districts)
    # Rows: district, columns: year, values: number of records.
    yearly_trends = (
        pd.DataFrame({'AREA NAME': df.loc[in_top_5, 'AREA NAME'],
                      'year': crime_year[in_top_5]})
        .groupby(['AREA NAME', 'year']).size().unstack(fill_value=0)
    )

    for district in top_5_districts:
        if district in yearly_trends.index:
            # Only mark the legend entry with '...' when the name was shortened.
            legend_name = district[:12] + '...' if len(district) > 12 else district
            plt.plot(yearly_trends.columns, yearly_trends.loc[district], 
                    label=legend_name, marker='o', linewidth=2)

    plt.xlabel('Year')
    plt.ylabel('Crime Count')
    plt.title('Crime Trends by District (Top 5)')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Insufficient temporal data\nfor trend analysis', 
             ha='center', va='center', transform=plt.gca().transAxes,
             bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow"))
    plt.title('Crime Trends Analysis')

# 16. Key Insights Summary
# Text-only panel with fixed recommendations (not computed from the data).
plt.subplot(4, 4, 16)
plt.axis('off')

# The section markers and bullets had been stored as mis-decoded bytes and
# rendered as garbage characters. Bullets are restored to "•"; the section
# emoji are dropped because Matplotlib's default font generally has no emoji
# glyphs, so they would be drawn as empty boxes in the figure.
insights_text = """
KEY INSIGHTS & RECOMMENDATIONS

FOCUS AREAS (Why Top 15 Analysis):
• Pareto Principle applies to crime distribution
• Top 15 districts handle majority of crimes
• Limited resources require strategic focus
• Statistical significance in high-crime areas

RESOURCE ALLOCATION:
• HIGH PRIORITY: Top 20% districts
  - Enhanced patrol presence
  - Specialized crime units
  - Community engagement programs

• MEDIUM PRIORITY: Middle 40% districts  
  - Regular patrol schedules
  - Crime prevention programs
  - Inter-district coordination

• LOW PRIORITY: Bottom 40% districts
  - Maintenance patrols
  - Community policing focus
  - Crime prevention education

OPERATIONAL BENEFITS:
• Targeted deployment strategies
• Data-driven resource allocation  
• Performance benchmarking
• Inter-district comparison
"""

plt.text(0.05, 0.95, insights_text, transform=plt.gca().transAxes, 
         fontsize=9, verticalalignment='top',
         bbox=dict(boxstyle="round,pad=0.4", facecolor="lightgreen", alpha=0.8))

plt.tight_layout()
plt.show()

# Console report: districts grouped into crime-volume tiers, plus fixed
# recommendations. The emoji markers and bullets in the printed lines had been
# stored as mis-decoded bytes (printing as garbage characters); they are
# restored to the intended symbols.
print("\n" + "="*80)
print("STRATEGIC INSIGHTS FOR LAW ENFORCEMENT:")
print("="*80)

if 'AREA NAME' in df.columns:
    district_stats = df['AREA NAME'].value_counts()
    total_crimes = len(df)
    # Tier boundaries: 40th and 80th percentiles of the per-district counts.
    lower_cut = district_stats.quantile(0.4)
    upper_cut = district_stats.quantile(0.8)

    print("🎯 RESOURCE ALLOCATION PRIORITIES:")

    # High priority districts: counts above the 80th percentile
    high_priority = district_stats[district_stats > upper_cut]
    print(f"\n🔴 HIGH PRIORITY DISTRICTS ({len(high_priority)} districts):")
    print(f"   Handle {high_priority.sum()/total_crimes*100:.1f}% of all crimes")
    for i, (district, count) in enumerate(high_priority.head(10).items(), 1):
        print(f"   {i:2d}. {district:<30} {count:>6,} crimes")

    # Medium priority districts: counts between the two percentiles (inclusive)
    medium_priority = district_stats[(district_stats >= lower_cut) & 
                                   (district_stats <= upper_cut)]
    print(f"\n🟡 MEDIUM PRIORITY DISTRICTS ({len(medium_priority)} districts):")
    print(f"   Handle {medium_priority.sum()/total_crimes*100:.1f}% of all crimes")

    # Low priority districts: counts below the 40th percentile
    low_priority = district_stats[district_stats < lower_cut]
    print(f"\n🟢 LOW PRIORITY DISTRICTS ({len(low_priority)} districts):")
    print(f"   Handle {low_priority.sum()/total_crimes*100:.1f}% of all crimes")

    # The recommendations below are fixed text, not derived from the figures above.
    print("\n📈 ACTIONABLE RECOMMENDATIONS:")
    print("   • Deploy 60% of resources to HIGH priority districts")
    print("   • Deploy 30% of resources to MEDIUM priority districts")
    print("   • Deploy 10% of resources to LOW priority districts")
    print("   • Focus predictive modeling on top 15-20 districts")
    print("   • Implement district-specific crime prevention strategies")
    print("   • Establish inter-district resource sharing protocols")

print("\n✅ DISTRICT ANALYSIS COMPLETE!")
print("This comprehensive analysis enables data-driven police resource allocation and strategic planning.")