In [None]:
import random
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# Set up plotting style
# Both calls below change global plotting state, so they apply to every figure
# drawn later in the notebook. Their order is left as-is.
# Reset Matplotlib to its built-in "default" style sheet.
plt.style.use('default')
# Make seaborn's "husl" palette the default color cycle.
sns.set_palette("husl")
%matplotlib inline


In [None]:
# Read the raw crime records from disk and report what was loaded.
print("Loading crime data...")
df = pd.read_csv('crime_data.csv')

# A single print call emits the load report; the text itself is unchanged.
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
# Bare last expression so the notebook renders the preview as a rich table.
df.head()


In [None]:
# Basic dataset information: dimensions, memory footprint, per-column dtypes,
# and the structural summary produced by DataFrame.info().
banner = "=" * 50
print(banner, "DATASET INFORMATION", banner, sep="\n")

row_count, column_count = df.shape
print(f"Dataset shape: {row_count} rows, {column_count} columns")

# deep=True makes pandas inspect object columns; the byte total is shown in MiB.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

print("\nData types:", df.dtypes, sep="\n")
print("\nDataset info:")
df.info()


In [None]:
# Missing values analysis
# Builds a per-column table of missing counts/percentages, prints the columns
# that have at least one missing value, and draws the two bar charts side by side.
print("="*50)
print("MISSING VALUES ANALYSIS")
print("="*50)

missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)

# Only columns that actually have gaps are reported and plotted.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]

if columns_with_missing.empty:
    # Nothing to plot: an empty bar chart carries no information and can fail
    # inside pandas' plotting code.
    print("No missing values found.")
else:
    print(columns_with_missing)

    # Visualize missing values.
    # DataFrame.plot() opens a brand-new figure unless it is handed an Axes,
    # so both panels are created up front and passed in explicitly.
    fig, (count_ax, percent_ax) = plt.subplots(1, 2, figsize=(12, 6))

    columns_with_missing.plot(
        x='Column', y='Missing Count', kind='bar', rot=45, ax=count_ax
    )
    count_ax.set_title('Missing Values Count')

    columns_with_missing.plot(
        x='Column', y='Missing Percentage', kind='bar', rot=45, ax=percent_ax
    )
    percent_ax.set_title('Missing Values Percentage')

    fig.tight_layout()
    plt.show()


In [None]:
# Statistical summary: split the columns by dtype, list both groups, then print
# describe() for each non-empty group.
print("=" * 50, "STATISTICAL SUMMARY", "=" * 50, sep="\n")

numerical_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object']).columns

column_groups = (
    ("Numerical", numerical_cols),
    ("Categorical", categorical_cols),
)

# First list both groups, then print the statistics, matching the report order.
for group_label, group_cols in column_groups:
    print(f"{group_label} columns: {list(group_cols)}")

for group_label, group_cols in column_groups:
    if len(group_cols) > 0:
        print(f"\n{group_label} columns statistics:")
        print(df[group_cols].describe())


In [None]:
# Unique values analysis: report per-column cardinality and, for object
# columns, a short sample of the distinct values.
print("=" * 50, "UNIQUE VALUES ANALYSIS", "=" * 50, sep="\n")

total_rows = len(df)
for col in df.columns:
    series = df[col]
    unique_count = series.nunique()
    unique_percent = (unique_count / total_rows) * 100
    print(f"{col}: {unique_count} unique values ({unique_percent:.2f}%)")

    # Only object (string-like) columns get a sample; low-cardinality columns
    # show up to 10 values, the rest show 5.
    if series.dtype == 'object':
        if unique_count <= 20:
            print(f"  Sample values: {series.unique()[:10].tolist()}")
        else:
            print(f"  Sample values: {series.unique()[:5].tolist()} ... (showing first 5)")
    print()


In [None]:
# Data distribution visualizations
print("="*50)
print("DATA DISTRIBUTION VISUALIZATIONS")
print("="*50)

# Plot distributions for numerical columns: one histogram per column on a
# two-row grid that is at least two panels wide.
if len(numerical_cols) > 0:
    n_numerical = len(numerical_cols)
    fig, axes = plt.subplots(2, max(2, (n_numerical + 1) // 2), figsize=(15, 8))
    fig.suptitle('Distribution of Numerical Columns', fontsize=16)
    # The grid is always at least 2x2, so plt.subplots returns a 2-D array even
    # for a single numerical column. Flatten unconditionally: wrapping the array
    # in a list would make axes[0] an ndarray instead of an Axes.
    axes = axes.flatten()

    # The grid has at least n_numerical panels, so zip never drops a column.
    for ax, col in zip(axes, numerical_cols):
        ax.hist(df[col].dropna(), bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'{col} Distribution')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide unused subplots
    for ax in axes[n_numerical:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Bar charts of the most frequent values for up to six categorical columns,
# laid out on a fixed 2x3 grid.
if len(categorical_cols) > 0:
    n_categorical = min(len(categorical_cols), 6)  # Show max 6 categorical columns
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Top Categories in Categorical Columns', fontsize=16)
    axes = axes.flatten()

    for ax, col in zip(axes, categorical_cols[:n_categorical]):
        top_values = df[col].value_counts().head(10)
        bar_positions = range(len(top_values))

        ax.bar(bar_positions, top_values.values)
        ax.set_title(f'Top 10 values in {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        # Tick positions are set explicitly so each label lines up with its bar.
        ax.set_xticks(bar_positions)
        ax.set_xticklabels(top_values.index, rotation=45, ha='right')

    # Panels beyond the plotted columns are hidden rather than left blank.
    for ax in axes[n_categorical:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Correlation analysis for numerical columns: heatmap of the pairwise
# correlation matrix plus a list of pairs with |correlation| above 0.5.
print("=" * 50, "CORRELATION ANALYSIS", "=" * 50, sep="\n")

n_numeric = len(numerical_cols)

if n_numeric == 0:
    print("No numerical columns found for correlation analysis.")
elif n_numeric == 1:
    print(f"Only one numerical column found: {numerical_cols[0]}")
else:
    correlation_matrix = df[numerical_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .5},
    )
    plt.title('Correlation Matrix of Numerical Variables')
    plt.tight_layout()
    plt.show()

    print("\nHighly correlated pairs (|correlation| > 0.5):")
    # Walk only the upper triangle (j > i) so each pair is reported once and
    # the diagonal is skipped. NaN correlations fail the comparison and are omitted.
    matrix_cols = correlation_matrix.columns
    high_corr = [
        (matrix_cols[i], matrix_cols[j], correlation_matrix.iloc[i, j])
        for i in range(len(matrix_cols))
        for j in range(i + 1, len(matrix_cols))
        if abs(correlation_matrix.iloc[i, j]) > 0.5
    ]

    if not high_corr:
        print("No highly correlated pairs found.")
    else:
        for col1, col2, corr in high_corr:
            print(f"{col1} - {col2}: {corr:.3f}")


In [None]:
# Data quality assessment: duplicate rows, IQR-based outlier counts with
# boxplots, per-column completeness, and a closing summary.
section_rule = "=" * 50
print(section_rule, "DATA QUALITY ASSESSMENT", section_rule, sep="\n")

# Rows that repeat an earlier row across every column.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

# Outlier detection using the IQR rule: values beyond 1.5*IQR from the quartiles.
if len(numerical_cols) > 0:
    print("\nOutlier detection (using IQR method):")

    boxplot_cols = numerical_cols[:4]  # Show max 4 boxplots
    # squeeze=False always yields a 2-D array, so a single panel needs no
    # special-casing before iteration.
    fig, axes = plt.subplots(1, len(boxplot_cols), figsize=(15, 5), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, boxplot_cols):
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
        outliers = df.loc[is_outlier, col]
        print(f"{col}: {len(outliers)} outliers ({len(outliers)/len(df)*100:.2f}%)")

        ax.boxplot(df[col].dropna())
        ax.set_title(f'{col} Boxplot')
        ax.set_ylabel(col)

    plt.tight_layout()
    plt.show()

# Share of non-null values per column, highest first.
print("\nData completeness summary:")
completeness = ((df.count() / len(df)) * 100).round(2)
print(completeness.sort_values(ascending=False))

print("\n" + section_rule, "EXPLORATION COMPLETE", section_rule, sep="\n")
print(
    f"Dataset contains {len(df)} rows and {len(df.columns)} columns",
    f"Missing values: {df.isnull().sum().sum()} total",
    f"Duplicate rows: {duplicates}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    sep="\n",
)


In [None]:
# STEP 1: Drop the identified problematic columns and report the effect on
# shape and memory.
print("=" * 50, "COLUMN DROPPING & SAMPLING", "=" * 50, sep="\n")

columns_to_drop = ['Crm Cd 1', 'Crm Cd 3', 'Crm Cd 4', 'Crm Cd 2', 'Cross Street']
print(f"Dropping columns: {columns_to_drop}")

# drop() returns a new frame, so the raw `df` stays intact for the comparisons below.
df_cleaned = df.drop(columns_to_drop, axis="columns")
print(
    f"Original shape: {df.shape}",
    f"After dropping columns: {df_cleaned.shape}",
    sep="\n",
)

# Memory footprint in MiB before and after the drop.
original_memory, cleaned_memory = (
    frame.memory_usage(deep=True).sum() / 1024**2 for frame in (df, df_cleaned)
)
print(f"Memory reduction: {original_memory:.2f} MB ‚Üí {cleaned_memory:.2f} MB")

remaining_columns = df_cleaned.columns.tolist()
print(f"\nRemaining columns: {remaining_columns}")
print(f"Number of columns: {len(remaining_columns)}")


In [None]:
# STEP 2: Comprehensive Stratified Sampling (200k rows)
# Draws a stratified sample of `target_size` rows from df_cleaned so that the
# stratification column keeps its category proportions.
# `random` and `train_test_split` come from the import cell at the top.
print("\n" + "="*50)
print("STRATIFIED SAMPLING - 200K ROWS")
print("="*50)

# Set random seed for reproducibility (the split itself is pinned by random_state).
random.seed(42)
np.random.seed(42)

# Target sample size
# NOTE(review): the banner says 200K but the target is 201,000 rows - confirm
# which figure is intended.
target_size = 201000
# Capped at 1.0 so the report stays meaningful when the data is smaller than the target.
sample_ratio = min(1.0, target_size / len(df_cleaned))
print(f"Sampling ratio: {sample_ratio:.3f} ({sample_ratio*100:.1f}%)")

# Strategy: Use stratified sampling based on key variables to maintain distributions
# Choose stratification variable with good distribution
stratify_column = 'AREA NAME'  # Good categorical distribution (21 unique values)

if target_size >= len(df_cleaned):
    # train_test_split rejects an empty remainder, so when there is nothing to
    # discard the whole cleaned frame is the sample.
    df_sample = df_cleaned.copy()
else:
    # An integer train_size requests exactly target_size rows. Deriving a float
    # test_size from the ratio can land one row off because of rounding.
    df_sample, _ = train_test_split(
        df_cleaned,
        train_size=target_size,
        random_state=42,
        stratify=df_cleaned[stratify_column]
    )

print(f"Sample size: {len(df_sample)} rows")
print(f"Percentage of original data: {len(df_sample)/len(df_cleaned)*100:.2f}%")

# Memory comparison (MiB) between the cleaned frame and the sample.
sample_memory = df_sample.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {cleaned_memory:.2f} MB ‚Üí {sample_memory:.2f} MB")
print(f"Memory reduction: {(1-sample_memory/cleaned_memory)*100:.1f}%")


In [None]:
# STEP 3: Validate Sampling Quality - Ensure distributions are maintained
# Prints side-by-side comparisons of df_cleaned and df_sample: missing-value
# rates, the stratification column's category shares, and numerical means.
print("\n" + "="*50)
print("SAMPLING VALIDATION")
print("="*50)

# Compare missing value percentages
print("MISSING VALUES COMPARISON:")
print("Column" + " " * 15 + "Original %" + " " * 5 + "Sample %" + " " * 5 + "Difference")
print("-" * 60)

for col in df_sample.columns:
    orig_missing = (df_cleaned[col].isnull().sum() / len(df_cleaned)) * 100
    sample_missing = (df_sample[col].isnull().sum() / len(df_sample)) * 100
    diff = abs(orig_missing - sample_missing)
    print(f"{col:<20} {orig_missing:>8.2f}% {sample_missing:>10.2f}% {diff:>10.2f}%")

# Compare key categorical distributions
print(f"\nCATEGORICAL DISTRIBUTION COMPARISON:")
print(f"Stratification variable: {stratify_column}")
print("Category" + " " * 15 + "Original %" + " " * 5 + "Sample %" + " " * 5 + "Difference")
print("-" * 65)

orig_dist = df_cleaned[stratify_column].value_counts(normalize=True) * 100
sample_dist = df_sample[stratify_column].value_counts(normalize=True) * 100

for category in orig_dist.index:
    orig_pct = orig_dist[category]
    # A category absent from the sample counts as 0% instead of raising KeyError.
    sample_pct = sample_dist.get(category, 0.0)
    diff = abs(orig_pct - sample_pct)
    print(f"{category:<20} {orig_pct:>8.2f}% {sample_pct:>10.2f}% {diff:>10.2f}%")

# Compare basic statistics for numerical columns
print(f"\nNUMERICAL STATISTICS COMPARISON:")
numerical_cols_cleaned = df_sample.select_dtypes(include=[np.number]).columns
for col in numerical_cols_cleaned[:5]:  # Show first 5 numerical columns
    orig_mean = df_cleaned[col].mean()
    sample_mean = df_sample[col].mean()
    mean_gap = abs(orig_mean - sample_mean)
    if orig_mean != 0:
        # Divide by the magnitude of the original mean so columns with a
        # negative mean still report a non-negative relative difference.
        diff_pct = mean_gap / abs(orig_mean) * 100
    elif mean_gap == 0:
        diff_pct = 0.0
    else:
        # The relative difference is undefined against a zero mean.
        diff_pct = float("inf")

    print(f"{col}: Original mean={orig_mean:.2f}, Sample mean={sample_mean:.2f}, Diff={diff_pct:.2f}%")

# NOTE(review): the lines below are printed unconditionally; nothing above
# checks the deviations against a threshold.
print(f"\n‚úÖ SAMPLING SUCCESSFUL!")
print(f"‚úÖ Final dataset: {len(df_sample)} rows, {len(df_sample.columns)} columns")
print(f"‚úÖ Memory usage: {sample_memory:.2f} MB")
print(f"‚úÖ Distributions preserved with minimal deviation")


In [None]:
# STEP 4: Save the cleaned and sampled dataset, then print a closing summary.
print("\n" + "=" * 50, "SAVING FINAL DATASET", "=" * 50, sep="\n")

# The file name embeds the requested sample size; the index is not written.
output_filename = f'crime_data_sample_{target_size}.csv'
df_sample.to_csv(output_filename, index=False)
print(f"‚úÖ Dataset saved as: {output_filename}")

# Final summary
original_rows, original_cols = df.shape
sample_rows, sample_cols = df_sample.shape
summary_lines = [
    "\nüéØ FINAL DATASET SUMMARY:",
    f"üìä Original dataset: {original_rows:,} rows √ó {original_cols} columns",
    f"üóëÔ∏è  Dropped {len(columns_to_drop)} problematic columns",
    f"üì¶ Sampled dataset: {sample_rows:,} rows √ó {sample_cols} columns",
    f"üíæ Memory usage: {sample_memory:.2f} MB",
    f"üìà Data reduction: {(1 - sample_rows / original_rows) * 100:.1f}%",
    "üéØ Perfect for ML project - manageable size with preserved distributions!",
]
print("\n".join(summary_lines))

# Static readiness checklist; these lines are informational only.
readiness_lines = [
    "\nüìã Ready for ML Pipeline:",
    f"‚úì Clean dataset with {sample_cols} features",
    "‚úì No redundant columns",
    "‚úì Representative sample",
    "‚úì Manageable computational requirements",
    "‚úì Suitable for classification/regression tasks",
]
print("\n".join(readiness_lines))

# Show first few rows of final dataset
print("\nüîç Preview of final dataset:")
print(df_sample.head())


In [None]:
# ML Project Implementation Plan
# Summarizes three candidate prediction targets in df_sample: crime type,
# crime severity (Part 1-2), and weapon usage (derived from Weapon Desc).
def _position_ratio(counts, first, second):
    """Return counts.iloc[first] / counts.iloc[second], or NaN if a position is missing.

    Guards the ratio lines below against targets that have fewer classes than
    expected in the sample (for example a single class).
    """
    try:
        return counts.iloc[first] / counts.iloc[second]
    except IndexError:
        return float("nan")


print("="*60)
print("ML PROJECT IMPLEMENTATION ROADMAP")
print("="*60)

# Let's analyze the target variables for our ML problems
print("üéØ TARGET VARIABLE ANALYSIS:")
print("-" * 40)

# Problem 1: Crime Type Classification
print("1. CRIME TYPE CLASSIFICATION:")
print(f"   Target: Crm Cd Desc")
print(f"   Classes: {df_sample['Crm Cd Desc'].nunique()} unique crime types")
print(f"   Top 5 most common crimes:")
# value_counts() is sorted by descending frequency: first = most common class,
# last = rarest class.
crime_counts = df_sample['Crm Cd Desc'].value_counts()
top_crimes = crime_counts.head()
for crime, count in top_crimes.items():
    percentage = (count / len(df_sample)) * 100
    print(f"     - {crime}: {count} ({percentage:.1f}%)")

# The imbalance ratio compares the most common class with the rarest one over
# ALL classes, not just the five shown above.
print(f"\n   Class imbalance ratio: {_position_ratio(crime_counts, 0, -1):.1f}:1")
print(f"   Challenge level: HIGH (Multi-class with imbalance)")

# Problem 2: Crime Severity Prediction  
print(f"\n2. CRIME SEVERITY PREDICTION:")
print(f"   Target: Part 1-2")
severity_dist = df_sample['Part 1-2'].value_counts().sort_index()
print(f"   Classes: {len(severity_dist)} (Binary classification)")
for severity, count in severity_dist.items():
    percentage = (count / len(df_sample)) * 100
    severity_name = "Serious crimes" if severity == 1 else "Less serious crimes"
    print(f"     - Part {severity} ({severity_name}): {count} ({percentage:.1f}%)")

# Ratio of the first to the second class in index order.
balance_ratio = _position_ratio(severity_dist, 0, 1)
print(f"   Balance ratio: {balance_ratio:.2f}:1")
print(f"   Challenge level: MEDIUM (Binary with slight imbalance)")

# Problem 3: Weapon Usage Prediction
print(f"\n3. WEAPON USAGE PREDICTION:")
print(f"   Target: Weapon Used (derived from Weapon Desc)")
# A non-null weapon description is treated as "weapon used".
weapon_used = df_sample['Weapon Desc'].notna()
weapon_dist = weapon_used.value_counts()
print(f"   Classes: 2 (Binary classification)")
for used, count in weapon_dist.items():
    percentage = (count / len(df_sample)) * 100
    label = "Weapon used" if used else "No weapon"
    print(f"     - {label}: {count} ({percentage:.1f}%)")

# Majority-to-minority ratio (value_counts orders by frequency).
weapon_ratio = _position_ratio(weapon_dist, 0, 1)
print(f"   Balance ratio: {weapon_ratio:.2f}:1")
print(f"   Challenge level: HIGH (Highly imbalanced)")

print(f"\nüìä RECOMMENDATION: Start with Crime Type Classification")
print(f"   ‚úÖ Most complex and academically interesting")
print(f"   ‚úÖ Real-world practical importance")
print(f"   ‚úÖ Rich feature engineering opportunities")
print(f"   ‚úÖ Multiple evaluation metrics possible")

In [None]:
# Complete ML Pipeline Design: prints the planned preprocessing steps and the
# candidate models with their trade-offs. Everything here is static text.
print("\n" + "=" * 60, "COMPLETE ML PIPELINE DESIGN", "=" * 60, sep="\n")

print("üîß PREPROCESSING PIPELINE:")
print("-" * 30)
preprocessing_steps = [
    "1. Handle missing values (imputation strategies)",
    "2. Temporal feature extraction (hour, day, month, season)",
    "3. Categorical encoding (One-hot, Label encoding)",
    "4. Geographical feature engineering (area clustering)",
    "5. Numerical feature scaling (StandardScaler)",
    "6. Feature selection (importance-based)",
    "7. Handle class imbalance (SMOTE, class weights)"
]
print(*(f"   {step}" for step in preprocessing_steps), sep="\n")

print("\nü§ñ MACHINE LEARNING MODELS TO COMPARE:")
print("-" * 40)

# Each row is (model name, family, pros, cons); the dict keeps this order.
model_rows = [
    ("Random Forest", "Ensemble",
     "Handles missing values, feature importance, non-linear",
     "Can overfit with many features"),
    ("XGBoost", "Gradient Boosting",
     "High performance, handles imbalance, feature importance",
     "Requires hyperparameter tuning"),
    ("Logistic Regression", "Linear",
     "Fast, interpretable, good baseline",
     "Assumes linear relationships"),
    ("Support Vector Machine", "Kernel-based",
     "Good for high-dimensional data, non-linear kernels",
     "Slow on large datasets"),
    ("Neural Network", "Deep Learning",
     "Can capture complex patterns, flexible",
     "Requires more data, black box"),
    ("Naive Bayes", "Probabilistic",
     "Fast, works well with categorical data",
     "Assumes feature independence"),
]
models_info = {
    model_name: {"type": family, "pros": pros, "cons": cons}
    for model_name, family, pros, cons in model_rows
}

# Numbered entry per model, followed by a blank separator line.
for i, (model, info) in enumerate(models_info.items(), 1):
    print(
        f"{i}. {model} ({info['type']})",
        f"   ‚úÖ Pros: {info['pros']}",
        f"   ‚ùå Cons: {info['cons']}",
        "",
        sep="\n",
    )

# Three static bullet lists: evaluation metrics, intended research
# contributions, and the kinds of baseline studies to compare against.
evaluation_metrics = [
    "Accuracy (overall correctness)",
    "F1-Score (macro & weighted for imbalanced classes)",
    "Precision & Recall (per class)",
    "Confusion Matrix (error analysis)",
    "Classification Report (detailed per-class metrics)",
    "ROC-AUC (if converted to binary problems)",
    "Training Time (computational efficiency)",
    "Cross-validation (5-fold stratified)"
]

contributions = [
    "Compare multiple ML approaches on crime prediction",
    "Analyze feature importance for crime type prediction",
    "Handle real-world data challenges (missing values, imbalance)",
    "Evaluate temporal and spatial pattern significance",
    "Provide actionable insights for law enforcement"
]

baselines = [
    "Crime prediction using machine learning (recent papers)",
    "Temporal crime pattern analysis studies",
    "Spatial crime hotspot prediction research",
    "Ensemble methods for crime classification",
    "Imbalanced learning techniques in criminology"
]

# (heading, underline width, items) for each section, printed in this order.
bullet_sections = (
    ("üìä EVALUATION STRATEGY:", 25, evaluation_metrics),
    ("\nüéØ RESEARCH CONTRIBUTION:", 25, contributions),
    ("\nüìö BASELINE STUDIES TO COMPARE:", 35, baselines),
)

for heading, rule_width, items in bullet_sections:
    print(heading)
    print("-" * rule_width)
    for item in items:
        print(f"   ‚Ä¢ {item}")

# Planned paper outline followed by a static feasibility summary.
print("\nüéì ACADEMIC PAPER STRUCTURE:")
print("-" * 30)
paper_sections = [
    "1. Introduction (ÿ£ŸáŸÖŸäÿ© ÿßŸÑÿ™ŸÜÿ®ÿ§ ÿ®ÿßŸÑÿ¨ÿ±ÿßÿ¶ŸÖ)",
    "2. Literature Review (ÿßŸÑÿØÿ±ÿßÿ≥ÿßÿ™ ÿßŸÑÿ≥ÿßÿ®ŸÇÿ©)",
    "3. Dataset Description (ŸàÿµŸÅ ÿßŸÑÿ®ŸäÿßŸÜÿßÿ™)",
    "4. Methodology (ÿßŸÑŸÖŸÜŸáÿ¨Ÿäÿ© ŸàÿßŸÑŸÜŸÖÿßÿ∞ÿ¨)",
    "5. Experiments & Results (ÿßŸÑÿ™ÿ¨ÿßÿ±ÿ® ŸàÿßŸÑŸÜÿ™ÿßÿ¶ÿ¨)",
    "6. Discussion & Analysis (ÿßŸÑÿ™ÿ≠ŸÑŸäŸÑ ŸàÿßŸÑŸÖŸÜÿßŸÇÿ¥ÿ©)",
    "7. Conclusion (ÿßŸÑÿÆŸÑÿßÿµÿ© ŸàÿßŸÑÿ™ŸàÿµŸäÿßÿ™)"
]
print(*(f"   {section}" for section in paper_sections), sep="\n")

# Closing assessment; the text is fixed and not derived from the data.
feasibility_lines = (
    "\n‚úÖ PROJECT FEASIBILITY: EXCELLENT",
    "   üìä Rich dataset with real-world complexity",
    "   üéØ Clear problem definition and practical importance",
    "   üîß Multiple technical challenges to address",
    "   üìà Strong potential for meaningful results",
    "   üìö Sufficient literature for comparison",
)
print("\n".join(feasibility_lines))