# California Arrest Data Analysis

This notebook analyzes California arrest data for 2009 and 2018 to answer questions about:
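- Average county-level felony drug arrest rates (arrests per 1,000 residents) in 2009 and 2018
- Percentage changes in felony drug and violent arrest rates between 2009 and 2018
- A difference-in-difference comparison of violent arrest rate changes between counties with high and low 2009 felony drug arrest rates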

## Data Sources:
- `ca_arrests_2009.csv`: 2009 California arrest data by county
- `ca_arrests_2018.csv`: 2018 California arrest data by county  
- `nhgis_county_populations.csv`: County population data

In [None]:
# Import required libraries
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide data-quality warnings (for example pandas' DtypeWarning
# raised by read_csv on mixed-type columns), which matter for this analysis.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

print("Libraries imported successfully!")

In [None]:
# Load the datasets
# All three CSVs are read from one directory. Set the CA_ARRESTS_DATA_DIR
# environment variable to point at your copy of the data; when it is unset,
# the original author's location is used so existing runs keep working.
DATA_DIR = Path(os.environ.get('CA_ARRESTS_DATA_DIR', r'D:\JAVA\CODE\PYTHON\ML'))

arrests_2009 = pd.read_csv(DATA_DIR / 'ca_arrests_2009.csv')
arrests_2018 = pd.read_csv(DATA_DIR / 'ca_arrests_2018.csv')
populations = pd.read_csv(DATA_DIR / 'nhgis_county_populations.csv')

print("Data loaded successfully!")
print(f"2009 arrests data shape: {arrests_2009.shape}")
print(f"2018 arrests data shape: {arrests_2018.shape}")
print(f"Population data shape: {populations.shape}")

# Display first few rows to understand the data structure
print("\n2009 Arrests Data Sample:")
print(arrests_2009.head())

print("\n2018 Arrests Data Sample:")
print(arrests_2018.head())

print("\nPopulation Data Sample:")
print(populations.head())

In [None]:
# Data preparation and filtering for California


def _california_rows(pop: pd.DataFrame, period: str) -> pd.DataFrame:
    """Return a copy of the rows of `pop` with STATE == 'California' and YEAR == `period`.

    Raises:
        ValueError: if no rows match. An empty result would otherwise flow
            silently through the later inner merges and turn every answer
            into NaN.
    """
    subset = pop[(pop['STATE'] == 'California') & (pop['YEAR'] == period)].copy()
    if subset.empty:
        raise ValueError(
            f"No California rows found for YEAR == {period!r}; "
            "check the STATE and YEAR values in the population file."
        )
    return subset


def _clean_county_name(names: pd.Series) -> pd.Series:
    """Strip surrounding whitespace and a trailing ' County' suffix from county names.

    The pattern is anchored to the end of the name and `regex=True` is passed
    explicitly, so the result does not depend on the pandas version's default
    for `str.replace` and text in the middle of a name is never removed.
    """
    return names.str.strip().str.replace(r'\s+County$', '', regex=True)


# Filter population data for California (2009 and 2018 periods)
ca_pop_2009 = _california_rows(populations, '2005-2009')
ca_pop_2018 = _california_rows(populations, '2014-2018')

print("California population data filtered:")
print(f"CA 2009 population data: {ca_pop_2009.shape}")
print(f"CA 2018 population data: {ca_pop_2018.shape}")

# Clean county names for matching (remove 'County' suffix if present).
# The same cleaner is applied to all four frames so the merge keys agree.
for county_frame in (arrests_2009, arrests_2018, ca_pop_2009, ca_pop_2018):
    county_frame['COUNTY_CLEAN'] = _clean_county_name(county_frame['COUNTY'])

print("\nCounty names cleaned for matching")
print("Sample county names in arrests data:", arrests_2009['COUNTY_CLEAN'].head().tolist())
print("Sample county names in population data:", ca_pop_2009['COUNTY_CLEAN'].head().tolist())

In [None]:
# Merge arrest data with population data


def _attach_population(arrests: pd.DataFrame, county_pop: pd.DataFrame, year: int) -> pd.DataFrame:
    """Inner-join `arrests` to the `total_population` column of `county_pop` on COUNTY_CLEAN.

    The join result is the same as a plain inner merge. In addition, two
    conditions that would silently distort county-level averages are printed
    as warnings (labelled with `year`):
      * arrest rows whose county has no population match (dropped by the join);
      * duplicate county keys in the population frame (each duplicate would
        repeat the matching arrest rows).
    """
    pop_keys = county_pop['COUNTY_CLEAN']

    unmatched = arrests.loc[~arrests['COUNTY_CLEAN'].isin(pop_keys), 'COUNTY_CLEAN']
    if not unmatched.empty:
        print(f"WARNING ({year}): arrest rows without a population match are dropped: "
              f"{unmatched.unique().tolist()}")

    repeated = pop_keys[pop_keys.duplicated()]
    if not repeated.empty:
        print(f"WARNING ({year}): population data has duplicate county keys: "
              f"{repeated.unique().tolist()}")

    return arrests.merge(county_pop[['COUNTY_CLEAN', 'total_population']],
                         on='COUNTY_CLEAN', how='inner')


def _add_rates_per_1000(merged: pd.DataFrame) -> pd.DataFrame:
    """Add `felony_drug_rate` and `violent_rate` (arrests per 1,000 population) to `merged`.

    The columns are added to the frame that is passed in, which is also returned.
    """
    for rate_col, count_col in (('felony_drug_rate', 'F_DRUGOFF'), ('violent_rate', 'VIOLENT')):
        merged[rate_col] = (merged[count_col] / merged['total_population']) * 1000
    return merged


# For 2009
merged_2009 = _attach_population(arrests_2009, ca_pop_2009, 2009)

# For 2018
merged_2018 = _attach_population(arrests_2018, ca_pop_2018, 2018)

print("Data merged successfully!")
print(f"Merged 2009 data shape: {merged_2009.shape}")
print(f"Merged 2018 data shape: {merged_2018.shape}")

# Calculate arrest rates per 1,000 population
merged_2009 = _add_rates_per_1000(merged_2009)
merged_2018 = _add_rates_per_1000(merged_2018)

print("\nArrest rates calculated (per 1,000 population)")
print("2009 sample rates:")
print(merged_2009[['COUNTY', 'felony_drug_rate', 'violent_rate']].head())

print("\n2018 sample rates:")
print(merged_2018[['COUNTY', 'felony_drug_rate', 'violent_rate']].head())

## Question 1: Average County-Level Felony Drug Arrest Rate for 2009

In [None]:
# Question 1: Average county-level felony drug arrest rate for 2009 (in arrests per 1,000)
# Unweighted mean across counties: every county counts equally regardless of population.
drug_rates_2009 = merged_2009['felony_drug_rate']
avg_felony_drug_2009 = drug_rates_2009.mean()
answer_q1 = f"{avg_felony_drug_2009:.3g}"

print(f"Average felony drug arrest rate for 2009: {avg_felony_drug_2009}")
print(f"Rounded to 3 significant figures: {answer_q1}")

# Distribution summary, shown only as a sanity check on the mean
median_2009 = drug_rates_2009.median()
lowest_2009 = drug_rates_2009.min()
highest_2009 = drug_rates_2009.max()
print("\n".join([
    f"\nStatistics for 2009 felony drug arrest rates:",
    f"Mean: {avg_felony_drug_2009:.6f}",
    f"Median: {median_2009:.6f}",
    f"Min: {lowest_2009:.6f}",
    f"Max: {highest_2009:.6f}",
]))

print(f"\n*** ANSWER TO QUESTION 1: {answer_q1} ***")

## Question 2: Average County-Level Felony Drug Arrest Rate for 2018

In [None]:
# Question 2: Average county-level felony drug arrest rate for 2018 (in arrests per 1,000)
# Unweighted mean across counties: every county counts equally regardless of population.
drug_rates_2018 = merged_2018['felony_drug_rate']
avg_felony_drug_2018 = drug_rates_2018.mean()
answer_q2 = f"{avg_felony_drug_2018:.3g}"

print(f"Average felony drug arrest rate for 2018: {avg_felony_drug_2018}")
print(f"Rounded to 3 significant figures: {answer_q2}")

# Distribution summary, shown only as a sanity check on the mean
median_2018 = drug_rates_2018.median()
lowest_2018 = drug_rates_2018.min()
highest_2018 = drug_rates_2018.max()
print("\n".join([
    f"\nStatistics for 2018 felony drug arrest rates:",
    f"Mean: {avg_felony_drug_2018:.6f}",
    f"Median: {median_2018:.6f}",
    f"Min: {lowest_2018:.6f}",
    f"Max: {highest_2018:.6f}",
]))

print(f"\n*** ANSWER TO QUESTION 2: {answer_q2} ***")

## Questions 3 & 4: Percentage Changes in Arrest Rates (2009 to 2018)

In [None]:
# Merge 2009 and 2018 data for percentage change calculations
# Only counties present in both years are kept (inner join); the rate columns
# get a _2009 / _2018 suffix.
rate_columns = ['COUNTY_CLEAN', 'felony_drug_rate', 'violent_rate']
change_data = merged_2009[rate_columns].merge(
    merged_2018[rate_columns],
    on='COUNTY_CLEAN',
    suffixes=('_2009', '_2018'),
    how='inner'
)

print(f"Counties with data for both years: {len(change_data)}")

# Calculate percentage changes: (rate_2018 - rate_2009) / rate_2009 * 100
for measure in ('felony_drug', 'violent'):
    baseline = change_data[f'{measure}_rate_2009']
    pct_change = ((change_data[f'{measure}_rate_2018'] - baseline) / baseline) * 100

    # A 2009 rate of zero makes the percentage change +/-inf, and a single inf
    # turns every downstream mean into inf. Such counties have no defined
    # percentage change, so they are set to NaN (skipped by .mean()) and reported.
    undefined = np.isinf(pct_change)
    if undefined.any():
        print(f"WARNING: {measure} percentage change is undefined (2009 rate is 0) for: "
              f"{change_data.loc[undefined, 'COUNTY_CLEAN'].tolist()}")
    change_data[f'{measure}_pct_change'] = pct_change.mask(undefined)

print("\nPercentage changes calculated")
print(change_data[['COUNTY_CLEAN', 'felony_drug_pct_change', 'violent_pct_change']].head())

In [None]:
# Question 3: Average percentage change in felony drug arrest rate between 2009 and 2018
# Mean of the per-county percentage changes (not the change in the mean rate).
drug_pct_changes = change_data['felony_drug_pct_change']
avg_felony_drug_change = drug_pct_changes.mean()
answer_q3 = f"{avg_felony_drug_change:.3g}"

print(f"Average percentage change in felony drug arrest rate (2009-2018): {avg_felony_drug_change}")
print(f"Rounded to 3 significant figures: {answer_q3}%")

# Distribution summary, shown only as a sanity check on the mean
drug_change_median = drug_pct_changes.median()
drug_change_min = drug_pct_changes.min()
drug_change_max = drug_pct_changes.max()
print("\n".join([
    f"\nStatistics for felony drug arrest rate changes:",
    f"Mean: {avg_felony_drug_change:.6f}%",
    f"Median: {drug_change_median:.6f}%",
    f"Min: {drug_change_min:.6f}%",
    f"Max: {drug_change_max:.6f}%",
]))

print(f"\n*** ANSWER TO QUESTION 3: {answer_q3} ***")

In [None]:
# Question 4: Average percentage change in violent crime arrest rate between 2009 and 2018
# Mean of the per-county percentage changes (not the change in the mean rate).
violent_pct_changes = change_data['violent_pct_change']
avg_violent_change = violent_pct_changes.mean()
answer_q4 = f"{avg_violent_change:.3g}"

print(f"Average percentage change in violent arrest rate (2009-2018): {avg_violent_change}")
print(f"Rounded to 3 significant figures: {answer_q4}%")

# Distribution summary, shown only as a sanity check on the mean
violent_change_median = violent_pct_changes.median()
violent_change_min = violent_pct_changes.min()
violent_change_max = violent_pct_changes.max()
print("\n".join([
    f"\nStatistics for violent arrest rate changes:",
    f"Mean: {avg_violent_change:.6f}%",
    f"Median: {violent_change_median:.6f}%",
    f"Min: {violent_change_min:.6f}%",
    f"Max: {violent_change_max:.6f}%",
]))

print(f"\n*** ANSWER TO QUESTION 4: {answer_q4} ***")

## Questions 5-7: Difference-in-Difference Analysis
For these questions, we classify counties into "high" and "low" groups by their 2009 felony drug arrest rate (split at the median, with counties at the median counted as "high") and compare the average percentage change in violent arrest rates between the two groups.

In [None]:
# Difference-in-Difference Analysis Setup
# Split counties into two groups around the median 2009 felony drug arrest rate.
baseline_drug_rate = change_data['felony_drug_rate_2009']
median_drug_2009 = baseline_drug_rate.median()

print(f"Median felony drug arrest rate in 2009: {median_drug_2009:.3f}")

# Counties at or above the median are 'high'; everything else is 'low'
# (a missing rate compares False against the median, so it also lands in 'low').
at_or_above_median = baseline_drug_rate >= median_drug_2009
change_data['drug_group'] = at_or_above_median.map({True: 'high', False: 'low'})

# Group sizes
group_counts = change_data['drug_group'].value_counts()
print(f"\nGroup distribution:")
print(f"Low 2009 drug arrest rate counties: {group_counts['low']}")
print(f"High 2009 drug arrest rate counties: {group_counts['high']}")

# A few example counties from each group
sample_columns = ['COUNTY_CLEAN', 'felony_drug_rate_2009']

print(f"\nSample of low drug arrest rate counties (2009):")
low_sample = change_data.loc[change_data['drug_group'] == 'low', sample_columns].head()
print(low_sample)

print(f"\nSample of high drug arrest rate counties (2009):")
high_sample = change_data.loc[change_data['drug_group'] == 'high', sample_columns].head()
print(high_sample)

In [None]:
# Questions 5 & 6: mean percentage change in violent arrest rates within each
# 2009 drug-arrest-rate group.
in_low_group = change_data['drug_group'] == 'low'
in_high_group = change_data['drug_group'] == 'high'

# Question 5: LOW 2009 drug arrest rate counties
low_violent_change = change_data.loc[in_low_group, 'violent_pct_change'].mean()

print(f"Average violent arrest rate change for LOW 2009 drug counties: {low_violent_change}")
print(f"Rounded to 3 significant figures: {low_violent_change:.3g}%")

print(f"\n*** ANSWER TO QUESTION 5: {low_violent_change:.3g} ***")

# Question 6: HIGH 2009 drug arrest rate counties
high_violent_change = change_data.loc[in_high_group, 'violent_pct_change'].mean()

print(f"\nAverage violent arrest rate change for HIGH 2009 drug counties: {high_violent_change}")
print(f"Rounded to 3 significant figures: {high_violent_change:.3g}%")

print(f"\n*** ANSWER TO QUESTION 6: {high_violent_change:.3g} ***")

# Group means next to group sizes, for verification
low_count = in_low_group.sum()
high_count = in_high_group.sum()
print(f"\nDetailed statistics:")
print(f"Low drug group violent change - Mean: {low_violent_change:.6f}%, Count: {low_count}")
print(f"High drug group violent change - Mean: {high_violent_change:.6f}%, Count: {high_count}")

In [None]:
# Question 7: Difference-in-Difference estimate
# Computed as (mean violent % change in the high group) minus
# (mean violent % change in the low group).
did_estimate = high_violent_change - low_violent_change

print("\n".join([
    f"Difference-in-Difference Calculation:",
    f"High drug group violent change: {high_violent_change:.6f}%",
    f"Low drug group violent change: {low_violent_change:.6f}%",
    f"Difference (High - Low): {did_estimate:.6f}%",
    f"Rounded to 3 significant figures: {did_estimate:.3g}%",
]))

print(f"\n*** ANSWER TO QUESTION 7: {did_estimate:.3g} ***")

# Summary of all answers, each shown to 3 significant figures
banner = "=" * 50
summary_rows = [
    ("Question 1 - Average felony drug arrest rate 2009", avg_felony_drug_2009),
    ("Question 2 - Average felony drug arrest rate 2018", avg_felony_drug_2018),
    ("Question 3 - Average % change in felony drug arrests", avg_felony_drug_change),
    ("Question 4 - Average % change in violent arrests", avg_violent_change),
    ("Question 5 - Violent change for low drug counties", low_violent_change),
    ("Question 6 - Violent change for high drug counties", high_violent_change),
    ("Question 7 - Difference-in-Difference estimate", did_estimate),
]

print("\n" + banner)
print("SUMMARY OF ALL ANSWERS")
print(banner)
for description, answer in summary_rows:
    print(f"{description}: {answer:.3g}")
print(banner)