# (1) Clean The CAD Data

In [152]:
#imports
import pandas as pd
from pathlib import Path

In [153]:
# Columns retained from the raw CAD export; everything else is discarded.
relevant_columns = [
    "IncidentNumber",
    "Call_Created_Time",
    "Call_Source",
    "InitialIncidentTypeDescription",
    "Disposition",
    "PrimaryUnitCallSign",
    "RespondingUnitCallSign",
    "IsPrimary",
    "Call_First_Dispatched_Time",
    "Call_First_On_Scene",
]

# Read the export and take an independent copy of the selected columns.
CAD_data = pd.read_csv("data/call_data_from_CAD.csv")[relevant_columns].copy()

# Parse the timestamp columns; values that cannot be parsed become NaT.
for time_column in ("Call_Created_Time", "Call_First_Dispatched_Time", "Call_First_On_Scene"):
    CAD_data[time_column] = pd.to_datetime(CAD_data[time_column], errors='coerce')

In [154]:
# Keep only calls created in 2017-2021 (both bounds inclusive). Rows whose
# creation time failed to parse (NaT) have no year and are dropped, exactly as
# with the explicit >= / <= comparison.
# .copy() makes the result an independent frame so that the column assignments
# performed on CAD_data afterwards do not raise SettingWithCopyWarning.
CAD_data = CAD_data[CAD_data["Call_Created_Time"].dt.year.between(2017, 2021)].copy()

### Standardize Cahoots identifiers 

In [155]:
# Regex alternation of every call sign / spelling that is rewritten to the
# single token 'CAHOOT' (each alternative also swallows trailing whitespace).
cahoots_identifiers = r"1J77\s*|3J79\s*|3J78\s*|3J77\s*|4J79\s*|3J81\s*|3J76\s*|2J28\s*|2J29\s*|CAHOOT\s*|CAHOT\s*|CAHO\s*"

# Apply the same substitution to both unit call-sign columns.
for callsign_column in ("PrimaryUnitCallSign", "RespondingUnitCallSign"):
    CAD_data[callsign_column] = CAD_data[callsign_column].replace(cahoots_identifiers, 'CAHOOT', regex=True)

# Flag a call as Cahoots-related (1) when either call sign is exactly 'CAHOOT', else 0.
primary_is_cahoots = CAD_data['PrimaryUnitCallSign'].eq('CAHOOT')
responding_is_cahoots = CAD_data['RespondingUnitCallSign'].eq('CAHOOT')
CAD_data['Cahoots_related'] = (primary_is_cahoots | responding_is_cahoots).astype(int)

### Remove disregards, duplicates, referrals, and cancellations

In [156]:
# Dispositions that are removed from the cleaned dataset, grouped by reason.
EXCLUDED_DISPOSITIONS = [
    # Disregards
    'DISREGARD',
    'DISREGARDED BY DISPATCH',
    'DISREGARDED BY PATROL SUPERVISOR',
    # Duplicate or no dispatch
    'ACCIDENTALLY CHOSE NEW EVENT',
    'MOTOR VEHICLE ACCIDENT - NO DISPATCH',
    'QUALITY OF LIFE - NO DISPATCH',
    'UNABLE TO DISPATCH',
    'WILL CALL BACK',
    # Cancellations
    'NO ACTION TAKEN',
    'CANCEL WHILE ENROUTE',
    'RESOLVED',
    'CANCELED REPORT NUMBER',
    'CANCEL FIRE UNIT FROM CALL',
    # Referrals and relays
    'REFERRED TO OTHER AGENCY',
    'RELAYED TO UNIVERSITY OF OREGON POLICE',
    'RELAYED TO OREGON STATE POLICE',
    'RELAYED TO LANE COUNTY SHERIFFS OFFICE',
    'RELAYED TO PARKING CONTROL',
]

# Drop all excluded dispositions in one pass instead of one full-frame filter
# per value. Rows with a missing Disposition are kept, matching the behaviour
# of the individual `!=` comparisons.
CAD_data = CAD_data[~CAD_data['Disposition'].isin(EXCLUDED_DISPOSITIONS)]

# Keep only calls that have both a primary and a responding unit call sign.
CAD_data = CAD_data[CAD_data["PrimaryUnitCallSign"].notna() & CAD_data["RespondingUnitCallSign"].notna()]

In [157]:
# Order the calls chronologically by creation time before writing them out.
CAD_data = CAD_data.sort_values(by="Call_Created_Time")

# Make sure the destination folder exists (no error if it is already there).
output_dir = Path("data/cleaned_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the cleaned calls to CSV without the DataFrame index.
output_path = output_dir.joinpath("cleaned_CAD_data.csv")
CAD_data.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to data\cleaned_data\cleaned_CAD_data.csv


# Create CAD Diversions Dataset

In [158]:
import pandas as pd

# Reload the cleaned calls written by the cleaning section.
# The path uses forward slashes: the previous backslash-separated literal
# contained invalid "\c" escape sequences (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
CAD_data = pd.read_csv("data/cleaned_data/cleaned_CAD_data.csv")

### Remove all call types with 0 overlap between Police and Cahoots

In [159]:
# Split the calls by the Cahoots involvement flag (1 = Cahoots, 0 = not).
cahoots_related = CAD_data.loc[CAD_data['Cahoots_related'].eq(1)]
police_handled = CAD_data.loc[CAD_data['Cahoots_related'].eq(0)]

# Distinct incident types seen in each group.
cahoots_types = cahoots_related['InitialIncidentTypeDescription'].unique()
police_types = police_handled['InitialIncidentTypeDescription'].unique()

# Incident types that appear in only one of the two groups.
never_cahoots_type = list(set(police_types).difference(cahoots_types))
never_police_type = list(set(cahoots_types).difference(police_types))

# Distinct dispositions seen in each group.
cahoots_disp = cahoots_related['Disposition'].unique()
police_disp = police_handled['Disposition'].unique()

# Dispositions that appear in only one of the two groups.
never_cahoots_disp = list(set(police_disp).difference(cahoots_disp))
never_police_disp = list(set(cahoots_disp).difference(police_disp))

# First drop calls whose incident type is exclusive to one group ...
one_sided_types = never_cahoots_type + never_police_type
CAD_data_diversions = CAD_data[~CAD_data['InitialIncidentTypeDescription'].isin(one_sided_types)]

# ... then drop calls whose disposition is exclusive to one group.
one_sided_disp = never_cahoots_disp + never_police_disp
CAD_data_diversions = CAD_data_diversions[~CAD_data_diversions['Disposition'].isin(one_sided_disp)]

### Remove call types below composite score threshold

In [160]:
from scipy.stats import zscore
import numpy as np

# An incident type is kept when its composite score lies more than this many
# standard deviations above the mean composite score.
COMPOSITE_Z_THRESHOLD = 1.5

# Calls per incident type in CAD_data_diversions: overall, Cahoots-related, and not Cahoots-related
total_calls = CAD_data_diversions['InitialIncidentTypeDescription'].value_counts()
cahoots_calls = CAD_data_diversions[CAD_data_diversions['Cahoots_related'] == 1]['InitialIncidentTypeDescription'].value_counts()
police_calls = CAD_data_diversions[CAD_data_diversions['Cahoots_related'] == 0]['InitialIncidentTypeDescription'].value_counts()

# Share of each incident type's calls that fall in each group
cahoots_proportion = cahoots_calls / total_calls
police_proportion = police_calls / total_calls

# Log-scale the raw counts
scaled_cahoots_calls = np.log(cahoots_calls)
scaled_police_calls = np.log(police_calls)

# Harmonic mean of the two proportions
harmonic_mean_proportion = 2 * (cahoots_proportion * police_proportion) / (cahoots_proportion + police_proportion + 1e-10)  # avoid division by zero

# Composite score: harmonic mean weighted by the summed log call counts
composite_scores = harmonic_mean_proportion * (scaled_cahoots_calls + scaled_police_calls)

# An incident type that has no Cahoots rows (or no non-Cahoots rows) left in
# CAD_data_diversions is missing from one of the count Series, so its composite
# score is NaN. zscore() propagates a single NaN to every element, which would
# make the threshold comparison False everywhere and silently produce an empty
# result. Normalise only the types that have a defined score.
scored_types = composite_scores.dropna()
normalized_composite_scores = zscore(scored_types.to_numpy())

# Incident types above the normalized composite score threshold
substantial_incident_types = scored_types[normalized_composite_scores > COMPOSITE_Z_THRESHOLD].index

# Keep only the calls belonging to those incident types
filtered_CAD_data_diversions = CAD_data_diversions[CAD_data_diversions['InitialIncidentTypeDescription'].isin(substantial_incident_types)]

In [161]:
# Write the filtered diversions dataset (index omitted) into the folder held in output_dir.
output_path = output_dir.joinpath("cleaned_CAD_diversions.csv")
filtered_CAD_data_diversions.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to data\cleaned_data\cleaned_CAD_diversions.csv


# Police Replication Dataset

In [162]:
import pandas as pd

# Start over from the raw CAD export, keeping every column this time.
raw_cad_csv = "data/call_data_from_CAD.csv"
CAD_data = pd.read_csv(raw_cad_csv)

### Convert timestamps to datetime

In [163]:
# Parse the creation timestamp (unparseable values become NaT) and derive the
# calendar year of each call from it.
created_at = pd.to_datetime(CAD_data['Call_Created_Time'], errors='coerce')
CAD_data["Call_Created_Time"] = created_at
CAD_data["year"] = created_at.dt.year

In [164]:
# Standardize Cahoots identifiers: every listed call sign / spelling (plus any
# trailing whitespace) is rewritten to the single token 'CAHOOT'.
# The substitution is applied once; this cell previously contained a second,
# verbatim copy of the same statements, which only repeated the work.
cahoots_identifiers = r"1J77\s*|3J79\s*|3J78\s*|3J77\s*|4J79\s*|3J81\s*|3J76\s*|2J28\s*|2J29\s*|CAHOOT\s*|CAHOT\s*|CAHO\s*"

CAD_data["PrimaryUnitCallSign"] = CAD_data["PrimaryUnitCallSign"].replace(cahoots_identifiers, 'CAHOOT', regex=True)
CAD_data["RespondingUnitCallSign"] = CAD_data["RespondingUnitCallSign"].replace(cahoots_identifiers, 'CAHOOT', regex=True)

# Create an identifier for Cahoots involvement: 1 when either call sign is
# exactly 'CAHOOT', otherwise 0.
CAD_data['Cahoots_related'] = ((CAD_data['PrimaryUnitCallSign'] == 'CAHOOT') | (CAD_data['RespondingUnitCallSign'] == 'CAHOOT')).astype(int)

In [165]:
# Discard the 'Unnamed: 0' column and order the calls by creation time.
data_cleaned = (
    CAD_data
    .drop(columns=['Unnamed: 0'])
    .sort_values(by="Call_Created_Time")
)

In [166]:
# Independent copy of the calls whose creation year is 2021.
created_in_2021 = data_cleaned["year"].eq(2021)
CAD_2021 = data_cleaned.loc[created_in_2021].copy()

In [167]:
from pathlib import Path

# Make sure the destination folder exists (no error if it is already there).
output_dir = Path("data/cleaned_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the full sorted dataset to CSV without the DataFrame index.
output_path = output_dir.joinpath("cleaned_CAD_data_diversions.csv")
data_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to data\cleaned_data\cleaned_CAD_data_diversions.csv


In [168]:
# Make sure the destination folder exists (no error if it is already there).
output_dir = Path("data/cleaned_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the 2021-only subset to CSV without the DataFrame index.
output_path = output_dir.joinpath("cleaned_CAD_data_2021_diversions.csv")
CAD_2021.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to data\cleaned_data\cleaned_CAD_data_2021_diversions.csv
