In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("Arrest_Data_from_2010_to_2019.csv")
data.head()

Unnamed: 0,Report ID,Arrest Date,Time,Area ID,Area Name,Reporting District,Age,Sex Code,Descent Code,Charge Group Code,Charge Group Description,Arrest Type Code,Charge,Charge Description,Address,Cross Street,Location
0,191811472,05/03/2019,1700.0,18,Southeast,1802,23,F,B,,,M,653.22 PC,,91ST,FIGUEROA,"(33.9543, -118.2827)"
1,5614161,04/29/2019,1040.0,8,West LA,842,41,M,H,3.0,Robbery,F,211PC,ROBBERY,11600 WILSHIRE BL,,"(34.0508, -118.4592)"
2,5615197,04/30/2019,615.0,6,Hollywood,663,27,M,O,5.0,Burglary,F,459PC,BURGLARY,LA BREA,LEXINGTON,"(34.0907, -118.3384)"
3,5615701,04/30/2019,1100.0,9,Van Nuys,901,2,F,H,,,D,300(B)WIC,,RAYMER,SEPULVEDA BL,"(34.2149, -118.4674)"
4,5615929,04/30/2019,1550.0,20,Olympic,2049,41,M,W,4.0,Aggravated Assault,F,245(A)(2)PC,ADW W/FIREARM,8TH ST,VERMONT,"(34.0578, -118.2916)"


Each row in dataframe represents the booking of an arrestee.

In [4]:
nrows = len(data)
print("Number of rows:", nrows)

Number of rows: 1310127


To find the bookings of arrestees in 2018: 

In [5]:
data['Arrest Date'] = pd.to_datetime(data['Arrest Date'], format='%m/%d/%Y')
data_2018 = data[data['Arrest Date'].dt.year == 2018]
data_2018

Unnamed: 0,Report ID,Arrest Date,Time,Area ID,Area Name,Reporting District,Age,Sex Code,Descent Code,Charge Group Code,Charge Group Description,Arrest Type Code,Charge,Charge Description,Address,Cross Street,Location
156,5213460,2018-01-24,1930.0,6,Hollywood,668,29,M,H,10.0,Fraud/Embezzlement,F,530.5(A)PC,GET CREDIT/ETC OTHER'S ID,ST ANDREWS,FOUNTAIN,"(34.0949, -118.3109)"
157,5257944,2018-03-15,310.0,7,Wilshire,743,33,F,O,1.0,Homicide,F,191.5(B)PC,VEH MANSLAUGHTR/OPER VESSEL DUI & DUR FEL,OLYMPIC,MASSELIN,"(34.0574, -118.3547)"
158,5328813,2018-06-01,1800.0,3,Southwest,356,44,F,B,16.0,Narcotic Drug Laws,F,11379(A)HS,TRANSPORT/SELL CONTROLLED SUBSTANCE,1600 W 36TH PL,,"(34.0219, -118.3061)"
159,5336331,2018-06-10,200.0,13,Newton,1364,23,M,H,12.0,Weapon (carry/poss),F,25400(A)2PC,CARRYING CONCEALED WEAPON UPON PERSON,1200 E 50TH ST,,"(33.998, -118.2543)"
160,5360731,2018-07-07,2330.0,18,Southeast,1832,32,M,B,12.0,Weapon (carry/poss),F,29800(A)1PC,POSS F/ARM BY CONVICTED FELON/ADDICT/ETC,10700 S MAIN ST,,"(33.9392, -118.2739)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290574,5509442,2018-12-31,1400.0,19,Mission,1963,36,M,H,8.0,Other Assaults,F,422(A)PC,TERRORIZE CAUSING FEAR,14700 PLUMMER ST,,"(34.2454, -118.4505)"
1290583,5509487,2018-12-31,1545.0,1,Central,155,55,M,B,8.0,Other Assaults,F,422(A)PC,TERRORIZE CAUSING FEAR,5TH ST,MAPLE,"(34.0454, -118.2466)"
1290588,5509344,2018-12-31,720.0,11,Northeast,1133,25,M,H,6.0,Larceny,M,490.2PC,PETTY THEFT,3100 GLENDALE BL,,"(34.1119, -118.2529)"
1290595,5509947,2018-12-31,2320.0,4,Hollenbeck,408,23,M,H,16.0,Narcotic Drug Laws,F,11378HS,POSSESSION CONTROLLED SUBSTANCE FOR SALE,HUNTINGTON,ROSEMEAD,"(34.0872, -118.176)"


In [6]:
nrows_2018 = len(data_2018)
print("Number of rows corresponding to 2018 bookings:", nrows_2018)

Number of rows corresponding to 2018 bookings: 104277


In [14]:
area_arrest_counts = data_2018.groupby('Area Name').size().reset_index(name='Arrest Count')
most_arrests_area = area_arrest_counts.loc[area_arrest_counts['Arrest Count'].idxmax()]

most_arrests_area_data = data_2018[data_2018['Area Name'] == most_arrests_area['Area Name']]
most_arrests_count = len(most_arrests_area_data)

print(f"The area with most arrests in 2018 is {most_arrests_area['Area Name']}.")
print(f"Number of arrests: {most_arrests_count}")

The area with most arrests in 2018 is Central.
Number of arrests: 10951


In [9]:
charge_groups = ['Vehicle Theft', 'Robbery', 'Burglary', 'Receive Stolen Property']
filtered_data = data[(data['Arrest Date'].dt.year == 2018) & (data['Charge Group Description'].isin(charge_groups))]
quantile_95 = filtered_data['Age'].quantile(0.95)

print("95% quantlie of age of arrestee in 2018:", quantile_95)

95% quantlie of age of arrestee in 2018: 52.0


In [19]:
minor_groups = ['Pre-Delinquency', 'Non-Criminal Detention']
filtered_data = data[
    (data['Arrest Date'].dt.year == 2018) &
    (~data['Charge Group Description'].isin(minor_groups)) &
    (data['Charge Group Description'].notnull())
]

mean_age = filtered_data['Age'].mean()
std_age = filtered_data['Age'].std()
grouped_age = filtered_data.groupby('Charge Group Description')['Age'].mean()

z_scores = (grouped_age - mean_age) / std_age
max_z = z_scores.abs().max()

print("Largest absolute Z-score:", max_z)

Largest absolute Z-score: 0.6934284004664495


In [18]:
bradbury_coords = (34.050536, -118.247861)
radius = 6371

filtered_data = data[
    (data['Arrest Date'].dt.year == 2018) &
    (data['Location'] != (0, 0))
]

def calculate_distance(lat1, lon1, lat2, lon2):
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_lambda = np.radians(lon2-lon1)
    distance = radius * np.arccos(np.sin(phi1)*np.sin(phi2)+np.cos(phi1)*np.cos(phi2)*np.cos(delta_lambda))
    return distance

count_2km = 0
for index, row in filtered_data.iterrows():
    arrest_coords = eval(row['Location'])
    distance = calculate_distance(bradbury_coords[0], bradbury_coords[1], arrest_coords[0], arrest_coords[1])
    if distance <= 2:
        count_2km += 1

print("Number of arrest incidents within 2 km of the Bradbury Building in 2018:", count_2km)

Number of arrest incidents within 2 km of the Bradbury Building in 2018: 11461


In [17]:
import ast
pico_data = data[
    (data['Arrest Date'].dt.year == 2018) & 
    (data['Address'].str.contains("Pico", case=False))
].copy()

pico_data['Coordinates'] = pico_data['Location'].apply(ast.literal_eval)
pico_data.loc[:, 'Latitude'] = pico_data['Coordinates'].apply(lambda x: float(x[0]))
pico_data.loc[:, 'Longitude'] = pico_data['Coordinates'].apply(lambda x: float(x[1]))

lat_mean = pico_data['Latitude'].mean()
lat_std = pico_data['Latitude'].std()
lon_mean = pico_data['Longitude'].mean()
lon_std = pico_data['Longitude'].std()

filtered_pico_data = pico_data[
    (pico_data['Latitude'] >= lat_mean - 2*lat_std) & 
    (pico_data['Latitude'] <= lat_mean + 2*lat_std) &
    (pico_data['Longitude'] >= lon_mean - 2*lon_std) & 
    (pico_data['Longitude'] <= lon_mean + 2*lon_std)
]

west = filtered_pico_data['Longitude'].min()
east = filtered_pico_data['Longitude'].max()
length_pico = abs(west - east)*111.32  

num_incidents = len(filtered_pico_data)
incidents_per_km = num_incidents / length_pico if length_pico > 0 else 0

print("Number of arrest incidents per kilometer on Pico Boulevard in 2018:", incidents_per_km)


Number of arrest incidents per kilometer on Pico Boulevard in 2018: 22.921900500614495


In [20]:
filtered_data = data[
    (data['Arrest Date'] < '2019-01-01') &
    (data['Charge Group Code'].notnull()) &
    (data['Charge Group Code'] != 99)
]

total_arrests = len(filtered_data)
charge_counts = filtered_data['Charge Group Code'].value_counts()

area_charge_counts = filtered_data.groupby(['Area ID', 'Charge Group Code']).size().reset_index(name='Area Count')
area_totals = filtered_data.groupby('Area ID').size().reset_index(name='Total Area Count')
area_data = pd.merge(area_charge_counts, area_totals, on='Area ID')

area_data['City Probability'] = area_data['Charge Group Code'].map(charge_counts/total_arrests)
area_data['Area Probability'] = area_data['Area Count'] / area_data['Total Area Count']
area_data['Ratio'] = area_data['Area Probability'] / area_data['City Probability']

top_ratios = area_data.nlargest(5, 'Ratio')
avg_top_ratios = top_ratios['Ratio'].mean()

print("Average:", avg_top_ratios)

Average: 3.515076379865202


In [21]:
from sklearn.linear_model import LinearRegression

felony_data = data[
    (data['Arrest Date'].dt.year >= 2010) &
    (data['Arrest Date'].dt.year <= 2018) &
    (data['Arrest Type Code'] == 'F') 
]
felony_counts = felony_data.groupby(felony_data['Arrest Date'].dt.year).size().reset_index(name='Count')

X = felony_counts['Arrest Date'].values.reshape(-1, 1) 
y = felony_counts['Count'].values 
model = LinearRegression()
model.fit(X, y)

predicted_2019 = model.predict(np.array([[2019]]))
print("Projected number:", predicted_2019)

Projected number: [31037.80555556]


Rounded to the nearest integer results in 31038
