# Notebook for updating police incidents daily

In [5]:
import pandas as pd
import requests
import requests_cache
import time
from io import StringIO
from datetime import datetime, timedelta
import json

pd.set_option('display.max_rows', 1000) 
pd.set_option('display.max_columns', 1000)
pd.set_option("display.max_colwidth", None)

## Getting all crime data from the previous day

#### Right now I haven't handled duplicates. Simple fix later, once we decide how to

- request only the previous day's crimes. 
- update according to the date
- concat to the existing df, make sure there aren't any duplicates

In [53]:
# load our master dataset of crimes
crimes_df = pd.read_csv("crimes_updated.csv")
crimes_df.head(2)

Unnamed: 0,row_id,incident_datetime,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,incident_id,incident_number,cad_number,report_type_code,report_type_description,filed_online,incident_code,incident_category,incident_subcategory,incident_description,resolution,intersection,cnn,police_district,analysis_neighborhood,supervisor_district,supervisor_district_2012,latitude,longitude,point,data_as_of,data_loaded_at
0,150744327170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,27170,Other Miscellaneous,Other,"Resisting, Delaying, or Obstructing Peace Officer Duties",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
1,150728309320,2025-08-25T08:40:00.000,2025-08-25T00:00:00.000,08:40,2025,Monday,2025-08-26T10:03:00.000,1507283,250478015,252381142.0,II,Initial,,9320,Fraud,Fraud,"Access Card, incl. Credit, Phone, ATM, Fraudulent Use of",Open or Active,MARKET ST \ SOUTH VAN NESS AVE \ VAN NESS AVE,30748000,Southern,Mission,6,6,37.775146,-122.419258,POINT (-122.419258118 37.775146484),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000


In [54]:
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

# debugging for a couple weeks, can remove later
print(f"Fetching data for: {yesterday}")

Fetching data for: 2025-11-18


In [55]:
url = "https://data.sfgov.org/resource/wg3w-h783.csv"

In [56]:
# parameters for the url. Easier if we filter in the request itself because of the sheer volume of reports. 
# filters with date range for yesterday

filters = {
    'analysis_neighborhood': 'Mission',
    'report_type_description': 'Initial',
    '$where': f"incident_date >= '{yesterday}T00:00:00' AND incident_date < '{yesterday}T23:59:59'"
}

In [57]:
response = requests.get(url, params=filters)

In [58]:
if response.status_code == 200:
    
    new_data = pd.read_csv(StringIO(response.text))
    print("New records fetched:", len(new_data))

else:
    print(f"Error: {response.status_code}")

New records fetched: 30


In [60]:
# concat
crimes_df = pd.concat([crimes_df, new_data], ignore_index=True)

# remove duplicates
# crimes_df = crimes_df.drop_duplicates(subset=['incident_id'], keep='first')

In [None]:
# Save updated data
crimes_df.to_csv('crimes_updated.csv', index=False)
crimes_df.head()

##### @Kelly: Thoughts on how we should deal with similar incident numbers? See below for one incident being cited for multiple violations.

In [41]:
crimes_df[crimes_df["incident_id"]== 1507443]

Unnamed: 0,row_id,incident_datetime,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,incident_id,incident_number,cad_number,report_type_code,report_type_description,filed_online,incident_code,incident_category,incident_subcategory,incident_description,resolution,intersection,cnn,police_district,analysis_neighborhood,supervisor_district,supervisor_district_2012,latitude,longitude,point,data_as_of,data_loaded_at
0,150744327170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,27170,Other Miscellaneous,Other,"Resisting, Delaying, or Obstructing Peace Officer Duties",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
32,150744363010,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,63010,Warrant,Other,"Warrant Arrest, Local SF Warrant",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
38,150744326170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,26170,Other Miscellaneous,Other,Probation Violation,Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
2200,150744327170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,27170,Other Miscellaneous,Other,"Resisting, Delaying, or Obstructing Peace Officer Duties",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
2230,150744363010,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,63010,Warrant,Other,"Warrant Arrest, Local SF Warrant",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
2236,150744326170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,26170,Other Miscellaneous,Other,Probation Violation,Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000


## Add code for restricting geography

In [3]:
crimes_df = pd.read_csv("crimes_updated.csv")
crimes_df.head(2)

Unnamed: 0,row_id,incident_datetime,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,incident_id,incident_number,cad_number,report_type_code,report_type_description,filed_online,incident_code,incident_category,incident_subcategory,incident_description,resolution,intersection,cnn,police_district,analysis_neighborhood,supervisor_district,supervisor_district_2012,latitude,longitude,point,data_as_of,data_loaded_at
0,150744327170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,27170,Other Miscellaneous,Other,"Resisting, Delaying, or Obstructing Peace Officer Duties",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
1,150728309320,2025-08-25T08:40:00.000,2025-08-25T00:00:00.000,08:40,2025,Monday,2025-08-26T10:03:00.000,1507283,250478015,252381142.0,II,Initial,,9320,Fraud,Fraud,"Access Card, incl. Credit, Phone, ATM, Fraudulent Use of",Open or Active,MARKET ST \ SOUTH VAN NESS AVE \ VAN NESS AVE,30748000,Southern,Mission,6,6,37.775146,-122.419258,POINT (-122.419258118 37.775146484),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000


## Calendar Map

In [9]:
pwd

'/Users/anushasubramanian/Desktop/Mission_Local/2025-graphics/docs/16th-dashboard'

In [12]:
# Convert incident_date to datetime
crimes_df['incident_date'] = pd.to_datetime(crimes_df['incident_date'])

# Filter to last 6 months
from datetime import datetime, timedelta
six_months_ago = datetime.now() - timedelta(days=180)
crimes_df = crimes_df[crimes_df['incident_date'] >= six_months_ago]

# Replace NaN in incident_category with "Unknown"
crimes_df['incident_category'] = crimes_df['incident_category'].fillna('Unknown')

# Group by date and get crime categories for each day
calendar_export = crimes_df.groupby('incident_date').agg({
    'incident_category': list,
}).reset_index()

# Add count of incidents
calendar_export['count'] = calendar_export['incident_category'].apply(len)

calendar_export['incident_date'] = calendar_export['incident_date'].dt.strftime('%Y-%m-%d')

calendar_export = calendar_export.rename(columns={
    'incident_date': 'date',
    'incident_category': 'types'
})

# Reorder columns
calendar_export = calendar_export[['date', 'count', 'types']]

calendar_data = calendar_export.to_dict('records')

# Save as json
with open('calendar_map/calendar_data.json', 'w') as f:
    json.dump(calendar_data, f, indent=2)

## Getting the master list of current data - 69k records since 2018

Date of last run: Until Nov.18, 2025

- Only for Mission district
- Only initial reports, not copologic

We only run this **once** to get all the data till today to create our master list that we will then update. 


@Kelly: do you want to ignore incidents reported that were ongoing and only consider the ones resulting in arrests and citation?

In [37]:
# # Same concept as above 

# url = "https://data.sfgov.org/resource/wg3w-h783.csv"

# all_data = []
# offset = 0
# limit = 1000  # this is the max you get if you don't paginate

# # parameters for the url. Easier if we filter in the request itself because of the sheer volume of reports. 
# filters = {
#     '$limit': limit,
#     '$offset': offset,
#     'analysis_neighborhood': 'Mission',  
#     'report_type_description': 'Initial'  
# }

# while True:
#     # Update offset for pagination
#     filters['$offset'] = offset
    
#     # Make request
#     response = requests.get(url, params=filters)
    
#     if response.status_code != 200:
#         print(f"Error: {response.status_code}")
#         break
    
#     # parse CSV
#     chunk_df = pd.read_csv(StringIO(response.text))
    
#     # check if we got any data. If we got nothing, we've reached the end of the list
#     if len(chunk_df) == 0:
#         break
    
#     # creating a list of dfs to concat at the end
#     all_data.append(chunk_df)
    
#     # if we got fewer records than the limit, we've reached the end
#     if len(chunk_df) < limit:
#         break
    
#     # update offset and wait so we don't timeout accidentally
#     offset += limit
#     time.sleep(0.5)

# # concat
# crimes_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

# print(f"Total records fetched: {len(crimes_df)}")
    
# crimes_df.head()

Total records fetched: 68980


Unnamed: 0,row_id,incident_datetime,incident_date,incident_time,incident_year,incident_day_of_week,report_datetime,incident_id,incident_number,cad_number,report_type_code,report_type_description,filed_online,incident_code,incident_category,incident_subcategory,incident_description,resolution,intersection,cnn,police_district,analysis_neighborhood,supervisor_district,supervisor_district_2012,latitude,longitude,point,data_as_of,data_loaded_at
0,150744327170,2025-08-26T11:22:00.000,2025-08-26T00:00:00.000,11:22,2025,Tuesday,2025-08-26T11:22:00.000,1507443,250478275,252381401.0,II,Initial,,27170,Other Miscellaneous,Other,"Resisting, Delaying, or Obstructing Peace Officer Duties",Cite or Arrest Adult,16TH ST \ WIESE ST,24180000,Mission,Mission,9,9,37.765003,-122.420479,POINT (-122.420478821 37.765003204),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
1,150728309320,2025-08-25T08:40:00.000,2025-08-25T00:00:00.000,08:40,2025,Monday,2025-08-26T10:03:00.000,1507283,250478015,252381142.0,II,Initial,,9320,Fraud,Fraud,"Access Card, incl. Credit, Phone, ATM, Fraudulent Use of",Open or Active,MARKET ST \ SOUTH VAN NESS AVE \ VAN NESS AVE,30748000,Southern,Mission,6,6,37.775146,-122.419258,POINT (-122.419258118 37.775146484),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
2,150744562050,2025-08-26T16:16:00.000,2025-08-26T00:00:00.000,16:16,2025,Tuesday,2025-08-26T16:16:00.000,1507445,250479013,252382436.0,II,Initial,,62050,Warrant,Warrant,"Warrant Arrest, Enroute To Outside Jurisdiction",Cite or Arrest Adult,17TH ST \ VALENCIA ST,24174000,Mission,Mission,9,9,37.763298,-122.42173,POINT (-122.421730042 37.763298035),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
3,150742216710,2025-08-26T12:49:00.000,2025-08-26T00:00:00.000,12:49,2025,Tuesday,2025-08-26T12:49:00.000,1507422,250478504,252381724.0,II,Initial,,16710,Drug Offense,Drug Violation,"Narcotics Paraphernalia, Possession of",Cite or Arrest Adult,15TH ST \ MISSION ST,24171000,Mission,Mission,9,9,37.766674,-122.419823,POINT (-122.419822693 37.766674042),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000
4,150621016710,2025-08-21T13:35:00.000,2025-08-21T00:00:00.000,13:35,2025,Thursday,2025-08-21T13:35:00.000,1506210,250468113,252332081.0,II,Initial,,16710,Drug Offense,Drug Violation,"Narcotics Paraphernalia, Possession of",Cite or Arrest Adult,14TH ST \ JULIAN AVE,24373000,Mission,Mission,9,9,37.7682,-122.421211,POINT (-122.421211243 37.768199921),2025-08-27T09:38:07.000,2025-08-28T09:53:00.000


In [38]:
crimes_df.to_csv("crimes_updated.csv", index = False)