<a href="https://colab.research.google.com/github/HongSenDu/citizen-scrape/blob/main/citizen_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import requests
from datetime import datetime
from pprint import pprint
from collections import defaultdict
import pandas as pd
import numpy as np

In [2]:
# Base URL of the Citizen website and the API pieces used to build request URLs.
citizen_site = "https://citizen.com/"
incident_endpoint = "https://citizen.com/api/incident/"
search_route = "search"

# Query-string template for a bounding-box search; the five placeholders are
# filled positionally via str.format (four box values, then the hit limit).
location_query_params = (
    "?insideBoundingBox[0]={}"
    "&insideBoundingBox[2]={}"
    "&insideBoundingBox[1]={}"
    "&insideBoundingBox[3]={}"
    "&limit={}"
)

# Incident keys kept when trimming a full incident payload.
incident_selected_fields = [
    'title',
    'address',
    'neighborhood',
    'location',
    'cityCode',
    'categories',
    'updates',
    'key',
    'police',
    'shareImageLive',
    'stats',
    'hasVod',
    'severity',
    'cs',
]

# Category labels that group_by_category keeps.
# NOTE(review): 'Robbery / Theft ' carries a trailing space -- confirm it matches the API's label.
interested_categories = [
    'Animal Related',
    'Assault / Fight',
    'Break In',
    'Fire',
    'Fire / EMS Activity',
    'Gun Related',
    'Harassment',
    'Protest',
    'Pursuit / Search',
    'Robbery / Theft ',
    'Weapon',
]

# Columns considered suitable for grouping.
groupable_fields = [
    'cityCode',
    'neighborhood',
    'police',
    'categories',
    'hasVod',
]

# Scrape Functions


In [3]:
# takes tuples of bounding lat/long coordinates
def get_data_by_location(latitude, longitude, limit = 10):
  """Search incidents inside a latitude/longitude bounding box.

  Args:
    latitude: two-item sequence; its values fill insideBoundingBox[0] and
      insideBoundingBox[2] of the query.
    longitude: two-item sequence; its values fill insideBoundingBox[1] and
      insideBoundingBox[3] of the query.
    limit: maximum number of hits to request.

  Returns:
    The list stored under the response's "hits" key, or an empty list when
    the request fails or the key is absent.
  """
  endpoint = incident_endpoint + search_route + location_query_params.format(
      latitude[0], latitude[1], longitude[0], longitude[1], limit)
  # Go through endpoint_data so this search sends the same headers and reports
  # errors the same way as the keyword search, and so that a failure yields an
  # empty list instead of an error string that callers would iterate over.
  data = endpoint_data(endpoint)
  return data.get("hits", []) if data else []

# only keep fields that are relevant
def filter_by_keys(data, selected_fields):
  """Return a new dict holding just `selected_fields`; fields missing from `data` map to None."""
  kept = {}
  for field in selected_fields:
    kept[field] = data.get(field)
  return kept

def endpoint_data(endpoint, timeout = 30):
  """GET `endpoint` and return its decoded JSON body.

  Args:
    endpoint: full URL to request.
    timeout: seconds to wait for the server before giving up; without one a
      stalled connection would block the notebook indefinitely.

  Returns:
    The parsed JSON payload, or None when the request fails, the status code
    is not 200, or the body is not valid JSON. Failures are printed, not raised.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  try:
    response = requests.get(endpoint, headers=headers, timeout=timeout)
  except requests.RequestException as error:
    # connection errors and timeouts are reported like HTTP errors
    print(f"Error: {error}")
    return None
  if response.status_code != 200:
    print(f"Error: {response.status_code}")
    return None
  try:
    return response.json()
  except ValueError as error:
    # a 200 response with a non-JSON body would otherwise raise here
    print(f"Error: invalid JSON response ({error})")
    return None

# most accurate when using city_codes
def get_data_by_keyword(term, limit = 10):
  """Search incidents by keyword.

  Args:
    term: search text; converted to str and URL-encoded before it is sent.
    limit: maximum number of hits to request.

  Returns:
    The value under the response's "hits" key (or the whole payload when that
    key is absent), or an empty list when the request fails.
  """
  from urllib.parse import quote

  # Incident titles are passed in as search terms and may contain characters
  # such as '&', '#' or '/', which would otherwise corrupt the query string.
  encoded_term = quote(str(term), safe='')
  endpoint = incident_endpoint + search_route + f"?q={encoded_term}&limit={limit}"
  data = endpoint_data(endpoint)
  if not data:
    return []
  return data.get("hits", data)

# get single incident information, optionally trimmed to the relevant fields
def get_incident_data(id, selected_fields = False):
  """Fetch one incident and normalise it for tabular display.

  Args:
    id: incident identifier used to build the request URL. The name shadows
      the builtin but is kept so that existing callers keep working.
    selected_fields: iterable of keys to keep; a falsy value keeps the whole
      payload.

  Returns:
    A dict with 'categories' collapsed into a comma-separated string (when it
    is a non-empty list) and a 'url' entry pointing at the incident page, or
    an empty list when the request fails.
  """
  endpoint = incident_endpoint + f"{id}/stats"
  data = endpoint_data(endpoint)
  if not data:
    return []
  if selected_fields:
    data = filter_by_keys(data, selected_fields)
  # 'categories' can be absent when no field filter is applied (or when the
  # filter does not list it), so look it up without raising KeyError. Only
  # join real sequences: joining a plain string would split it per character.
  categories = data.get('categories')
  if isinstance(categories, (list, tuple)) and categories:
    data['categories'] = ','.join(categories)
  data["url"] = citizen_site + f"{id}"
  return data

# function to get all data from citizen
def get_all_data():
  """Request the search endpoint with a very large limit.

  Returns:
    The value under the response's "hits" key (or the whole payload when that
    key is absent), or an empty list when the request fails or returns nothing.
  """
  endpoint = incident_endpoint + search_route + "?limit=10000000000"
  data = endpoint_data(endpoint)
  # endpoint_data returns None on failure; calling .get on it would raise
  if not data:
    return []
  return data.get("hits", data) or []

# group data by category of incident
def group_by_category(data):
  """Bucket incidents by their comma-joined category label.

  Only labels listed in interested_categories are kept. The result maps
  label -> {objectID: {'title': ..., 'updates': [update text, ...]}}.
  """
  grouped = defaultdict(dict)
  for incident in data:
    label = ','.join(incident['categories'])
    if label not in interested_categories:
      continue
    entry = {}
    grouped[label][incident['objectID']] = entry
    entry['title'] = incident['title']
    entry["updates"] = [update['text'] for update in incident['updates']]
  return grouped

# saves to local csv file from scraped data
def write_to_csv(file, data):
  """Write a list of dict rows to `file` as CSV.

  Args:
    file: destination path.
    data: iterable of rows; entries that are not dicts (e.g. the empty list
      left behind by a failed incident lookup) are skipped.

  The header is the union of all row keys in first-seen order, so rows with
  differing keys do not abort the export; missing values are written as empty
  cells. When there is nothing to write the file is left untouched.
  """
  rows = [row for row in data if isinstance(row, dict)]
  if not rows:
    return
  # dict.fromkeys de-duplicates while preserving first-seen column order
  fieldnames = list(dict.fromkeys(key for row in rows for key in row))
  with open(file, 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(rows)

# formats a millisecond timestamp for display; None stays None
def _format_timestamp_ms(timestamp_ms):
  if timestamp_ms is None:
    return None
  return datetime.fromtimestamp(timestamp_ms / 1000).strftime("%m/%d/%Y, %H:%M:%S")

# fetch details for 'red' severity incidents and keep the relevant fields
def get_incident_fields(incidents):
  """Fetch and flatten the detail record of every 'red' severity incident.

  Args:
    incidents: iterable of dicts with at least 'objectID'; entries whose
      'severity' is not 'red' are ignored.

  Returns:
    A list of dicts, one per incident whose lookup succeeded. Each dict holds
    the selected incident fields plus formatted 'created_at' / 'updated_at'
    strings, a 'usersNotified' count and 'stats' flattened to 'key:value' text.
  """
  incidents_arr = []
  for incident in incidents:
    if incident.get('severity') != 'red':
      continue
    incident_data = get_incident_data(incident['objectID'], incident_selected_fields)
    # A failed lookup is not a dict; skipping it keeps the result homogeneous
    # so callers can sort on 'usersNotified' and export rows without crashing.
    if not isinstance(incident_data, dict):
      continue
    fallback_ts = incident_data.get('cs')
    # 'stats' may be None after field filtering; treat that as "no stats"
    stats = incident_data.get('stats') or {}
    categories = incident_data.get('categories')
    incident_data.update({
        "created_at": _format_timestamp_ms(incident_data.get('created_at', fallback_ts)),
        "updated_at": _format_timestamp_ms(incident_data.get('updated_at', fallback_ts)),
        "categories": ''.join(categories) if categories else None,
        "usersNotified": stats.get('usersNotifiedUnique', 0),
        "stats": ', '.join(f'{key}:{value}' for key, value in stats.items()),
    })
    incidents_arr.append(incident_data)
  return incidents_arr

# get updates array of dictionaries for incidents
def get_only_updates(incident):
  """Flatten incident['updates'] (a mapping of update id -> update dict) into a list of rows.

  Values that are not dicts are skipped. Each row carries the update id, the
  'ts' millisecond timestamp formatted in local time, the text and the
  display location.
  """
  timestamp_format = "%m/%d/%Y, %H:%M:%S"
  updates_arr = []
  for update_id, update in incident['updates'].items():
    if not isinstance(update, dict):
      continue
    created = datetime.fromtimestamp(update["ts"] / 1000)
    updates_arr.append({
        "id": update_id,
        "created_at": created.strftime(timestamp_format),
        "text": update["text"],
        "location": update["displayLocation"],
    })
  return updates_arr

# Scraping by NYC


In [4]:
# get nyc incidents by keyword
nyc_incidents = get_data_by_keyword("nyc", limit = 10000)

# fetch the detail record of each incident, trimmed to the relevant fields
incidents_arr = get_incident_fields(nyc_incidents)

# failed lookups are not dicts; drop them so sorting and the CSV export below
# only ever see complete rows
incidents_arr = [incident for incident in incidents_arr if isinstance(incident, dict)]

# sort by number of people broadcasted to (missing counts sort as 0)
incidents_arr = sorted(incidents_arr, key=lambda d: d.get('usersNotified') or 0, reverse=True)

In [5]:
# save the sorted incident rows to a local CSV file
write_to_csv('nyc_data.csv', incidents_arr)

In [6]:
# build a dataframe from the scraped incident rows (one row per incident)
df_nyc = pd.DataFrame.from_dict(incidents_arr)

# Display

In [7]:
from ipywidgets import widgets, Layout, Box, VBox
from IPython.display import display, Javascript, HTML
from google.colab import data_table, output

# turn on Colab's data_table formatter for displayed dataframes
# (presumably what provides the sortable table view -- confirm against the Colab docs)
data_table.enable_dataframe_formatter()

In [8]:
# output areas that the handlers below render into
# filtered incident table, written by common_filterer
table_output = widgets.Output()
# field/value table of the clicked incident, written by show_incident_data
info_output = widgets.Output()
# updates of the clicked incident, written by show_incident_data
updates_output = widgets.Output()
# incidents similar to the clicked one, written by show_incident_data
similar_output = widgets.Output()

# Formatters

In [9]:
# HTML template for a clickable incident table. It is filled with
# str.format(table=...), so literal braces in the CSS/JS are doubled.
# The script attaches a click listener to every table row except the first
# (header) row; a click passes the innerHTML of the row's 7th <td> to the
# Python callback registered as 'notebook.show_incident_data'.
# NOTE(review): index 6 is hard-coded, so it depends on the column order of
# the rendered dataframe -- confirm it still points at the incident key.
dataframe_html_string = '''
  <head>
    <style> table, th, td {{border:1px solid black !important;}} </style>
  </head>
  <body>
    {table}
  </body>
  <script> 
    function get_info(selected_key) {{ google.colab.kernel.invokeFunction('notebook.show_incident_data', [], {{selected_key}}); }}
    var rows = Array.from(document.getElementsByTagName('tr'));
    // remove header row
    rows.shift();
    rows.forEach((row, i) => {{
      row.addEventListener('click', () => {{
        get_info(row.getElementsByTagName('td')[6].innerHTML);
      }});
    }}); 
  </script>
'''

# dropdown option meaning "do not filter on this field"
ALL = 'ALL'

In [10]:
# makes links hyperlinks
def make_clickable(val):
    """Render `val` as an anchor that opens in a new tab, using it as both target and label."""
    anchor_template = '<a target="_blank" href="{0}">{0}</a>'
    return anchor_template.format(val)

# converts image URLs to img tags
def path_to_image_html(path):
    """Render an image URL as an <img> tag capped at 124px height.

    Returns an empty string when `path` is not a non-empty string (for example
    None or NaN for incidents without a share image), so such cells render
    blank instead of raising while the table is formatted.
    """
    if not isinstance(path, str) or not path:
        return ''
    # the style attribute needs both quotes to be well-formed HTML
    return '<img src="' + path + '" style="max-height:124px;"/>'

# grab similar incidents to current incident
def get_similar_incidents(current_incident):
  """Search for incidents whose keyword search matches the current incident's title.

  Args:
    current_incident: incident dict; only its 'title' is used.

  Returns:
    Up to five search hits, passed through get_incident_fields.
  """
  # only the title drives the search; the former 'cityCode' lookup was unused
  # and could raise KeyError for no benefit
  title = current_incident['title']
  return get_incident_fields(get_data_by_keyword(title, limit = 5))

# writes to output display after clicking on an incident to explore
def show_incident_data(selected_key):
  """Render the details, updates and similar incidents of one incident.

  Args:
    selected_key: incident identifier, as sent by the click handler in the
      HTML table.

  The three detail output areas are cleared first. When the incident cannot
  be loaded a short message is shown instead of raising inside the callback.
  """
  info_output.clear_output()
  updates_output.clear_output()
  similar_output.clear_output()

  results = get_incident_fields([{'objectID': selected_key, 'severity': 'red'}])
  incident = results[0] if results else None
  # a failed lookup yields either no entry or a non-dict placeholder
  if not isinstance(incident, dict):
    with info_output:
      print(f"Could not load incident {selected_key}")
    return

  df = pd.DataFrame(list(incident.items()), columns = ['Fields','Values'])
  with info_output:
    # display link to citizen incident endpoint
    display(HTML(f"Citizen incident link: <a href={citizen_site + selected_key}>{incident['title']}</a>"))
    display(df)

  # 'updates' may be missing or None; show an empty table in that case
  has_updates = isinstance(incident.get('updates'), dict)
  updates_df = pd.DataFrame(get_only_updates(incident)) if has_updates else pd.DataFrame()
  with updates_output:
    display(updates_df)

  # errors='ignore' keeps an empty result (no 'updates' column) from raising
  similar_incidents = pd.DataFrame(get_similar_incidents(incident)).drop('updates', axis=1, errors='ignore')
  with similar_output:
      # displays in clickable table form
      display(HTML(dataframe_html_string.format(table=similar_incidents.to_html(escape=False,
                                                justify='center',
                                                formatters=dict(shareImageLive=path_to_image_html, url=make_clickable), border=1))))

# expose the function to the table's JavaScript click handler
output.register_callback('notebook.show_incident_data', show_incident_data)


# Data Handling and Display

In [11]:
# drop the 'updates' column before the dataframe is used for the table widgets
df_nyc = df_nyc.drop('updates', axis=1)
# NOTE(review): common_filterer assigns its own local widget_df, so this
# module-level alias does not appear to be read in the visible code
widget_df = df_nyc

In [12]:
def unique_sorted_values_plus_ALL(array):
    """Return the dropdown options for a dataframe column.

    Args:
        array: pandas Series holding the column values.

    Returns:
        A list starting with ALL, followed by the column's distinct values in
        sorted order. Missing values (None/NaN) are left out: they cannot be
        ordered against strings and would make the sort raise TypeError.
    """
    unique = sorted(array.dropna().unique().tolist())
    return [ALL] + unique

# handler for category widget
def category_event_handler(change):
  """Re-filter the table with the newly picked category and the current neighborhood."""
  picked_category = change.new
  current_neighborhood = dropdown_neighborhood.value
  common_filterer(picked_category, current_neighborhood)

# handler for neighborhood widget
def neighborhood_event_handler(change):
  """Re-filter the table with the current category and the newly picked neighborhood."""
  current_category = dropdown_categories.value
  picked_neighborhood = change.new
  common_filterer(current_category, picked_neighborhood)

# handler for type of display widget. Supports colab dataframe and HTML rendering
def display_event_handler(change):
  """Redraw the table with the current filters; the new display mode is read inside common_filterer."""
  current_category = dropdown_categories.value
  current_neighborhood = dropdown_neighborhood.value
  common_filterer(current_category, current_neighborhood)

# filters based on category and neighborhood. Displays based on option selected
def common_filterer(category, neighborhood):
  """Redraw table_output with df_nyc narrowed to the chosen category and neighborhood.

  ALL for either argument means that field is not filtered. The table is shown
  either as a plain dataframe or as the clickable HTML rendering, depending on
  the display dropdown.
  """
  table_output.clear_output()

  # collect one boolean mask per active filter
  conditions = []
  if category != ALL:
    conditions.append(df_nyc.categories == category)
  if neighborhood != ALL:
    conditions.append(df_nyc.neighborhood == neighborhood)

  if not conditions:
    filtered = df_nyc
  elif len(conditions) == 1:
    filtered = df_nyc[conditions[0]]
  else:
    filtered = df_nyc[conditions[0] & conditions[1]]

  with table_output:
    if dropdown_display.value != "Groupable Data Table":
      rendered = filtered.to_html(
          escape=False,
          justify='center',
          formatters=dict(shareImageLive=path_to_image_html, url=make_clickable),
          border=1,
      )
      display(HTML(dataframe_html_string.format(table=rendered)))
    else:
      display(filtered)


In [13]:
# category filter: ALL followed by the distinct values of df_nyc.categories
dropdown_categories = widgets.Dropdown(
    options=unique_sorted_values_plus_ALL(df_nyc.categories),
    description='Category',
    disabled=False,
)
# re-filter the table whenever the selected category changes
dropdown_categories.observe(category_event_handler, names='value')

# neighborhood filter: ALL followed by the distinct values of df_nyc.neighborhood
dropdown_neighborhood = widgets.Dropdown(
    options=unique_sorted_values_plus_ALL(df_nyc.neighborhood),
    description='Neighborhood',
    disabled=False,
)
# re-filter the table whenever the selected neighborhood changes
dropdown_neighborhood.observe(neighborhood_event_handler, names='value')

# display mode: the first option shows the dataframe directly, the other
# renders the clickable HTML table (see common_filterer)
dropdown_display = widgets.Dropdown(
    options=["Groupable Data Table", "HTML Rendering"],
    description='Display Option',
    disabled=False,
)
# redraw the table whenever the display mode changes
dropdown_display.observe(display_event_handler, names='value')

# Interactables

In [18]:
# flex row layout that centers the three dropdowns
box_layout = Layout(display='flex',
                    flex_flow='row',
                    justify_content='center',
                    align_items='stretch',
                    overflow='visible',
                    width='auto')
# put the dropdowns side by side and show them above the table output
box_auto = Box(children=[dropdown_categories, dropdown_neighborhood, dropdown_display], layout=box_layout)
display(VBox([box_auto]))
# the table itself appears here once common_filterer writes to table_output
display(table_output)

VBox(children=(Box(children=(Dropdown(description='Category', options=('ALL', 'Barricade', 'Fire', 'Gun Relate…

Output()

# Output

In [15]:
# field/value table of the clicked incident (filled by show_incident_data)
display(info_output)

Output()

In [16]:
# updates of the clicked incident (filled by show_incident_data)
display(updates_output)

Output()

In [17]:
# incidents similar to the clicked one (filled by show_incident_data)
display(similar_output)

Output()