In [14]:
import pandas as pd
import json
import requests
from pprint import pprint

In [37]:
# Search endpoint for the "global-shark-attack" dataset; the query string asks
# for 1000 rows and lists the fields to facet on.
url = "https://public.opendatasoft.com/api/records/1.0/search/?dataset=global-shark-attack&q=&rows=1000&facet=date&facet=type&facet=country&facet=area&facet=activity&facet=sex&facet=age&facet=fatal_y_n&facet=time&facet=species"

# requests has no default timeout, so without one an unresponsive server
# would block this cell (and Restart & Run All) indefinitely.
response = requests.get(url, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx status rather than failing later
# while parsing or indexing an error payload.
response.raise_for_status()

# Decode the JSON body into Python objects
response_json = response.json()

In [38]:
# Keep only the value stored under the "records" key of the payload
records = response_json["records"]

# Show how many records were returned
print(len(records))

1000


In [39]:
# Collect the "fields" dictionary of every record into one list
records_fields = [record["fields"] for record in records]

# Display the first entry as a sanity check on what was collected
records_fields[:1]

[{'activity': 'Surfing',
  'location': 'Centerville Beach',
  'sex': 'M',
  'year': '2022',
  'injury': 'Thigh injured',
  'case_number': '2022.010.02',
  'time': '15h30',
  'name': 'Jared Trainor',
  'investigator_or_source': 'Outsider, 10/3/2022',
  'age': '31',
  'species': "White shark, 13'",
  'date': '2022-10-02',
  'type': 'Unprovoked',
  'area': 'California',
  'country': 'USA',
  'fatal_y_n': 'N'}]

In [40]:
# Build a DataFrame from the list of field dictionaries
df = pd.DataFrame(data=records_fields)
df.head()  # head() shows the first 5 rows by default

Unnamed: 0,activity,location,sex,year,injury,case_number,time,name,investigator_or_source,age,...,type,area,country,fatal_y_n,href_formula,original_order,case_number1,href,pdf,case_number0
0,Surfing,Centerville Beach,M,2022,Thigh injured,2022.010.02,15h30,Jared Trainor,"Outsider, 10/3/2022",31.0,...,Unprovoked,California,USA,N,,,,,,
1,Surfing,"Topsail Beach, Pender County",F,2022,Lacerations to lower leg,2022.07.19.a,Early morning,Erika Vreuls,C. Creswell. GSAF,,...,Unprovoked,Noirth Carolina,USA,N,http://sharkattackfile.net/spreadsheets/pdf_di...,6787.0,2022.07.19.a,http://sharkattackfile.net/spreadsheets/pdf_di...,2022.07.19.a-Vreuis.pdf,2022.07.19.a
2,Swimming,"Sawyer Key , Monroe County",F,2022,Laceration to leg,2022.06.29.b,20h00,Lindsay Rebecca Bruns,Miami Herald/ 7/12022,35.0,...,Unprovoked,Florida,USA,N,http://sharkattackfile.net/spreadsheets/pdf_di...,6772.0,2022.06.29.b,http://sharkattackfile.net/spreadsheets/pdf_di...,2022.06.29.b-Bruns.pdf,2022.06.29.b
3,Swimming,Oyster Stacks near Exmouth12h15,F,2022,Minor injury,2022.06.01,12h15,,"Perth Now, 6//2/2022",,...,Unprovoked,Western Australia,AUSTRALIA,N,http://sharkattackfile.net/spreadsheets/pdf_di...,6765.0,2022.06.01,http://sharkattackfile.net/spreadsheets/pdf_di...,2022.06.01-Exmouth.pdf,2022.06.01
4,Swimming,Pororari River,F,2021,Laceration to big toe,2021.12.25,,Cordelia Scott,"New Zealand Herald, 12/31/2021",9.0,...,Unprovoked,South Island,New Zealand,N,http://sharkattackfile.net/spreadsheets/pdf_di...,6726.0,2021.12.25,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.12.25-Scott.pdf,2021.12.25


In [41]:
# Trimmed frame holding only the columns listed below
columns_to_keep = [
    "activity",
    "location",
    "sex",
    "year",
    "injury",
    "case_number",
    "time",
    "age",
    "type",
    "area",
    "country",
    "fatal_y_n",
    "species",
]
df_cut = df.loc[:, columns_to_keep]
df_cut.head()  # head() shows the first 5 rows by default

Unnamed: 0,activity,location,sex,year,injury,case_number,time,age,type,area,country,fatal_y_n,species
0,Surfing,Centerville Beach,M,2022,Thigh injured,2022.010.02,15h30,31.0,Unprovoked,California,USA,N,"White shark, 13'"
1,Surfing,"Topsail Beach, Pender County",F,2022,Lacerations to lower leg,2022.07.19.a,Early morning,,Unprovoked,Noirth Carolina,USA,N,
2,Swimming,"Sawyer Key , Monroe County",F,2022,Laceration to leg,2022.06.29.b,20h00,35.0,Unprovoked,Florida,USA,N,
3,Swimming,Oyster Stacks near Exmouth12h15,F,2022,Minor injury,2022.06.01,12h15,,Unprovoked,Western Australia,AUSTRALIA,N,
4,Swimming,Pororari River,F,2021,Laceration to big toe,2021.12.25,,9.0,Unprovoked,South Island,New Zealand,N,


In [62]:
# Count the non-null "sex" entries within each "sex" group
sex_groups = df_cut.groupby(by=["sex"])
df_sex = sex_groups["sex"].count()

df_sex


sex
F    114
M    798
Name: sex, dtype: int64

In [63]:
# List the distinct raw values found in the "species" column
pd.unique(df_cut["species"])

array(["White shark, 13'", nan, '5.5 ft shark', 'White shark',
       'Nurse shark, juvenile', 'Blacktip or Spinner shark',
       'Shovelnose shark which is a ray, not a shark)',
       'Bull shark, 3.5 m', 'Wobbegong shark', 'Tiger shark',
       'White shark, 4.6 m', 'Cookiecutter shark',
       "Bronze whaler shark, 6'", "4' to 5' shark", "6' shark",
       "4' shark", 'Mako shark', 'Tiger shark, 8 to 12 feet', '2 m shark',
       "Bull shark, 4.5'", 'Sandtiger shark', "6' to 8' shark",
       '2m shark', "White shark, >6'", '3 m shark',
       "White shark, 5' to 6' juvenile", "5' to 8' shark",
       "2' to 3' shark", 'Raggedtooth shark',
       'Thought to involve a cookiecutter shark', 'Dusky shark, 2m',
       "5' shark", "3' to 4' shark", "Tiger shark, 14'", 'Nurse shark',
       "1.8 m [6'] bull shark", '2.5 m shark',
       "4 m to 5 m [13' to 16.5'] white shark", 'Questionable Incident',
       'Shark involvement prior to death was not confirmed',
       "3.7 m to 4.3 m [1

In [None]:
# Clean up the data by finding keywords and replacing the species information.
# For example, any species that contains "bull" will be replaced with "bull shark",
# any species that contains both "Great" and "white" will be replaced with "Great white",
# and any species that contains "Tiger" will be replaced with "Tiger".
# Other keywords to handle: Hammerhead, Mako, Spinner, Nurse, Cookiecutter, Blacktip, Raggedtooth, Zambesi, Grey reef shark, dogfish, wobbegong.
# If the species contains "or", then it will be replaced with "unknown".
#  