## Exploritory Data Analysis of Seattle Pet License data set.

Use the API for the Seattle pet licence open data to get the breed and ZIP code location:

https://data.seattle.gov/Community/Seattle-Pet-Licenses/jguv-t9rb

Standard AKC breed names and characteristics can be found in this data set:

https://www.kaggle.com/datasets/paultimothymooney/best-in-show-data-about-dogs?resource=download

In [107]:
# Requests allows us to make HTTP requests which we will use to get data from an API
import requests
import pandas as pd
import numpy as np
# Datetime is a library that allows us to represent dates
import datetime
import json
import folium
# Import folium MarkerCluster plugin
from folium.plugins import MarkerCluster
# Import folium MousePosition plugin
from folium.plugins import MousePosition
# Import folium DivIcon plugin
from folium.features import DivIcon

import geopandas as gpd
# geopandas uses fiona to read/write files. No need to import fiona

# fuzzywuzzy is a fuzzy string match package
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


In [3]:
# Use API endpoint instead of downloading CSV
url=  'https://data.seattle.gov/resource/jguv-t9rb.json'
App_Token  = 'ooI5TiOD2Gls57XhqffGZykOU'


response = requests.get(url, params={'$$app_token':App_Token, 'species':'Dog', '$order':'license_number'})

In [12]:
response.headers

{'Server': 'nginx', 'Date': 'Sat, 25 Jun 2022 21:06:58 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Access-Control-Allow-Origin': '*', 'ETag': '"YWxwaGEuMTYxMTcyXzJfNDV4Z2RZX0x3eHZuVTBMV2JqWUpUeDZzdnEtYzA---gzirwMp7vjEEoQeyOO4YddFDO-CAp3w--gzip--gzip"', 'X-SODA2-Fields': '["license_issue_date","license_number","animal_s_name","species","primary_breed","secondary_breed","zip_code"]', 'X-SODA2-Types': '["floating_timestamp","text","text","text","text","text","text"]', 'X-SODA2-Data-Out-Of-Date': 'false', 'X-SODA2-Truth-Last-Modified': 'Thu, 21 Apr 2022 19:57:44 GMT', 'X-SODA2-Secondary-Last-Modified': 'Thu, 21 Apr 2022 19:57:44 GMT', 'Last-Modified': 'Thu, 21 Apr 2022 19:57:44 GMT', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Age': '0', 'X-Socrata-Region': 'aws-us-east-1-fedramp-prod', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains', 'X-Socrata-RequestId': '73c533f8f34c01665b3945d32ec45

In [19]:
# Each request returns a limit of 1000 records unless specified in the request parameters
len(response.json())

1000

In [142]:
# Use json_normalize method to convert the json result into a dataframe
license_data = pd.json_normalize(response.json())
license_data.head()

Unnamed: 0,license_issue_date,license_number,animal_s_name,species,primary_breed,secondary_breed,zip_code
0,2020-12-15T00:00:00.000,108440,Pepe,Dog,Havanese,Mix,98125
1,2022-04-07T00:00:00.000,10912,Alfalfa,Dog,Terrier,Mix,98144
2,2021-03-12T00:00:00.000,114061,Lucy,Dog,"Terrier, Cairn",,98112
3,2021-04-19T00:00:00.000,130498,Aster,Dog,"Terrier, Rat",,98112
4,2021-04-21T00:00:00.000,131870,Lexie Lucile,Dog,Border Collie,,98146


In [143]:
# Drop any rows with missing zip_code
license_data.dropna(axis=0,subset=['zip_code'],inplace=True)

# Convert zip_code from Object to int
license_data = license_data.astype({'zip_code': 'int64'})
distinct_zip = license_data['zip_code'].unique()

### Read best_in_show.xlsx

Get standardized AKC breed names, category, size category

In [62]:
path = 'E:/UserLo/source/repos/learning/Seattle Pets/'
file = 'best_in_show.xlsx'

# Load just the columns of interest
col_names = ['Dog breed','category','size category']
breed_data = pd.read_excel(path+file,sheet_name='best_in_show',header = 0, skiprows=[1], usecols=col_names)

In [40]:
breed_data.head()

Unnamed: 0,Dog breed,category,size category
0,Affenpinscher,toy,small
1,Afghan Hound,hound,large
2,Airedale Terrier,terrier,medium
3,Akita,working,large
4,Alaskan Malamute,working,large


Need to stardardize the primary_breed and secondary_breed to match the AKC Dog breed names so we can look up other AKC breed characteristics.

In [None]:
pri_breed = license_data['primary_breed'].unique()
pri_breed

In [58]:
fuzz.token_set_ratio('Fox Wire Terrier','Fox Terrier – Wirehair')

89

In [19]:
AKC_name, match_ratio, breed_data_index = process.extractOne('Spaniel, English Springer', breed_data['Dog breed'], scorer=fuzz.token_set_ratio)
print (AKC_name,' ' , match_ratio, ' ' ,breed_data_index)

English Springer Spaniel   100   18


In [18]:
breed_data.iloc[18]

Dog breed           English Springer Spaniel
category                            sporting
POPULARITY IN US                        29.0
size category                         medium
Name: 18, dtype: object

In [64]:
def get_AKC_name (name,min_match_ratio = 90):
    AKC_name, match_ratio, breed_data_index = process.extractOne(name, breed_data['Dog breed'], scorer=fuzz.token_set_ratio)
    if match_ratio >= min_match_ratio :
        return (AKC_name)
    else:
        return (np.NaN)

In [161]:
# A match ratio of 70 seems to be a good tradeoff between accuracy and missing matches
license_data_AKC = license_data['primary_breed'].apply(get_AKC_name, args=(70,))

In [162]:
# Here are the primary_breed names that could not be matched with AKC names
license_data.iloc[license_data_AKC[license_data_AKC.isna()].index]['primary_breed'].unique()

array(['Retriever, Golden', 'Terrier, Cairn', 'Retriever, Labrador',
       'Chihuahua, Short Coat', 'German Shepherd', 'Maltese',
       'Spaniel, Tibetan', 'Boxer', 'Terrier, Fox, Toy', 'Poodle, Toy',
       'Havanese', 'Pug', 'Terrier, Jack Russell'], dtype=object)

In [163]:
# Add AKC names to license_data. Only a small percentage are not identified with AKC name
license_data['AKC name']= license_data_AKC
print(f"Ratio of dogs not identified with AKC breed name: {license_data[license_data['AKC name'].isna()]['license_number'].count()/license_data['license_number'].count()}")

Ratio of dogs not identified with AKC breed name: 0.015120967741935484


In [164]:
# Drop rows with AKC name == NaN
license_data.dropna(axis=0,subset=['AKC name'],inplace=True)

In [165]:
# Merge license data with breed data to get category info
# Drop redundant column
license_data_category = pd.merge(license_data,breed_data, how='inner', left_on='AKC name', right_on='Dog breed')
license_data_category.drop(columns='Dog breed',inplace=True)
license_data_category.head()

Unnamed: 0,license_issue_date,license_number,animal_s_name,species,primary_breed,secondary_breed,zip_code,AKC name,category,size category
0,2020-12-15T00:00:00.000,108440,Pepe,Dog,Havanese,Mix,98125,Havanese,toy,small
1,2021-03-08T00:00:00.000,132428,Rascal,Dog,Havanese,,98125,Havanese,toy,small
2,2021-04-03T00:00:00.000,137128,Zeus,Dog,Havanese,,98109,Havanese,toy,small
3,2022-04-14T00:00:00.000,138818,Zelda,Dog,Havanese,,98125,Havanese,toy,small
4,2020-11-10T00:00:00.000,139970,Posie,Dog,Havanese,,98144,Havanese,toy,small


### Use geopandas to read the shapefile for ZCTA

In [96]:
shapefile_path = 'E:/UserLo/source/repos/learning/Seattle Pets/'
shapefile = 'tl_2021_us_zcta520.zip!tl_2021_us_zcta520.shp'
zcta = gpd.read_file(shapefile_path+shapefile)

In [169]:
zcta = zcta.astype({'ZCTA5CE20': 'int64'})

In [170]:
Seattle_zcta = zcta[zcta['ZCTA5CE20'].isin(distinct_zip)]

In [171]:
Seattle_zcta.head()

Unnamed: 0,ZCTA5CE20,GEOID20,CLASSFP20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,geometry
1617,98112,98112,B5,G6350,S,8298455,5385571,47.6375196,-122.2876507,"POLYGON ((-122.31865 47.63049, -122.31861 47.6..."
23666,98104,98104,B5,G6350,S,2448372,477423,47.6004584,-122.3322265,"POLYGON ((-122.34544 47.60317, -122.34292 47.6..."
23677,98125,98125,B5,G6350,S,14051186,2040765,47.7168827,-122.3005643,"POLYGON ((-122.33020 47.72754, -122.32969 47.7..."
23678,98115,98115,B5,G6350,S,16980015,5175191,47.6857531,-122.2837027,"POLYGON ((-122.33017 47.70144, -122.32996 47.7..."
23770,98126,98126,B5,G6350,S,8023087,654474,47.5492206,-122.3743409,"POLYGON ((-122.38266 47.59061, -122.38260 47.5..."


In [172]:
# Change data type of Lat Long to float
Seattle_zcta = Seattle_zcta.astype({'INTPTLAT20': 'float64'})
Seattle_zcta = Seattle_zcta.astype({'INTPTLON20': 'float64'})
Seattle_zcta = Seattle_zcta.astype({'ZCTA5CE20': 'int64'})
Seattle_zcta.dtypes

ZCTA5CE20        int64
GEOID20         object
CLASSFP20       object
MTFCC20         object
FUNCSTAT20      object
ALAND20          int64
AWATER20         int64
INTPTLAT20     float64
INTPTLON20     float64
geometry      geometry
dtype: object

In [173]:
missing_zips = np.setdiff1d(distinct_zip,Seattle_zcta['ZCTA5CE20'].unique())

In [174]:
missing_zips

array([98111, 98114, 98139], dtype=int64)

In [176]:
license_data_category[license_data_category['zip_code'].isin(missing_zips)]

Unnamed: 0,license_issue_date,license_number,animal_s_name,species,primary_breed,secondary_breed,zip_code,AKC name,category,size category
74,2021-02-17T00:00:00.000,26739,Luke,Dog,"Terrier, Rat",,98114,Rat Terrier,terrier,
458,2021-01-16T00:00:00.000,25274,Fenway,Dog,"Retriever, Labrador",,98139,Labrador Retriever,sporting,medium
565,2021-04-26T00:00:00.000,207149,Gabe,Dog,"Retriever, Golden","Poodle, Miniature",98139,Golden Retriever,sporting,medium
944,2020-05-02T00:00:00.000,214637,Max,Dog,Coonhound,,98111,American English Coonhound,hound,large
971,2022-01-25T00:00:00.000,26080,Amie,Dog,"Griffon, Brussels",,98139,Brussels Griffon,toy,small


In [177]:
# There are some Seattle zip codes that are not in the Census ZCTA. The Census web site describes conditions when zip codes are dropped/merged
# drop those rows
license_data_category.drop(index = license_data_category[license_data_category['zip_code'].isin(missing_zips)].index,inplace=True)

Write a GeoJSON file that can then be used by folium.features.GeoJson

In [118]:
geojsonfile_path = 'E:/UserLo/source/repos/learning/Seattle Pets/'
geojsonfile = 'Seattle_zcta.geojson'
Seattle_zcta.to_file(geojsonfile_path+geojsonfile, driver='GeoJSON')

  pd.Int64Index,


Seattle Lat, Long = 47.625440, -122.335892

In [191]:
Seattle_loc = (47.625440, -122.335892)
Seattle_map = folium.Map(location=Seattle_loc, zoom_start=11)

In [194]:
folium.GeoJson(geojsonfile_path+geojsonfile, name="geojson", overlay=False).add_to(Seattle_map)
folium.LayerControl().add_to(Seattle_map)
## Seattle_map

<folium.map.LayerControl at 0x1834e9adf00>

### Add a marker cluster to the map to show dog locations

Add markers for each dog. The text of the marker will be AKC name.

Look up Lat Long in Seattle_zcta

In [195]:
marker_cluster = MarkerCluster()
# Add marker_cluster to current Seattle_map
Seattle_map.add_child(marker_cluster)

# for each row in license_data_category data frame
# create a Marker object with its coordinate
# and customize the Marker's icon property to indicate AKC name
for index, record in license_data_category.iterrows():
    try:
        lat = Seattle_zcta[Seattle_zcta['ZCTA5CE20'] == record['zip_code']]['INTPTLAT20'].iat[0]
        long = Seattle_zcta[Seattle_zcta['ZCTA5CE20'] == record['zip_code']]['INTPTLON20'].iat[0]
    except:
        print (record['zip_code'])
        break
    marker = folium.map.Marker (
        location=(lat,long),
        tooltip = record['AKC name'],
        icon=folium.map.Icon(
            color='white', 
            icon_color = 'red',
            icon_text = record['AKC name']
            )

    )
    marker_cluster.add_child(marker)

Seattle_map