In [1]:
import pandas as pd
import csv
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine
# from config import db_password -- for Postgres
from geopy.geocoders import Nominatim

pd.set_option("display.max_rows", None)

# File to Load
dataset_one_to_load = "Resources/AnimalIntakeWithResultsExtended2018.csv"

In [2]:
#THIS IS WITH 2018 CSV FILE ONLY - ALL THREE YEARS MUST BE JOINED WITH SQL IN DB

# Read the intake data and store into a Pandas DataFrame
dataset_one_df = pd.read_csv(dataset_one_to_load)

In [3]:
dataset_one_df.head()

Unnamed: 0,Animal #,ARN,Animal Name,Animal Type,Species,Primary Breed,Secondary Breed,Distinguishing Markings,Gender,Altered,...,Outcome Agency Name,Agency Street Address,Agency Unit Number,Agency City,Agency Province,Agency Postal Code,Agency Email,Agency Home Phone,Agency Cell Number,RN
0,A38478589,c8548,Raven,Cat,Cat,Domestic Shorthair,Mix,,F,Yes,...,,,,,,,,,,1
1,A38485565,C8549,Cleo,Cat,Cat,Domestic Shorthair,Mix,,F,No,...,,,,,,,,,,1
2,A38485698,D2207,Ollie,Dog,Dog,American Blue Heeler,Mix,,M,Yes,...,,,,,,,,,,1
3,A38485738,D2208,Ernest,Dog,Dog,American Blue Heeler,Mix,,M,Yes,...,,,,,,,,,,1
4,A38548723,D2209,Bentley,Dog,Dog,Basset Hound,Mix,,M,No,...,,,,,,,,,,1


In [4]:
dataset_one_df.dtypes

Animal #                    object
ARN                         object
Animal Name                 object
Animal Type                 object
Species                     object
Primary Breed               object
Secondary Breed             object
Distinguishing Markings     object
Gender                      object
Altered                     object
Danger                      object
Danger Reason              float64
Date Of Birth               object
Age in Months Intake       float64
Age Group                   object
Intake Asilomar Status      object
Intake Condition            object
Intake Record Owner         object
Intake Date                 object
Intake Type                 object
Intake Subtype              object
Found Address               object
Found Zip Code             float64
Reason                      object
Intake SiteName            float64
Jurisdiction In             object
Agency Name                 object
Agency Member               object
Agency Member Phone 

In [5]:
#Change postal codes from float64 to string

dataset_one_df['Found Zip Code'] = dataset_one_df['Found Zip Code'].map('{:.0f}'.format)
dataset_one_df['Out Postal Code'] = dataset_one_df['Out Postal Code'].map('{:.0f}'.format)

In [6]:
#Encode columns for gender and altered with sklearn labelencoder

le = LabelEncoder()
df = dataset_one_df.copy()
df['Altered'] = le.fit_transform(df['Altered'])

In [7]:
df['Gender'] = le.fit_transform(df['Gender'])

In [8]:
df['Species'] = le.fit_transform(df['Species'])

In [9]:
# Drop non-beneficial columns

df = df.drop(columns=['ARN','Animal Name','Distinguishing Markings', "Animal #", "Danger", "Danger Reason", "Date Of Birth", "Intake Asilomar Status"])

In [10]:
df = df.drop(columns=['Intake Record Owner', 'Agency Member', 'Agency Member Phone', 'Agency Address', 'Intake Person ID Type', 'Intake Person #'])


In [11]:
df = df.drop(columns=['Admitter', 'Street Address', 'Unit Number', 'City', 'Province', 'Postal Code', "Admitter's Email", "Admitter's Home Phone", "Admitter's Cell Phone", 'Initial Stage'])


In [12]:
df = df.drop(columns=['Microchip Issue Date', 'Microchip Provider', 'Microchip Number', 'Pet ID', 'Pet ID Type', 'Status'])

In [13]:
df = df.drop(columns=['Stage', 'Location', 'Sublocation', 'Outcome Asilomar Status', 'Outcome Number', 'Released By', 'Date Created', 'Outcome SiteName', 'Jurisdiction Out', 'Outcome Person ID'])


In [14]:
df = df.drop(columns=['Outcome Person ID Type', 'Outcome Person #', 'Outcome Person Name', 'Out Unit Number', 'Out Email',  'Age in Months Current'])

In [15]:
df = df.drop(columns=['Animal Type','Out Home Phone', 'Out Cell Phone', 'Agency Unit Number', 'Agency City', 'Agency Province', 'Agency Postal Code', 'Agency Email', 'Agency Home Phone', 'RN', 'Intake SiteName', 'Jurisdiction In', 'Agency Street Address', 'Agency Cell Number', 'Intake Person ID', 'Initial Review Date'])

In [16]:
# Pie charts? Bar charts by years? show percentage of Strays turned in by ACO/Police, Public, Born in HSWC
# Show percent strays altered/adopted/euthanized
# Show percent of surgeries that are in clinic vs. resident population
# Show percent Owner Surrender adopted/transferred out/euthanized

in_out_counts_df = df.groupby(['Intake Type',  'Intake Subtype','Outcome Type', 'Outcome Subtype']).size().reset_index(name='counts')
in_out_counts_df

Unnamed: 0,Intake Type,Intake Subtype,Outcome Type,Outcome Subtype,counts
0,Clinic,Microchip,Clinic Out,Microchip,2
1,Clinic,Neuter,Clinic Out,Other,14
2,Clinic,Spay,Clinic Out,Other,13
3,Clinic,Vaccinations,Clinic Out,Vaccinations,4
4,Owner/Guardian Surrender,Born in Care,Transfer Out,For Adoption,2
5,Owner/Guardian Surrender,Euthanasia Owner Request,Euthanasia,Disease - Chronic,4
6,Owner/Guardian Surrender,Euthanasia Owner Request,Euthanasia,Disease - Other Fatal,1
7,Owner/Guardian Surrender,Euthanasia Owner Request,Euthanasia,Owner Request,3
8,Owner/Guardian Surrender,For Adoption,Adoption,Adoption Center,309
9,Owner/Guardian Surrender,For Adoption,Died,Unknown,4


In [17]:
# this is just a list of partner agencies, most of which accept "Transfer Out/For Adoption" animals, won't use in
# names in statistical analysis, but % of Transfers would be helpful and whether or not spayed/neutered before Xfer

df.rename(columns = {"Outcome Agency Name": "Outcome_Agency_Name"}, inplace=True)
df.Outcome_Agency_Name.unique()

array([nan, 'Orphan Animal Rescue - OARS',
       'Langlade County Humane Society, Inc.',
       'K & R Small Animal Sanctuary', 'Wisconsin Boston Terrier Rescue',
       'Great Dane Rescue of Minnesota and Wisconsin',
       'Safe Haven Pet Sanctuary Inc (Cafe)', "Chrissy's K9 Kastle",
       'Bichon and Little Buddies Rescue', 'Green Bay Animal Rescue',
       'Neenah Animal Shelter', 'Green Lake Area Animal Shelter, LTD',
       'Humane Society of Sheboygan County',
       'Unforgettable Underdogs Dog Rescue',
       'Portage County Humane Society', 'Oshkosh Area Humane Society'],
      dtype=object)

In [18]:
# create separate dataframes for found address and adopted address, process these dataframes separately from df
# remove nans before converting to lat/long (not important to keep index of df)

found_address = df[['Found Address', 'Found Zip Code']]
found_address_df = pd.DataFrame(found_address)
found_address_df

Unnamed: 0,Found Address,Found Zip Code
0,,54961.0
1,,54983.0
2,,54456.0
3,,54456.0
4,412 Wisconsin St,54981.0
5,Granite & North St,54981.0
6,Born at HSWC,54981.0
7,Born at HSWC,54981.0
8,Born at HSWC,54981.0
9,Born at HSWC,54981.0


In [19]:
# separate adopted address dataframe
adopted_address = df[['Out Street Address','Out City','Out Province','Out Postal Code']]
adopted_address_df = pd.DataFrame(adopted_address)
adopted_address_df

Unnamed: 0,Out Street Address,Out City,Out Province,Out Postal Code
0,N3762 Maple Grove Road,Weyauwega,WI,54983.0
1,,,,
2,S2888 Waumandee Creek Road,Fountain City,WI,54629.0
3,E3499 Tanner Road,Ogdensburg,WI,54962.0
4,409 Scott Street,Waupaca,WI,54981.0
5,404 Scott Street,Waupaca,WI,54981.0
6,,,,
7,,,,
8,,,,
9,,,,


In [20]:
# remove Nans from addresses dfs, see how many rows are left

found_address_df = found_address_df.dropna()
adopted_address_df = adopted_address_df.dropna()

found_address_df

Unnamed: 0,Found Address,Found Zip Code
4,412 Wisconsin St,54981.0
5,Granite & North St,54981.0
6,Born at HSWC,54981.0
7,Born at HSWC,54981.0
8,Born at HSWC,54981.0
9,Born at HSWC,54981.0
10,Born at HSWC,54981.0
11,1222 E. Clark st,54981.0
14,South Park,54981.0
17,Granit St.,54981.0


In [21]:
# found_counts_df = found_address_df.groupby(['Found Address']).size().reset_index(name='counts')
# found_counts_df

In [22]:
# these are the datapoints that will be mapped to show where adopters are located, columns need to be concat to
#run in geolocater to get lat/long

adopted_address_df

Unnamed: 0,Out Street Address,Out City,Out Province,Out Postal Code
0,N3762 Maple Grove Road,Weyauwega,WI,54983
2,S2888 Waumandee Creek Road,Fountain City,WI,54629
3,E3499 Tanner Road,Ogdensburg,WI,54962
4,409 Scott Street,Waupaca,WI,54981
5,404 Scott Street,Waupaca,WI,54981
11,E1635 Cty Hwy C,Iola,WI,54945
12,1209 W Nicholet Road,Appleton,WI,54914
13,N5517 County Rd I,Fremont,WI,54940
14,E995 Nottleson Road,Scandinavia,WI,54977
15,412 Lind Street,Fremont,WI,54940


In [23]:
# drop individual address columns from original df

df = df.drop(columns=['Found Address', 'Found Zip Code', 'Out Street Address', 'Out City', 'Out Province', 'Out Postal Code'])

In [24]:
# 19 columns remain in original df
# check list of columns to see what remains
my_list = df.columns.values.tolist()
my_list

['Species',
 'Primary Breed',
 'Secondary Breed',
 'Gender',
 'Altered',
 'Age in Months Intake',
 'Age Group',
 'Intake Condition',
 'Intake Date',
 'Intake Type',
 'Intake Subtype',
 'Reason',
 'Agency Name',
 'Outcome Date',
 'Release Date',
 'Outcome Type',
 'Outcome Subtype',
 'Outcome Reason',
 'Outcome_Agency_Name']

In [25]:
df.describe()

Unnamed: 0,Species,Gender,Altered,Age in Months Intake
count,574.0,574.0,574.0,504.0
mean,0.609756,0.54007,0.902439,17.944444
std,0.551984,0.516023,0.296979,34.178413
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,1.0
50%,1.0,1.0,1.0,2.0
75%,1.0,1.0,1.0,18.0
max,3.0,2.0,1.0,204.0


In [26]:
#Rename column headers

found_address_df.rename(columns = {"Found Address": "Found_Address"}, inplace=True)
found_address_df.rename(columns = {"Found Zip Code": "Found_Zip_Code"}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
# Remove non-address addresses, such as Born at HSWC, Mom is c8555, in found_address
## CAN'T GET THIS TO WORK!!!

#found_address_df[found_address_df.Found_Address != "Born at HSWC"]
found_address_df[found_address_df["Found_Address"].str.contains("Mom is C8555")==False]
found_address_df

Unnamed: 0,Found_Address,Found_Zip_Code
4,412 Wisconsin St,54981.0
5,Granite & North St,54981.0
6,Born at HSWC,54981.0
7,Born at HSWC,54981.0
8,Born at HSWC,54981.0
9,Born at HSWC,54981.0
10,Born at HSWC,54981.0
11,1222 E. Clark st,54981.0
14,South Park,54981.0
17,Granit St.,54981.0


In [28]:
# Using separate cleaned address dataframes, concat separate address columns into one for lat/long conversion

found_address_df["clean_found_address"] = found_address_df['Found_Address'].map(str) + '  ' + found_address_df['Found_Zip_Code'].map(str)
found_address_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Found_Address,Found_Zip_Code,clean_found_address
4,412 Wisconsin St,54981.0,412 Wisconsin St 54981
5,Granite & North St,54981.0,Granite & North St 54981
6,Born at HSWC,54981.0,Born at HSWC 54981
7,Born at HSWC,54981.0,Born at HSWC 54981
8,Born at HSWC,54981.0,Born at HSWC 54981
9,Born at HSWC,54981.0,Born at HSWC 54981
10,Born at HSWC,54981.0,Born at HSWC 54981
11,1222 E. Clark st,54981.0,1222 E. Clark st 54981
14,South Park,54981.0,South Park 54981
17,Granit St.,54981.0,Granit St. 54981


In [29]:
adopted_address_df['clean_adopted_address'] = adopted_address_df['Out Street Address'].map(str) + ', ' + adopted_address_df['Out City'].map(str) + ', ' + adopted_address_df['Out Province'].map(str) + ', ' + adopted_address_df['Out Postal Code'].map(str)
adopted_address_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Out Street Address,Out City,Out Province,Out Postal Code,clean_adopted_address
0,N3762 Maple Grove Road,Weyauwega,WI,54983,"N3762 Maple Grove Road, Weyauwega, WI, 54983"
2,S2888 Waumandee Creek Road,Fountain City,WI,54629,"S2888 Waumandee Creek Road, Fountain City, WI,..."
3,E3499 Tanner Road,Ogdensburg,WI,54962,"E3499 Tanner Road, Ogdensburg, WI, 54962"
4,409 Scott Street,Waupaca,WI,54981,"409 Scott Street, Waupaca, WI, 54981"
5,404 Scott Street,Waupaca,WI,54981,"404 Scott Street, Waupaca, WI, 54981"
11,E1635 Cty Hwy C,Iola,WI,54945,"E1635 Cty Hwy C, Iola, WI, 54945"
12,1209 W Nicholet Road,Appleton,WI,54914,"1209 W Nicholet Road, Appleton, WI, 54914"
13,N5517 County Rd I,Fremont,WI,54940,"N5517 County Rd I, Fremont, WI, 54940"
14,E995 Nottleson Road,Scandinavia,WI,54977,"E995 Nottleson Road, Scandinavia, WI, 54977"
15,412 Lind Street,Fremont,WI,54940,"412 Lind Street, Fremont, WI, 54940"


In [30]:
#this may be used on separate addresses dataframes
#EXAMPLE INPUT ADDRESS/OUTPUT

geolocator = Nominatim(timeout=10, user_agent = "myGeolocator")
location = geolocator.geocode('4550 Kester Mill Rd,Winston-Salem,NC')
print(location)
print((location.latitude, location.longitude))

Walmart Supercenter, 4550, Kester Mill Road, Winston-Salem, Forsyth County, North Carolina, 27103, United States
(36.06752315, -80.3372069310351)


In [31]:
#use GeoPy/Nominatim to convert Found_Address and Adopter_Address to lat/long
# add columns to store lat/long 

found_address_df['found_geocode'] = found_address_df.clean_found_address.apply(geolocator.geocode)
adopted_address_df['adopted_geocode']= adopted_address_df.clean_adopted_address.apply(geolocator.geocode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [32]:
found_address_df

Unnamed: 0,Found_Address,Found_Zip_Code,clean_found_address,found_geocode
4,412 Wisconsin St,54981.0,412 Wisconsin St 54981,"(412, Wisconsin Street, Waupaca, Waupaca Count..."
5,Granite & North St,54981.0,Granite & North St 54981,"(Granite Street, Waupaca, Waupaca County, Wisc..."
6,Born at HSWC,54981.0,Born at HSWC 54981,
7,Born at HSWC,54981.0,Born at HSWC 54981,
8,Born at HSWC,54981.0,Born at HSWC 54981,
9,Born at HSWC,54981.0,Born at HSWC 54981,
10,Born at HSWC,54981.0,Born at HSWC 54981,
11,1222 E. Clark st,54981.0,1222 E. Clark st 54981,"(E, Clark Street, North Andover, Essex County,..."
14,South Park,54981.0,South Park 54981,"(South Park, Waupaca, Waupaca County, Wisconsi..."
17,Granit St.,54981.0,Granit St. 54981,


In [33]:
adopted_address_df

Unnamed: 0,Out Street Address,Out City,Out Province,Out Postal Code,clean_adopted_address,adopted_geocode
0,N3762 Maple Grove Road,Weyauwega,WI,54983,"N3762 Maple Grove Road, Weyauwega, WI, 54983",
2,S2888 Waumandee Creek Road,Fountain City,WI,54629,"S2888 Waumandee Creek Road, Fountain City, WI,...",
3,E3499 Tanner Road,Ogdensburg,WI,54962,"E3499 Tanner Road, Ogdensburg, WI, 54962",
4,409 Scott Street,Waupaca,WI,54981,"409 Scott Street, Waupaca, WI, 54981","(409, Scott Street, Waupaca, Waupaca County, W..."
5,404 Scott Street,Waupaca,WI,54981,"404 Scott Street, Waupaca, WI, 54981","(404, Scott Street, Waupaca, Waupaca County, W..."
11,E1635 Cty Hwy C,Iola,WI,54945,"E1635 Cty Hwy C, Iola, WI, 54945",
12,1209 W Nicholet Road,Appleton,WI,54914,"1209 W Nicholet Road, Appleton, WI, 54914",
13,N5517 County Rd I,Fremont,WI,54940,"N5517 County Rd I, Fremont, WI, 54940",
14,E995 Nottleson Road,Scandinavia,WI,54977,"E995 Nottleson Road, Scandinavia, WI, 54977",
15,412 Lind Street,Fremont,WI,54940,"412 Lind Street, Fremont, WI, 54940","(412, Lind Street, Fremont, Waupaca County, Wi..."


In [34]:
found_address_df.dtypes

Found_Address          object
Found_Zip_Code         object
clean_found_address    object
found_geocode          object
dtype: object

## Address conversion to lat/long - need to deal with Nones (TRIED ABOVE!)

In [35]:
# obtain lat/long for each address

found_address_df['found_lat'] = [g.latitude if g is not None else 0 for g in found_address_df.found_geocode]
found_address_df['found_long'] = [g.longitude if g is not None else 0 for g in found_address_df.found_geocode]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [36]:
found_address_df.head()

Unnamed: 0,Found_Address,Found_Zip_Code,clean_found_address,found_geocode,found_lat,found_long
4,412 Wisconsin St,54981,412 Wisconsin St 54981,"(412, Wisconsin Street, Waupaca, Waupaca Count...",44.362289,-89.090603
5,Granite & North St,54981,Granite & North St 54981,"(Granite Street, Waupaca, Waupaca County, Wisc...",44.360147,-89.088948
6,Born at HSWC,54981,Born at HSWC 54981,,0.0,0.0
7,Born at HSWC,54981,Born at HSWC 54981,,0.0,0.0
8,Born at HSWC,54981,Born at HSWC 54981,,0.0,0.0


In [37]:
adopted_address_df['adopted_lat'] = [g.latitude if g is not None else 0 for g in adopted_address_df.adopted_geocode]
adopted_address_df['adopted_long'] = [g.longitude if g is not None else 0 for g in adopted_address_df.adopted_geocode]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
adopted_address_df.head()

Unnamed: 0,Out Street Address,Out City,Out Province,Out Postal Code,clean_adopted_address,adopted_geocode,adopted_lat,adopted_long
0,N3762 Maple Grove Road,Weyauwega,WI,54983,"N3762 Maple Grove Road, Weyauwega, WI, 54983",,0.0,0.0
2,S2888 Waumandee Creek Road,Fountain City,WI,54629,"S2888 Waumandee Creek Road, Fountain City, WI,...",,0.0,0.0
3,E3499 Tanner Road,Ogdensburg,WI,54962,"E3499 Tanner Road, Ogdensburg, WI, 54962",,0.0,0.0
4,409 Scott Street,Waupaca,WI,54981,"409 Scott Street, Waupaca, WI, 54981","(409, Scott Street, Waupaca, Waupaca County, W...",44.361261,-89.090254
5,404 Scott Street,Waupaca,WI,54981,"404 Scott Street, Waupaca, WI, 54981","(404, Scott Street, Waupaca, Waupaca County, W...",44.361227,-89.090024


In [39]:
#save cleaned df to csv
# save addresses dfs to separate csv files

df.to_csv('first_clean_intake.csv', index=False)

In [40]:
#db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/HSWC_database"
#engine = create_engine(db_string)

In [41]:
#df.to_sql(name='Intake_Demo_Table', con=engine, if_exists='replace')

In [42]:
# df = data.loc[(data.record_sub_type == 'Neuter') | (data.record_sub_type == 'Spay')]
# df.head()