# 2.2 Location normalizer

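**Goal**: take the free-text `location` field in our freelancer dataset and turn it into something more usable for matching. This notebook maps each city name to a population-based size tier (0 = extra small through 5 = extra-extra large) using US city population data. City names that appear in several states are treated as the most populous city with that name, and locations missing from the city table are assigned the smallest tier.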

In [None]:
# import packages
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from glob import glob
from collections import Counter

ERROR: Error in parse(text = x, srcfile = src): <text>:2:8: unexpected symbol
1: # import packages
2: import numpy
          ^


Load the necessary datasets -- our freelancer dataset and the US census bureau dataset (retrieved from [this website](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/?dataChart=eyJxdWVyaWVzIjpbeyJjb25maWciOnsiZGF0YXNldCI6InVzLWNpdGllcy1kZW1vZ3JhcGhpY3MiLCJvcHRpb25zIjp7fX0sImNoYXJ0cyI6W3siYWxpZ25Nb250aCI6dHJ1ZSwidHlwZSI6ImNvbHVtbiIsImZ1bmMiOiJBVkciLCJ5QXhpcyI6Im1lZGlhbl9hZ2UiLCJzY2llbnRpZmljRGlzcGxheSI6dHJ1ZSwiY29sb3IiOiIjRkY1MTVBIn1dLCJ4QXhpcyI6ImNpdHkiLCJtYXhwb2ludHMiOjUwLCJzb3J0IjoiIn1dLCJ0aW1lc2NhbGUiOiIiLCJkaXNwbGF5TGVnZW5kIjp0cnVlLCJhbGlnbk1vbnRoIjp0cnVlfQ%3D%3D) with public domain license).

In [None]:
# read the freelancer dataset from disk
# (low_memory=False makes pandas parse the file in one pass instead of in chunks)
FREELANCER_CSV = '/work/DS4SG-Gender-Inequality/data/gender-annotated/cleaned-gender-annotated-v5.csv'
df = pd.read_csv(FREELANCER_CSV, low_memory=False)

# preview the first few rows
df.head()

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs,join_date_from_earliest,badge_preferred_freelancer,badge_verified
0,designer,Milen,male,https://www.freelancer.com/u/MsCaddServices,Edmonds,45,0.0,0.0,0,0,...,,,,,,,,7063,False,False
1,designer,Jeremy,male,https://www.freelancer.com/u/Conescu,Orinda,90,0.0,0.0,0,0,...,,,,,,,,7526,False,False
2,designer,Nichole,female,https://www.freelancer.com/u/NicholeMW,Holly,25,4.0,5.0,2,0,...,,,,,,,,6430,False,False
3,designer,Robert,male,https://www.freelancer.com/u/rhoenig1277,Beloit,75,0.0,0.0,0,0,...,,,,,,,,3238,False,False
4,designer,Jean-Paul,male,https://www.freelancer.com/u/PaulCarriazo,Miami,19,0.0,0.0,0,0,...,,,,,,,,6661,False,False


In [None]:
# read the US city table (one row per city, including its population)
CITIES_CSV = '/work/DS4SG-Gender-Inequality/data/cities/uscities.csv'
cities = pd.read_csv(CITIES_CSV)

# preview the first few rows
cities.head()

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
0,New York,New York,NY,New York,36061,New York,40.6943,-73.9249,18713220,10715,polygon,False,True,America/New_York,1,11229 11226 11225 11224 11222 11221 11220 1138...,1840034016
1,Los Angeles,Los Angeles,CA,California,6037,Los Angeles,34.1139,-118.4068,12750807,3276,polygon,False,True,America/Los_Angeles,1,90291 90293 90292 91316 91311 90037 90031 9000...,1840020491
2,Chicago,Chicago,IL,Illinois,17031,Cook,41.8373,-87.6862,8604203,4574,polygon,False,True,America/Chicago,1,60018 60649 60641 60640 60643 60642 60645 6064...,1840000494
3,Miami,Miami,FL,Florida,12086,Miami-Dade,25.7839,-80.2102,6445545,5019,polygon,False,True,America/New_York,1,33129 33125 33126 33127 33128 33149 33144 3314...,1840015149
4,Dallas,Dallas,TX,Texas,48113,Dallas,32.7936,-96.7662,5743938,1526,polygon,False,True,America/Chicago,1,75287 75098 75233 75254 75251 75252 75253 7503...,1840019440


In [None]:
# Several cities share a name across different states, and our freelancer data
# only gives a bare city name. For every repeated name we therefore keep just
# the city with the largest population.
#
# Sorting once and dropping duplicate names replaces the previous row-by-row
# append loop: it is a single pass, and it leaves the column dtypes untouched.
new_cities = (
    cities
    # largest population first; a stable sort keeps the original row order
    # among cities whose populations tie
    .sort_values(by="population", ascending=False, kind="mergesort")
    # the first row seen for each name is now the most populous one
    .drop_duplicates(subset="city", keep="first")
    .reset_index(drop=True)
)

# new_cities is already ordered from most to least populous
new_cities

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
0,New York,New York,NY,New York,36061,New York,40.6943,-73.9249,18713220,10715,polygon,False,True,America/New_York,1,11229 11226 11225 11224 11222 11221 11220 1138...,1840034016
1,Los Angeles,Los Angeles,CA,California,6037,Los Angeles,34.1139,-118.4068,12750807,3276,polygon,False,True,America/Los_Angeles,1,90291 90293 90292 91316 91311 90037 90031 9000...,1840020491
2,Chicago,Chicago,IL,Illinois,17031,Cook,41.8373,-87.6862,8604203,4574,polygon,False,True,America/Chicago,1,60018 60649 60641 60640 60643 60642 60645 6064...,1840000494
15463,Miami,Miami,FL,Florida,12086,Miami-Dade,25.7839,-80.2102,6445545,5019,polygon,False,True,America/New_York,1,33129 33125 33126 33127 33128 33149 33144 3314...,1840015149
15464,Dallas,Dallas,TX,Texas,48113,Dallas,32.7936,-96.7662,5743938,1526,polygon,False,True,America/Chicago,1,75287 75098 75233 75254 75251 75252 75253 7503...,1840019440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15458,Portage Creek,Portage Creek,AK,Alaska,2070,Dillingham,58.9051,-157.6695,3,0,polygon,False,False,America/Anchorage,3,99576,1840023660
15461,The Ranch,The Ranch,MN,Minnesota,27087,Mahnomen,47.3198,-95.6952,2,2,polygon,False,True,America/Chicago,3,56557,1840039629
15459,Gross,Gross,NE,Nebraska,31015,Boyd,42.9461,-98.5697,2,6,polygon,False,True,America/Chicago,3,68719,1840011032
15460,Lotsee,Lotsee,OK,Oklahoma,40143,Tulsa,36.1334,-96.2091,2,39,polygon,False,True,America/Chicago,3,74063,1840021674


In [None]:
# Ordinal city-size tiers, smallest to largest.
SIZE_XSMALL = 0
SIZE_SMALL = 1
SIZE_MEDIUM = 2
SIZE_LARGE = 3
SIZE_XLARGE = 4
SIZE_XXLARGE = 5

# (exclusive lower population bound, tier), checked from largest to smallest.
# A city gets the first tier whose bound its population strictly exceeds.
_TIER_LOWER_BOUNDS = (
    (2500000, SIZE_XXLARGE),  # greater than 2.5 million
    (1000000, SIZE_XLARGE),   # greater than 1 million
    (500000, SIZE_LARGE),     # greater than 500,000
    (100000, SIZE_MEDIUM),    # greater than 100,000
    (10000, SIZE_SMALL),      # greater than 10,000
)


def get_size_tier(city, city_table=None):
    """Return the size tier (SIZE_XSMALL .. SIZE_XXLARGE) for a city name.

    Parameters
    ----------
    city : str
        City name, compared for exact equality with the ``city`` column.
    city_table : pandas.DataFrame, optional
        Table with ``city`` and ``population`` columns. Defaults to the
        notebook-level ``new_cities`` frame, looked up at call time. If a name
        occurs more than once, the first matching row is used.

    Returns
    -------
    int
        One of the ``SIZE_*`` constants. Names not found in the table are
        assumed to be very small places and get ``SIZE_XSMALL``; so does any
        city with a population of 10,000 or fewer.
    """
    if city_table is None:
        city_table = new_cities

    matches = city_table.loc[city_table["city"] == city, "population"].to_numpy()

    # we assume that if we don't recognize the city,
    # then it's a very small city
    if len(matches) == 0:
        return SIZE_XSMALL

    population = matches[0]

    # classify the city by the largest bound its population strictly exceeds
    for lower_bound, tier in _TIER_LOWER_BOUNDS:
        if population > lower_bound:
            return tier

    # 10,000 or fewer
    return SIZE_XSMALL
    

# spot-check the tier assigned to a handful of cities
# (Miami gets classified as tier 5 because we always assume it is Miami FL, the biggest)
for example_city in ["New York", "Miami", "Kansas City", "Carmel", "Wendover"]:
    print(example_city + ":", get_size_tier(example_city))

New York: 5
Miami: 5
Kansas City: 4
Carmel: 2
Wendover: 0


In [None]:
# Look up the size tier once per distinct location rather than once per row:
# get_size_tier scans the whole city table on every call, and many freelancers
# share the same location.
tier_by_location = {
    location: get_size_tier(location)
    for location in df["location"].dropna().unique()
}

# Rows with a missing location are not in the lookup, so they come back as NaN;
# like any other unrecognized location, they get the smallest tier.
location_size = (
    df["location"]
    .map(tier_by_location)
    .fillna(SIZE_XSMALL)
    .astype(int)
)

# drop the result of any earlier run so this cell can be re-executed safely
if "location_size" in df.columns:
    del df["location_size"]

# place location_size directly to the right of location
df.insert(df.columns.get_loc("location") + 1, "location_size", location_size)

df.head()

Unnamed: 0,search_query,name,gender,profile_link,location,location_size,hourly_rate,pay_grade,avg_rating,num_reviews,...,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs,join_date_from_earliest,badge_preferred_freelancer,badge_verified
0,designer,Milen,male,https://www.freelancer.com/u/MsCaddServices,Edmonds,1,45,0.0,0.0,0,...,,,,,,,,7063,False,False
1,designer,Jeremy,male,https://www.freelancer.com/u/Conescu,Orinda,1,90,0.0,0.0,0,...,,,,,,,,7526,False,False
2,designer,Nichole,female,https://www.freelancer.com/u/NicholeMW,Holly,0,25,4.0,5.0,2,...,,,,,,,,6430,False,False
3,designer,Robert,male,https://www.freelancer.com/u/rhoenig1277,Beloit,1,75,0.0,0.0,0,...,,,,,,,,3238,False,False
4,designer,Jean-Paul,male,https://www.freelancer.com/u/PaulCarriazo,Miami,5,19,0.0,0.0,0,...,,,,,,,,6661,False,False


In [None]:
# write the dataset to the interim data folder
# (index=False leaves the DataFrame's row index out of the file)
LOCATION_CLEANED_CSV = '../data/interim/location-cleaned.csv'
df.to_csv(LOCATION_CLEANED_CSV, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>