# Yelp Matching IDs
To get Yelp data, we first need to match each DPH restaurant name to its Yelp business ID.

## Part 1: Using Old Data
A 2020 version of the Yelp academic dataset included Georgia. We can use it to obtain some IDs up front and reduce the
number of API calls required, since Yelp limits us to 5000 requests/day. Fuzzy matching on name and address gives us
the best match for each restaurant.

In [1]:
import pandas as pd
import time
from tqdm import tqdm
from yelpapi import YelpAPI

Unnamed: 0,dph_id,violations,years_data,lifetime_count,lifetime_comments,lifetime_avg,violation_rate
0,MTE4MTc1Mg==,"[8-2B, 11A, 15C, 4-2B, 15B, 17A, 17C, 17D, 11A...",2.230137,11,Observed the concentration of the sanitizer in...,96.500000,4.932432
1,MTE4MTc2MA==,[],0.000000,0,,100.000000,0.000000
2,MTE4MTc2Nw==,"[6-1A, 6-1C, 13A]",0.000000,3,See Attachment See Attachment See Attachment,88.000000,3.000000
3,MTE4MTcxOA==,"[2-2D, 4-2B, 15A, 6-1A]",2.460274,4,Observed Handwash sink in bar used as dumpsink...,93.333333,1.625835
4,MTE4MTcyMQ==,[],0.613699,0,,100.000000,0.000000
...,...,...,...,...,...,...,...
7837,OTQ5MzY3NA==,[],0.000000,0,,100.000000,0.000000
7838,OTQ5MzY3NQ==,"[2-2D, 4-2B, 6-2, 15C, 17C, 17C, 18, 4-2B, 17C...",2.893151,23,Observed failure to have hand soap at sushi ba...,89.200000,7.949811
7839,OTQ5MzcwNQ==,"[3-1B, 8-2B, 2-2A, 2-2E, 5-1B, 11A, 3-1B, 11A,...",1.997260,10,46F milk and 65F chicken wrap sandwich. No de...,92.500000,5.006859
7840,OTQ5MzcwNw==,"[6-1A, 6-1B, 10D, 16A, 16B]",2.569863,5,Observed various salsas and sour cream being s...,95.333333,1.945629


In [34]:
# Load the geocoded DPH dataframe (gdf) and the Yelp restaurant dataframe (rdf).
# NOTE: read_pickle can execute arbitrary code; only load pickle files you trust.
gdf = pd.read_pickle('geocoded_df.pkl')
rdf = pd.read_csv('restaurant_db.csv')

In [56]:
from thefuzz import process

def fuzzy_merge(df1, df2, key1, key2, threshold=80, limit=1):
    """Attach the best fuzzy matches from ``df2[key2]`` to every row of ``df1``.

    For each value in ``df1[key1]`` the ``limit`` best candidates among the
    values of ``df2[key2]`` are looked up with ``process.extract`` and stored
    in a new ``'matches'`` column.

    Note that ``df1`` is modified in place (the ``'matches'`` column is added
    or overwritten) and the same object is returned. ``threshold`` is accepted
    but currently not applied; callers filter on the match score themselves.
    """
    candidates = df2[key2].tolist()

    def best_candidates(value):
        # Every row is compared against the full candidate list
        return process.extract(value, candidates, limit=limit)

    df1['matches'] = df1[key1].apply(best_candidates)
    return df1

# Build a lower-cased "<name>, <addr>" key for the DPH dataframe. Columns are
# selected by label and cast to str first so non-string values (e.g. NaN) can be joined.
gdf['full_name'] = gdf[['name', 'map_add']].astype(str)\
    .apply(lambda x: ', '.join(x).lower(), axis=1)

# Do the same thing for the Yelp dataframe: "<name>, <address>, <city>, <state>, <postal_code>"
rdf['full_name'] = rdf[['name', 'address', 'city', 'state', 'postal_code']].astype(str)\
    .apply(lambda x: ', '.join(x).lower(), axis=1)

# Fuzzy merge the datasets (this process is very slow, unfortunately).
# fuzzy_merge adds a 'matches' column to gdf in place and returns the same dataframe.
gdf = fuzzy_merge(gdf, rdf, 'full_name', 'full_name', 80, 1)

# Threshold based on the goodness of the match: keep rows whose best candidate
# scores at least 89. Rows with an empty match list are treated as "no match"
# instead of raising IndexError.
has_good_match = gdf['matches'].map(lambda m: bool(m) and m[0][1] >= 89).astype(bool)

# .copy() so the column added below lands on an independent dataframe rather
# than on a slice of gdf (avoids SettingWithCopyWarning / lost assignment).
good_matches = gdf[has_good_match].copy()

# Extract the match name into its own column
good_matches['match_name'] = good_matches['matches'].map(lambda m: m[0][0])

# Ensure there are no duplicate matches (required by validate='one_to_one' below)
uniq_matches = good_matches.drop_duplicates('match_name')
uniq_rdf = rdf.drop_duplicates('full_name')

# This gives us the final dataframe with Yelp business ID's
final_matches = uniq_matches.merge(uniq_rdf[['business_id', 'full_name']], how='inner', left_on='match_name',
                                   right_on='full_name',
                                   validate='one_to_one')

# Clean up unnecessary columns ('full_name' exists on both sides, hence the _x/_y suffixes)
yelp1 = final_matches.drop(['full_name_x', 'matches', 'match_name', 'full_name_y'], axis=1)

## Part 2: The Remainder
For the restaurants that could not be matched in Part 1, we have no choice but to get the IDs from the Yelp API.

In [None]:
# Locate the entities which we do not yet have a Yelp ID for: the outer merge
# with indicator=True tags rows that only exist in gdf as "left_only".
remaining_df = (
    pd.merge(gdf, yelp1, indicator=True, how='outer', on='dph_id')
    .query('_merge=="left_only"')
    .drop('_merge', axis=1)
)
# Columns present in both frames carry an "_x" suffix for the gdf side of the merge
remaining_df = remaining_df[['dph_id', 'map_add_x', 'name_x', 'lat_x', 'lng_x']]

# Eliminate schools, hospitals, prisons, churches, and other institutions
exclude = ['SCHOOL', 'MOBILE', 'BASE', 'CATER', 'ELEMENTARY', 'HOSPITAL', 'JAIL', 'JR. HIGH', 'REHABILITATION',
           'TREATMENT', 'REGIONAL', 'SYNAGOGUE', 'SOCIETY', 'SENIOR', 'CENTER', 'BIBLE', 'CAMPUS', 'RETIREMENT',
           'MANAGEMENT', 'MEDICINE', 'ACADEMY', 'DEPARTMENT']

# str() keeps a non-string name (e.g. NaN) from raising AttributeError
keep_mask = remaining_df['name_x'].apply(
    lambda name: not any(word in str(name).upper() for word in exclude)
).astype(bool)
filtered = remaining_df[keep_mask]


def _split_address(map_add):
    """Split a comma-separated address into a ``(street, city)`` pair.

    The city is taken from the second-to-last comma-separated part. The street
    is the first part when there are 3 or 4 parts, otherwise the first two
    parts joined by a space. Surrounding whitespace is stripped from each part.
    A single-part address yields an empty city instead of raising IndexError.
    """
    parts = [part.strip() for part in map_add.split(',')]
    if len(parts) == 1:
        return parts[0], ''
    if len(parts) in (3, 4):
        return parts[0], parts[-2]
    return ' '.join(parts[:2]), parts[-2]


# Normalize the addresses to use the business-match API endpoint
rows = []
for dph_id, name, map_add in zip(filtered['dph_id'], filtered['name_x'], filtered['map_add_x']):
    street, city = _split_address(map_add)
    rows.append({
        'dph_id': dph_id,
        'name': str(name).lower(),
        'street': street.lower(),
        'city': city.lower(),
        'state': 'GA',
        'country': 'US',
    })

parsed_addr_df = pd.DataFrame.from_records(rows)

### Ask Yelp for the Data
IMPORTANT: Since API calls are limited, you may need to break this into separate runs, each covering a different
slice of the dataframe (for example, one for the first half and another for the second half).

In [None]:
import os

## First Part
# frame = parsed_addr_df.iloc[906:]

## Second Part
frame = parsed_addr_df.iloc[4966:]

# Read the API key from the environment instead of hard-coding it in the notebook.
# NOTE: a key was previously committed here; treat it as compromised and rotate it.
api_key = os.environ['YELP_API_KEY']

responses = []
# Create the client once for the whole batch rather than once per row
with YelpAPI(api_key) as yelp_api:
    for index, entity in tqdm(frame.iterrows(), total=frame.shape[0]):
        try:
            response = yelp_api.business_match_query(
                name=entity['name'],
                address1=entity['street'],
                city=entity['city'],
                state=entity['state'],
                country=entity['country'],
                limit=1,
                match_threshold='strict'
            )
            row = {
                'dph_id': entity['dph_id'],
                'result': response
            }
        except Exception as e:
            # Best effort: record an empty result for this entity and keep going
            row = {
                'dph_id': entity['dph_id'],
                'result': []
            }
            print(e)
        responses.append(row)
        # Short pause between consecutive requests
        time.sleep(20/1000)

part1_df = pd.DataFrame.from_records(responses)

# Take dph_id from the response row itself: `responses` lines up with the sliced
# `frame`, not with parsed_addr_df, so positional lookup in parsed_addr_df
# would pair responses with the wrong IDs.
# NOTE(review): 'yelp_resp' holds the whole response row (dph_id + result), as
# before; confirm whether only r['result'] was intended.
records = [
    {
        'dph_id': r['dph_id'],
        'yelp_resp': r
    }
    for r in responses
]

In [None]:
# Snapshot the current `responses` list as a dataframe
# (presumably run after the first batch of API requests has finished)
part1_df = pd.DataFrame.from_records(responses)

In [None]:
# Snapshot the current `responses` list as a dataframe
# (presumably run after the second batch of API requests has finished)
part2_df = pd.DataFrame.from_records(responses)

In [None]:
# Snapshot the current `responses` list as a dataframe
# (presumably run after the third batch of API requests has finished)
part3_df = pd.DataFrame.from_records(responses)

In [None]:
# Stack the per-run dataframes on top of each other, renaming the raw
# 'result' column to 'yelp_resp' in each of them
combined_df = pd.concat(
    [
        part.rename(columns={'result': 'yelp_resp'})
        for part in (part1_df, part2_df, part3_df)
    ],
    axis=0,
)

In [None]:
# Save the combined raw API responses to disk
combined_df.to_pickle('yelp_responses_all.pkl')

In [None]:
# Serialize the dataframe: flatten each raw API response into one row per
# matched restaurant (dph_id, yelp_id, yelp_name)
serialized = []

for index, row in combined_df.iterrows():
    try:
        businesses = row['yelp_resp']['businesses']
        if businesses:
            # Only the first returned business is used
            best = businesses[0]
            serialized.append({
                'dph_id': row['dph_id'],
                'yelp_id': best['id'],
                'yelp_name': best['name']
            })
    except (TypeError, KeyError) as err:
        # Failed lookups are stored as [] by the request loop, so indexing them
        # with a string raises TypeError; KeyError covers a response that lacks
        # an expected field. Report the row and move on.
        print(row, err, index)

# Passing the columns explicitly keeps the schema stable even when nothing matched
yelp_matches = pd.DataFrame.from_records(serialized, columns=['dph_id', 'yelp_id', 'yelp_name'])
yelp_matches

### Combine with matches from academic dataset

In [None]:
# Load the matches obtained from the academic dataset
other_matches = pd.read_pickle('yelp_matched1.pkl')

# Name fragments that mark schools, hospitals and other institutions
exclude = [
    'SCHOOL', 'MOBILE', 'BASE', 'CATER', 'ELEMENTARY', 'HOSPITAL', 'JAIL', 'JR. HIGH', 'REHABILITATION',
    'TREATMENT', 'REGIONAL', 'SYNAGOGUE', 'SOCIETY', 'SENIOR', 'CENTER', 'BIBLE', 'CAMPUS', 'RETIREMENT',
    'MANAGEMENT', 'MEDICINE', 'ACADEMY', 'DEPARTMENT',
]

# Keep only rows whose upper-cased name contains none of the excluded fragments
other_filtered = other_matches[
    other_matches['name'].apply(
        lambda name: not any(fragment in str(name).upper() for fragment in exclude)
    )
]

In [None]:
# Append the academic-dataset matches (business_id renamed to yelp_id) to the
# API matches, then keep only the first row seen for every yelp_id
combined_all_matches = pd.concat(
    [
        yelp_matches,
        other_filtered[['dph_id', 'business_id']].rename(columns={'business_id': 'yelp_id'}),
    ],
    axis=0,
).drop_duplicates('yelp_id')

# Store the final ID table on disk
combined_all_matches.to_pickle('all_yelp_ids.pkl')

In [1]:
import pandas as pd
# Reload the saved ID table from disk and display it
combined_all_matches = pd.read_pickle('all_yelp_ids.pkl')
combined_all_matches

Unnamed: 0,dph_id,yelp_id
0,MTIzMDIzODE=,u_irrlZiHMa7ueeUHqHtvA
1,MTIzMDI4MTI=,39r2fM5P8heVXlLgYkmSaA
2,MTI3NzEzMjA=,QjL9fgvhsJta7SSX7P8YMw
3,MTI3NzE5MzM=,c2Ta5My44Uz4tT_jrq6u5w
4,MTI3NzIwOTA=,lhxEztxsao9WfyfWuSpJfw
...,...,...
1574,Nzk1Mzg0NQ==,iJ1vqcHFhDSR-vunw7np-w
1575,OTQ4NDkzOQ==,g0KkQmGXCtxkzHWoYEQa4A
1576,ODg2NDA0,RE8bmfnqfcYEpbUHo3dYVQ
1577,Nzk1NjQzMw==,t4M0yOSVLz59H7JQi3FUSQ
