### 1.0 Import modules

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from rapidfuzz import process, utils as fuzz_utils
# URL of the master-list Google Sheet, requested as CSV (`output=csv`);
# it is passed to pd.read_csv when the master list is loaded.
GOOGLE_SHEET_MASTER_LIST_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSWbwrsqF-c---4lfw0LZWymd-f8sy8sLYkXgzh0OyeGATWwrvv7V1Mq5BcApn7F_-WYKP1KXy5shKw/pub?gid=240503925&single=true&output=csv'


### 1.1 Load the data into data frames


In [None]:
# Restrict the read to the master-list columns named here; read_csv keeps
# them in the sheet's own column order regardless of how they are listed.
master_list_columns = [
    'id',
    'california_oid',
    'phone',
    'participating',
    'rowid',
    'school',
    'district_y',
    'address_y',
]
master_list_schools_df = pd.read_csv(
    GOOGLE_SHEET_MASTER_LIST_URL,
    usecols=master_list_columns,
)

# Show the loaded frame as the cell output.
master_list_schools_df

In [None]:
# Path to the CDE school-directory export (CSV) read below.
california_schools_data = "../data/CDESchoolDirectoryExportAugust2022.csv"

# Source header -> notebook column name. read_csv returns the `usecols`
# columns in file order (the order they are listed in is ignored), so the
# columns are renamed by header instead of being assigned positionally.
cde_column_names = {
    'CDS Code': 'oid',
    'County': 'county',
    'District': 'district',
    'School': 'name',
    'Status': 'status',
    'Closed Date': 'closed_date',
    'Website': 'website',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
    'Last Update': 'last_update',
    'Street Address': 'address',
    'Street City': 'city',
    'Phone': 'default_phone',
    'Email': 'email',
}
california_schools = pd.read_csv(
    california_schools_data,
    usecols=list(cde_column_names),
    encoding='latin',
).rename(columns=cde_column_names)

california_schools = california_schools.fillna('')
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `california_schools`.
la_schools_df = california_schools[california_schools["county"] == "Los Angeles"].copy()

la_schools_df['original_name'] = la_schools_df['name']

# Trailing parenthetical suffix: letters, whitespace, '.', '-' and '/',
# optionally containing one nested pair of parentheses.
parenthetical_suffix = r'\([a-zA-Z\s\.\-\/]*\(*[a-zA-Z\s\.\-\/]*\)*[a-zA-Z\s\.\-\/]*\)$'
# Python's `re` has no POSIX classes such as [[:punct:]] (that pattern
# effectively never matches), so punctuation is matched explicitly. The
# parenthetical suffix is removed first; otherwise its parentheses would
# already have been turned into '-' and the suffix pattern could not match.
name = (
    la_schools_df['name']
    .str.lower()
    .str.replace(parenthetical_suffix, '', regex=True)
    .str.replace(r'[^\w\s]|_', '-', regex=True)
)

la_schools_df.loc[la_schools_df['district'] == "Los Angeles Unified", 'district'] = "LAUSD"
# regex=False pins literal replacement; the default differs between pandas versions.
simple_district = (
    la_schools_df['district']
    .str.lower()
    .str.replace(" ", "-", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace('"', '', regex=False)
)
simple_address = (
    la_schools_df['address']
    .str.lower()
    .str.replace(" ", "-", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace('"', '', regex=False)
)

# Key used for fuzzy matching: address, school name, then district.
la_schools_df['simple_name'] = simple_address+"-"+name+"-"+simple_district
# Assign the result back instead of calling fillna(inplace=True) on a column selection.
la_schools_df['simple_name'] = la_schools_df['simple_name'].fillna('none')

# Strip only a trailing ".0". An unanchored ".0" removes any character
# followed by 0 whenever the pattern is interpreted as a regex.
la_schools_df['oid_key'] = la_schools_df['oid'].astype(str).str.replace(r"\.0$", "", regex=True)

la_schools_df


### 2.0 Subset and join data with California IDs


In [None]:
# Master-list rows that already carry a California school ID.
# .copy() so adding `oid_key` below does not write into a slice of the master list.
master_list_subset_with_oid = master_list_schools_df[master_list_schools_df['california_oid'].notnull()].copy()

# Build the join key as a string. Only a trailing ".0" is removed (presumably
# left by float-typed IDs); an unanchored ".0" removes any character followed
# by 0 whenever the pattern is interpreted as a regex.
master_list_subset_with_oid['oid_key'] = (
    master_list_subset_with_oid['california_oid']
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
)

# Keep only the master-list rows whose ID is present in the LA directory.
inner_join_master_list_with_schools = pd.merge(
    master_list_subset_with_oid,
    la_schools_df,
    left_on='oid_key',
    right_on='oid_key',
    how='inner',
)
inner_join_master_list_with_schools

In [None]:
# Where the master list has no phone number, use the directory's default phone.
phone_missing = inner_join_master_list_with_schools['phone'].isnull()
inner_join_master_list_with_schools.loc[phone_missing, 'phone'] = (
    inner_join_master_list_with_schools['default_phone']
)

# Columns carried forward into the combined output.
joined_output_columns = [
    'california_oid',
    'phone',
    'participating',
    'rowid',
    'school',
    'district_y',
    'address_y',
    'status',
    'latitude',
    'longitude',
    'email',
    'website',
]
inner_join_master_list_with_schools_df_to_combine = inner_join_master_list_with_schools[joined_output_columns]
inner_join_master_list_with_schools_df_to_combine

### 3.0 Subset the data with no California IDs


In [None]:
# Master-list rows without a California school ID; these are matched by fuzzy key.
# .copy() so adding `tap_key` below does not write into a slice of the master list.
master_list_subset_no_oid = master_list_schools_df[master_list_schools_df['california_oid'].isnull()].copy()

# Trailing parenthetical suffix: letters, whitespace, '.', '-' and '/',
# optionally containing one nested pair of parentheses.
parenthetical_suffix = r'\([a-zA-Z\s\.\-\/]*\(*[a-zA-Z\s\.\-\/]*\)*[a-zA-Z\s\.\-\/]*\)$'
# Python's `re` has no POSIX classes such as [[:punct:]] (that pattern
# effectively never matches), so punctuation is matched explicitly. The
# parenthetical suffix is removed first; otherwise its parentheses would
# already have been turned into '-' and the suffix pattern could not match.
name = (
    master_list_subset_no_oid['school']
    .str.replace(parenthetical_suffix, '', regex=True)
    .str.replace(r'[^\w\s]|_', '-', regex=True)
)

# regex=False pins literal replacement; the default differs between pandas versions.
# NOTE(review): simple_district is computed but is not part of the key built below.
simple_district = (
    master_list_subset_no_oid['district_y']
    .str.lower()
    .str.replace(" ", "-", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace('"', '', regex=False)
)
simple_address = (
    master_list_subset_no_oid['address_y']
    .str.lower()
    .str.replace(" ", "-", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace('"', '', regex=False)
)
# Periods and double quotes in the name were already turned into '-' above.
# After hyphenating spaces, keep the first whitespace-delimited token.
simple_name = name.str.lower().str.replace(" ", "-", regex=False).str.split().str.get(0)

# Key used for fuzzy matching: address followed by school name.
key = simple_address+"-"+simple_name
clean_key = key.str.replace("---", "-", regex=False)
# Assign the filled key directly instead of calling fillna(inplace=True) on a column selection.
master_list_subset_no_oid['tap_key'] = clean_key.fillna('none')
master_list_subset_no_oid

### 3.1 Merge the data with no IDs using `rapidfuzz`

In [None]:

def fuzzy_merge(baseFrame, compareFrame, baseKey, compareKey, threshold=86, limit=1, how='left'):
    """Merge two frames on the fuzzy match between their key columns.

    Every value in ``baseFrame[baseKey]`` is looked up among the values of
    ``compareFrame[compareKey]`` with ``process.extract``. The base key is
    replaced by the matched compare key (NaN when nothing reaches
    ``threshold``) and the frames are then merged on those keys.

    Args:
        baseFrame: Frame whose keys are looked up. It is not modified.
        compareFrame: Frame providing the candidate keys.
        baseKey: Column of ``baseFrame`` holding the strings to match.
        compareKey: Column of ``compareFrame`` holding the candidate strings.
        threshold: Passed to ``process.extract`` as ``score_cutoff``.
        limit: Passed to ``process.extract`` as ``limit``. With more than one
            match the matched keys are joined with ", ", and such a joined
            string will not equal any single compare key in the merge.
        how: Merge type passed to ``DataFrame.merge``.

    Returns:
        The merged frame. It has a ``Match`` column holding the raw
        ``process.extract`` results, and ``baseKey`` holds the matched
        compare key (or NaN).
    """
    # Original compare key -> pre-processed form. The matches are read back
    # through index 2 of each result, used below as the original compare key.
    choices = {x: fuzz_utils.default_process(x) for x in compareFrame[compareKey]}

    def _find_matches(value):
        # Both sides are pre-processed up front, hence processor=None.
        return process.extract(
            fuzz_utils.default_process(value),
            choices,
            limit=limit,
            score_cutoff=threshold,
            processor=None,
        )

    # Work on a copy: writing 'Match' and the replaced key into the caller's
    # frame would make a second call on the same frame match against the
    # already-replaced (possibly NaN) keys.
    result = baseFrame.copy()
    result['Match'] = result[baseKey].apply(_find_matches)

    matched_keys = result['Match'].apply(lambda found: ', '.join(i[2] for i in found))
    # An empty join means no candidate reached the threshold.
    result[baseKey] = matched_keys.replace("", np.nan)

    return result.merge(compareFrame, left_on=baseKey, right_on=compareKey, how=how)

# Fuzzy-match the ID-less master-list rows (tap_key) against the LA
# directory rows (simple_name), keeping every master-list row.
merged_df = fuzzy_merge(
    master_list_subset_no_oid,
    la_schools_df,
    'tap_key',
    'simple_name',
    how='left',
)
merged_df

### 3.2 Clean up the results


In [None]:
# Where the master list has no phone number, use the directory's default phone.
phone_missing = merged_df['phone'].isnull()
merged_df.loc[phone_missing, 'phone'] = merged_df['default_phone']

# Select the output columns and rename in one non-inplace step. Renaming a
# column selection with inplace=True is a chained-assignment hazard.
merged_df_to_combine = merged_df[
    ['phone','participating','rowid','school','district_y','address_y','Match','oid','status','latitude','longitude','email','website']
].rename(columns={'oid': 'california_oid'})
merged_df_to_combine


# merged_df_to_combine

### 4.0 Combining final data frame

In [None]:
# Stack the ID-joined rows and the fuzzy-matched rows. ignore_index=True
# already yields a fresh 0..n-1 index, so no extra reset_index is needed.
combined_df = pd.concat(
    [inner_join_master_list_with_schools_df_to_combine, merged_df_to_combine],
    ignore_index=True,
)
combined_df.index.names = ['id']

# Render the ID columns as strings without a trailing ".0". The pattern is
# anchored: an unanchored ".0" removes any character followed by 0 whenever
# it is interpreted as a regex.
# NOTE(review): astype(str) turns missing values into the string 'nan'.
for id_column in ('california_oid', 'rowid'):
    combined_df[id_column] = (
        combined_df[id_column].astype(str).str.replace(r"\.0$", "", regex=True)
    )
combined_df

### 5.0 Writing JSON and CSV Outputs

In [None]:
# CSV output; the index (named 'id') is written as the first column.
output_file_name = "../data/final_tap_data_post_processing_master_list.csv"
combined_df.to_csv(output_file_name)

# JSON output: one object per row.
output_json = "../src/data/schools.json"
# orient='records' output does not contain the index, and recent pandas
# releases raise ValueError when index=True is combined with this orient,
# so the argument is omitted.
# NOTE(review): if the 'id' index should appear in the JSON, call
# reset_index() before to_json.
json_file = combined_df.to_json(orient='records')
with open(output_json, 'w') as f:
    f.write(json_file)