# Jupyter Notebook for converting the Go Pass Data


## 1.0 Load in modules

We are using pandas and the numpy libraries to read and process the data.

Then we will use the `rapidfuzz` library to caclulate close matches to the strings so that we can join the information.

Using other popular fuzzy-string matching libraries like `fuzzball` takes over 3 hours to process as the alogrithim used takes the number of records to the `n`th power!! (i.e. comparing 1000 records processes 1000^1000!!!) `rapidfuzz` has a better algorithm that trims this town to 3 - 4 minutes!


In [37]:
import pandas as pd
import numpy as np
from datetime import datetime
from rapidfuzz import process, utils as fuzz_utils

GOOGLE_SHEET_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSWbwrsqF-c---4lfw0LZWymd-f8sy8sLYkXgzh0OyeGATWwrvv7V1Mq5BcApn7F_-WYKP1KXy5shKw/pub?gid=376323488&single=true&output=csv'

## 2.0 Format Go Pass Data

We will be extracting the following fields:

- District
- School
- Address
- City
- Participating

We create a simplfied field called `name` based on the `street address`, `school name`, and `district`. This field is formatted to be all lowercase with spaces replaced with `-` to make matching easier.

In [38]:
go_pass_schools = pd.read_csv(GOOGLE_SHEET_URL,
    usecols={'district_x','original_name_x','address_x','city_x','phone','email'})
go_pass_schools.columns = ["district","name","address","city","phone","email"]

go_pass_schools = go_pass_schools.fillna('')
go_pass_schools['original_name'] = go_pass_schools['name']
simple_district = go_pass_schools['district'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
# go_pass_schools['name'] = go_pass_schools['name'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')+"-"+simple_district
simple_address = go_pass_schools['address'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
simple_name = go_pass_schools['name'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
go_pass_schools.loc[(go_pass_schools['name'].str.len() > 1),'name'] = simple_address+"-"+simple_name+"-"+simple_district
go_pass_schools


  simple_district = go_pass_schools['district'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
  simple_address = go_pass_schools['address'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
  simple_name = go_pass_schools['name'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')


Unnamed: 0,district,name,address,city,phone,email,original_name
0,Centinela Valley,4859-west-el-segundo-blvd-hawthorne-high-centi...,4859 West El Segundo Blvd.,Hawthorne,(310) 263-4400,No Data,Hawthorne High
1,Centinela Valley,14901-s-inglewood-avenue-lawndale-high-centine...,14901 S. Inglewood Avenue,Lawndale,(310) 263-3100,No Data,Lawndale High
2,Centinela Valley,4118-west-rosecrans-ave-leuzinger-high-centine...,4118 West Rosecrans Ave.,Lawndale,(310) 263-2200,No Data,Leuzinger High
3,Centinela Valley,4951-marine-ave-r-k-lloyde-high-centinela-valley,4951 Marine Ave.,Lawndale,(310) 263-3264,No Data,R. K. Lloyde High
4,Charter,"461-9th-street,-san-pedro-alliance-alice-m-bax...","461 9th Street, San Pedro",San Pedro,(310) 221-0430,,Alliance Alice M. Baxter College-Ready High Sc...
...,...,...,...,...,...,...,...
1480,Private,1253-bishops-road-cathedral-high-school-private,1253 Bishops Road,Los Angeles,(323) 441-3113,brjohnm@chsla.org,Cathedral High School
1481,Private,6361-santa-monica-blvd-episcopal-school-of-los...,6361 Santa Monica Blvd.,Los Angeles,(323) 284-7266,registrar@es-la.com,Episcopal School of Los Angeles
1482,Private,9650-zelzah-ave-northpoint-school-private,9650 Zelzah Ave.,Northridge,(818) 739-5231,,Northpoint School
1483,Santa Monica-Malibu,,,,,,


### 2.1 Format California Schools data

We create a simplfied field called `simple name` based on the `street address`, `school name`, and `district`. This field is formatted to be all lowercase with spaces replaced with `-` to make matching easier.

#### Source for California schools data

https://www.cde.ca.gov/SchoolDirectory/ExportSelect?simpleSearch=N&address=&city=&counties=&districts=&cdscode=&charter=&magnet=&name=&nps=&search=2&zip=&yearround=&status=1%2C2&types=&order=1&multilingual=&qsc=3549&qdc=3549

August 2022


In [39]:
california_schools_data = "../data/TapData.csv"

california_schools = pd.read_csv(california_schools_data)
    # usecols={'school','district','street','city','state',"zip"})
# california_schools.columns = ["school","district","address","city","state","zip"]

la_schools_df = california_schools.fillna('')
# la_schools_df

la_schools_df['original_name'] = la_schools_df['school']
la_schools_df['name'] = la_schools_df['school'].replace({'\([a-zA-Z\s\.\-\/]*\(*[a-zA-Z\s\.\-\/]*\)*[a-zA-Z\s\.\-\/]*\)$'},'',regex=True)
# la_schools_df.loc[la_schools_df['district'] == "Los Angeles Unified", 'district'] = "LAUSD"
simple_district = la_schools_df['district'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
simple_address = la_schools_df['street'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
simple_name = la_schools_df['school'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')+"-"
la_schools_df['simple_name'] = simple_address+"-"+simple_name+"-"+simple_district


la_schools_df

  simple_district = la_schools_df['district'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
  simple_address = la_schools_df['street'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')
  simple_name = la_schools_df['school'].str.lower().str.replace(" ","-").str.replace(".","").str.replace('"','')+"-"


Unnamed: 0,rowid,school,district,street,city,state,zip,original_name,name,simple_name
0,1,Palms Elementary (ABC Unified),ABC Unified,12445 East 207th St.,Lakewood,CA,90715-1619,Palms Elementary (ABC Unified),Palms Elementary,12445-east-207th-st-palms-elementary-(abc-unif...
1,2,Ross (Faye) Middle (ABC Unified),ABC Unified,17707 Elaine Ave.,Artesia,CA,90701-4018,Ross (Faye) Middle (ABC Unified),Ross (Faye) Middle,17707-elaine-ave-ross-(faye)-middle-(abc-unifi...
2,3,Stowers(Cecil B.) Elementary (ABC Unified),ABC Unified,13350 Beach St.,Cerritos,CA,90703-1331,Stowers(Cecil B.) Elementary (ABC Unified),Stowers(Cecil B.) Elementary,13350-beach-st-stowers(cecil-b)-elementary-(ab...
3,4,Tetzlaff (Martin B.) Middle (ABC Unified),ABC Unified,12351 East Del Amo Blvd.,Cerritos,CA,90703-7635,Tetzlaff (Martin B.) Middle (ABC Unified),Tetzlaff (Martin B.) Middle,12351-east-del-amo-blvd-tetzlaff-(martin-b)-mi...
4,5,Tracy (Wilbur) High (Continuation) (ABC Unified),ABC Unified,12222 Cuesta Dr.,Cerritos,CA,90703-2801,Tracy (Wilbur) High (Continuation) (ABC Unified),Tracy (Wilbur) High (Continuation),12222-cuesta-dr-tracy-(wilbur)-high-(continuat...
...,...,...,...,...,...,...,...,...,...,...
3146,3147,YouthBuild - Norwalk (YouthBuild Charter Schoo...,YouthBuild Charter Schools of Los Angeles,12124 Front St.,Norwalk,CA,90650,YouthBuild - Norwalk (YouthBuild Charter Schoo...,YouthBuild - Norwalk,12124-front-st-youthbuild---norwalk-(youthbuil...
3147,3148,YouthBuild - Palmdale (YouthBuild Charter Scho...,YouthBuild Charter Schools of Los Angeles,38626 9th St. East,Palmdale,CA,93550,YouthBuild - Palmdale (YouthBuild Charter Scho...,YouthBuild - Palmdale,38626-9th-st-east-youthbuild---palmdale-(youth...
3148,3149,YouthBuild - Pomona (YouthBuild Charter School...,YouthBuild Charter Schools of Los Angeles,305 E Arrow Hwy.,Pomona,CA,91767,YouthBuild - Pomona (YouthBuild Charter School...,YouthBuild - Pomona,305-e-arrow-hwy-youthbuild---pomona-(youthbuil...
3149,3150,YouthBuild - South LA (YouthBuild Charter Scho...,YouthBuild Charter Schools of Los Angeles,400 West Washington Blvd.,Los Angeles,CA,90015,YouthBuild - South LA (YouthBuild Charter Scho...,YouthBuild - South LA,400-west-washington-blvd-youthbuild---south-la...


In [40]:
# unique_schools = go_pass_schools.drop_duplicates(subset=['original_name'])
# print(go_pass_schools.shape)
# print(unique_schools.shape)

# difference_in_dupes = go_pass_schools.shape[0] - unique_schools.shape[0]
# # these are the number of records with the same names
# print("Number of records with same names: \n "+str(difference_in_dupes))


## 3.0 Joining the `Original dataset` to the `California dataset`

Here we do a blanket `left` merge where the original records get data added to it.

If we want to keep the California data, then we need to switch this merge type to `inner` or `right`.

#### See the Pandas merge documentation for more information:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge.html

In [41]:
def fuzzy_merge(baseFrame, compareFrame, baseKey, compareKey, threshold=86, limit=1, how='left'):
    s_mapping = {x: fuzz_utils.default_process(x) for x in compareFrame[compareKey]}

    m1 = baseFrame[baseKey].apply(lambda x: process.extract(
      fuzz_utils.default_process(x), s_mapping, limit=limit, score_cutoff=threshold, processor=None
    ))
    baseFrame['Match'] = m1

    m2 = baseFrame['Match'].apply(lambda x: ', '.join(i[2] for i in x))
    baseFrame['name'] = m2.replace("",np.nan)

    # baseFrame['school'] = m2['school']
    return baseFrame.merge(compareFrame, left_on='name', right_on=compareKey, how=how)

merged_df = fuzzy_merge(go_pass_schools, la_schools_df, 'name', 'simple_name',how='left')
# merged_df = fuzzy_merge(go_pass_schools, la_schools_df, 'original_name', 'name',how='left')
# merged_df = fuzzy_merge(la_schools_df, go_pass_schools, 'name', 'original_name',how='right')
merged_df


Unnamed: 0,district_x,name_x,address,city_x,phone,email,original_name_x,Match,rowid,school,district_y,street,city_y,state,zip,original_name_y,name_y,simple_name
0,Centinela Valley,4859-west-el-segundo-blvd-hawthorne-high-(cent...,4859 West El Segundo Blvd.,Hawthorne,(310) 263-4400,No Data,Hawthorne High,[(4859 west el segundo blvd hawthorne high ce...,306.0,Hawthorne High (Centinela Valley Union High),Centinela Valley Union High,4859 West El Segundo Blvd.,Hawthorne,CA,90250-4204,Hawthorne High (Centinela Valley Union High),Hawthorne High,4859-west-el-segundo-blvd-hawthorne-high-(cent...
1,Centinela Valley,,14901 S. Inglewood Avenue,Lawndale,(310) 263-3100,No Data,Lawndale High,[],,,,,,,,,,
2,Centinela Valley,4118-west-rosecrans-ave-leuzinger-high-(centin...,4118 West Rosecrans Ave.,Lawndale,(310) 263-2200,No Data,Leuzinger High,[(4118 west rosecrans ave leuzinger high cent...,308.0,Leuzinger High (Centinela Valley Union High),Centinela Valley Union High,4118 West Rosecrans Ave.,Lawndale,CA,90260-1601,Leuzinger High (Centinela Valley Union High),Leuzinger High,4118-west-rosecrans-ave-leuzinger-high-(centin...
3,Centinela Valley,4951-marine-ave-r-k-lloyde-high-(centinela-val...,4951 Marine Ave.,Lawndale,(310) 263-3264,No Data,R. K. Lloyde High,[(4951 marine ave r k lloyde high centinela v...,310.0,R. K. Lloyde High (Centinela Valley Union High),Centinela Valley Union High,4951 Marine Ave.,Lawndale,CA,90260-1251,R. K. Lloyde High (Centinela Valley Union High),R. K. Lloyde High,4951-marine-ave-r-k-lloyde-high-(centinela-val...
4,Charter,,"461 9th Street, San Pedro",San Pedro,(310) 221-0430,,Alliance Alice M. Baxter College-Ready High Sc...,[],,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,Private,,1253 Bishops Road,Los Angeles,(323) 441-3113,brjohnm@chsla.org,Cathedral High School,[],,,,,,,,,,
1481,Private,,6361 Santa Monica Blvd.,Los Angeles,(323) 284-7266,registrar@es-la.com,Episcopal School of Los Angeles,[],,,,,,,,,,
1482,Private,,9650 Zelzah Ave.,Northridge,(818) 739-5231,,Northpoint School,[],,,,,,,,,,
1483,Santa Monica-Malibu,,,,,,,[],,,,,,,,,,


## 4.0 Cleaning up of Merged Data

The steps are as follows:
1. Assign the full name the original name and the district
   - Note: you can also use the California district names instead by uncommenting the `Altenative` line
2. Process alternative method of naming where district names are only added if there are duplicates
3. Set the final `phone` column to default to the original data's `telephone` column
4. Get the match score from rapidfuzz

In [42]:
merged_df['rowid'].fillna(0, inplace=True)
merged_df['rowid'] = merged_df['rowid'].astype('int64')

In [43]:
# 1. Default name processing
# merged_df['full_name'] = merged_df['original_name_x'] + ' (' + merged_df['district_x'] + ')'
merged_df.loc[merged_df['rowid'] == 0, 'school'] = merged_df['original_name_x'] + ' (' + merged_df['district_x'] + ')'
merged_df.loc[merged_df['rowid'] == 0, 'district'] = merged_df['district_x']
merged_df['participating'] = True
# 1b. alternative approach: uncomment below to use the California district names instead
# merged_df['full_name']  = merged_df['original_name_x'] + ' (' + merged_df['district_y'] + ')'

# 2. Alternative school name field processing
# Flag duplicated records
merged_df['duped'] = merged_df.duplicated(['original_name_x'],keep=False)

# `full_name_some` is the option where there are district names for only duplicated records.
# merged_df.loc[merged_df['duped'] == False, 'full_name_some'] = merged_df['original_name_x']
# merged_df.loc[merged_df['duped'] == True, 'full_name_some'] = merged_df['original_name_x'] + ' (' + merged_df['district_y'] + ')'

# 3. Phone number processing
# merged_df['phone'] = merged_df['telephone']

# if there is less than 4 characters in that column then set it to the California's phone numbers i.e. 'default phone' column
# merged_df.loc[merged_df['phone'].str.len() < 4, 'phone'] = merged_df['default_phone']

# 4. Match score processing
merged_df['score'] = merged_df['Match'].astype('string').str.split(",").str[1]

merged_df

Unnamed: 0,district_x,name_x,address,city_x,phone,email,original_name_x,Match,rowid,school,...,city_y,state,zip,original_name_y,name_y,simple_name,district,participating,duped,score
0,Centinela Valley,4859-west-el-segundo-blvd-hawthorne-high-(cent...,4859 West El Segundo Blvd.,Hawthorne,(310) 263-4400,No Data,Hawthorne High,[(4859 west el segundo blvd hawthorne high ce...,306,Hawthorne High (Centinela Valley Union High),...,Hawthorne,CA,90250-4204,Hawthorne High (Centinela Valley Union High),Hawthorne High,4859-west-el-segundo-blvd-hawthorne-high-(cent...,,True,False,88.42105263157895
1,Centinela Valley,,14901 S. Inglewood Avenue,Lawndale,(310) 263-3100,No Data,Lawndale High,[],0,Lawndale High (Centinela Valley),...,,,,,,,Centinela Valley,True,False,
2,Centinela Valley,4118-west-rosecrans-ave-leuzinger-high-(centin...,4118 West Rosecrans Ave.,Lawndale,(310) 263-2200,No Data,Leuzinger High,[(4118 west rosecrans ave leuzinger high cent...,308,Leuzinger High (Centinela Valley Union High),...,Lawndale,CA,90260-1601,Leuzinger High (Centinela Valley Union High),Leuzinger High,4118-west-rosecrans-ave-leuzinger-high-(centin...,,True,False,88.36363636363637
3,Centinela Valley,4951-marine-ave-r-k-lloyde-high-(centinela-val...,4951 Marine Ave.,Lawndale,(310) 263-3264,No Data,R. K. Lloyde High,[(4951 marine ave r k lloyde high centinela v...,310,R. K. Lloyde High (Centinela Valley Union High),...,Lawndale,CA,90260-1251,R. K. Lloyde High (Centinela Valley Union High),R. K. Lloyde High,4951-marine-ave-r-k-lloyde-high-(centinela-val...,,True,False,88.125
4,Charter,,"461 9th Street, San Pedro",San Pedro,(310) 221-0430,,Alliance Alice M. Baxter College-Ready High Sc...,[],0,Alliance Alice M. Baxter College-Ready High Sc...,...,,,,,,,Charter,True,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,Private,,1253 Bishops Road,Los Angeles,(323) 441-3113,brjohnm@chsla.org,Cathedral High School,[],0,Cathedral High School (Private),...,,,,,,,Private,True,False,
1481,Private,,6361 Santa Monica Blvd.,Los Angeles,(323) 284-7266,registrar@es-la.com,Episcopal School of Los Angeles,[],0,Episcopal School of Los Angeles (Private),...,,,,,,,Private,True,False,
1482,Private,,9650 Zelzah Ave.,Northridge,(818) 739-5231,,Northpoint School,[],0,Northpoint School (Private),...,,,,,,,Private,True,False,
1483,Santa Monica-Malibu,,,,,,,[],0,(Santa Monica-Malibu),...,,,,,,,Santa Monica-Malibu,True,True,


## 5.0 Final Column Selection
Here we select the final data columns for our outputs, with `_x` suffixes representing the original Metro dataset and `_y` suffixes representing the California schools dataset.

- "district_x"
- "district_y"
- "original_name_x"
- "original_name_y"
- "status"
- 'closed_date'
- "full_name"
- "full_name_some"
- "participating"
- "address_x"
- "address_y"
- "city_x"
- "city_y"
- 'address_y'
- 'city_y'
- "score"
- "duped"
- 'phone'
- 'email'
- 'website'
- 'latitude'
- 'longitude'
- 'last_update'

In [44]:
# final_columns = {
#     "full_name":"school_name",
#     "full_name_some":"school_name_with_some_districts_attached"
# }

# final_df = merged_df[["oid","district_x","district_y","original_name_x","original_name_y","status",'closed_date',
#        "full_name","full_name_some","participating","address_x","address_y","city_x","city_y","score","duped",'phone', 'email',
#        'website', 'latitude', 'longitude', 'last_update']]
final_df = merged_df

# final_df["address"] = final_df["address_x"] 


# final_df.rename(inplace=True)
# final_df.reset_index(inplace=True)
final_df.index.names = ['id']

final_df

Unnamed: 0_level_0,district_x,name_x,address,city_x,phone,email,original_name_x,Match,rowid,school,...,city_y,state,zip,original_name_y,name_y,simple_name,district,participating,duped,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Centinela Valley,4859-west-el-segundo-blvd-hawthorne-high-(cent...,4859 West El Segundo Blvd.,Hawthorne,(310) 263-4400,No Data,Hawthorne High,[(4859 west el segundo blvd hawthorne high ce...,306,Hawthorne High (Centinela Valley Union High),...,Hawthorne,CA,90250-4204,Hawthorne High (Centinela Valley Union High),Hawthorne High,4859-west-el-segundo-blvd-hawthorne-high-(cent...,,True,False,88.42105263157895
1,Centinela Valley,,14901 S. Inglewood Avenue,Lawndale,(310) 263-3100,No Data,Lawndale High,[],0,Lawndale High (Centinela Valley),...,,,,,,,Centinela Valley,True,False,
2,Centinela Valley,4118-west-rosecrans-ave-leuzinger-high-(centin...,4118 West Rosecrans Ave.,Lawndale,(310) 263-2200,No Data,Leuzinger High,[(4118 west rosecrans ave leuzinger high cent...,308,Leuzinger High (Centinela Valley Union High),...,Lawndale,CA,90260-1601,Leuzinger High (Centinela Valley Union High),Leuzinger High,4118-west-rosecrans-ave-leuzinger-high-(centin...,,True,False,88.36363636363637
3,Centinela Valley,4951-marine-ave-r-k-lloyde-high-(centinela-val...,4951 Marine Ave.,Lawndale,(310) 263-3264,No Data,R. K. Lloyde High,[(4951 marine ave r k lloyde high centinela v...,310,R. K. Lloyde High (Centinela Valley Union High),...,Lawndale,CA,90260-1251,R. K. Lloyde High (Centinela Valley Union High),R. K. Lloyde High,4951-marine-ave-r-k-lloyde-high-(centinela-val...,,True,False,88.125
4,Charter,,"461 9th Street, San Pedro",San Pedro,(310) 221-0430,,Alliance Alice M. Baxter College-Ready High Sc...,[],0,Alliance Alice M. Baxter College-Ready High Sc...,...,,,,,,,Charter,True,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,Private,,1253 Bishops Road,Los Angeles,(323) 441-3113,brjohnm@chsla.org,Cathedral High School,[],0,Cathedral High School (Private),...,,,,,,,Private,True,False,
1481,Private,,6361 Santa Monica Blvd.,Los Angeles,(323) 284-7266,registrar@es-la.com,Episcopal School of Los Angeles,[],0,Episcopal School of Los Angeles (Private),...,,,,,,,Private,True,False,
1482,Private,,9650 Zelzah Ave.,Northridge,(818) 739-5231,,Northpoint School,[],0,Northpoint School (Private),...,,,,,,,Private,True,False,
1483,Santa Monica-Malibu,,,,,,,[],0,(Santa Monica-Malibu),...,,,,,,,Santa Monica-Malibu,True,True,


In [45]:
final_df.columns

Index(['district_x', 'name_x', 'address', 'city_x', 'phone', 'email',
       'original_name_x', 'Match', 'rowid', 'school', 'district_y', 'street',
       'city_y', 'state', 'zip', 'original_name_y', 'name_y', 'simple_name',
       'district', 'participating', 'duped', 'score'],
      dtype='object')

## 5.0 Final Output
Using today's date and the csv file extension we will output the file to the data directory.

We also split the data using `to_json` in pandas, more info here:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

In [46]:
today = str(datetime.now().strftime("%Y-%m"))
outfile_extension = ".csv"
output_file_name = "../data/tap_data_merged_with_metro_california_qc_data_"+today+outfile_extension

final_df.to_csv(output_file_name)

#create JSON file oriented by records
output_json = "../src/data/schools.json"
json_file = final_df.to_json(orient='records',index=True) 
with open(output_json, 'w') as f:
    f.write(json_file)

In [47]:
# Done!