# Address missing data

In [34]:
import urllib
import urllib.request
import urllib.parse
import urllib.error
import requests
from bs4 import BeautifulSoup
import csv
import os
import pandas as pd
import clean_documents
from pathlib import Path
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from functools import reduce
from start import data_path
import extract_laws

In [35]:
# Load the scraped District of Innovation documents (one row per scraped document).
scraped_path = os.path.join(data_path, 'doi_exemptions_scraped.csv')
docs_df = pd.read_csv(scraped_path)

In [36]:
# Spot-check a single district's row.
docs_df.loc[docs_df['title'] == "Blum ISD"]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,title,level,type,link,text,p_innovation,possible_laws
757,757,351,351,351,351,Blum ISD,First,pdf,http://blumisd.net/UserFiles/Servers/Server_77...,BLUM Independent School District District of I...,0.999955,"[11.251, 21.352, 21.3541, 21.458, 21.353, 25.1..."


In [37]:
# Run extract_laws.get_laws on the stored text of the first Blum ISD row.
blum_text = docs_df.loc[docs_df.title == 'Blum ISD', 'text'].values[0]
extract_laws.get_laws(blum_text)

[11.251,
 21.451,
 21.102,
 21.352,
 21.003,
 25.113,
 25.082,
 25.092,
 25.087,
 25.083,
 29.0821,
 25.112,
 28.0214,
 25.036,
 25.084,
 28.0216,
 37.0012,
 45.206,
 25.081,
 21.458,
 21.353,
 21.354,
 21.401,
 21.3541,
 25.0811]

## Check TEA website for new districts of innovation

In [38]:
# Toggle for the two cells below: when True, the TEA Districts of Innovation page is
# scraped for the district list; when False, only the already-loaded docs_df is used.
add_new_dois = False

In [39]:
if add_new_dois:
    # Fetch the TEA "Districts of Innovation" page to get the current list of districts.
    url = "https://tea.texas.gov/Texas_Schools/District_Initiatives/Districts_of_Innovation/"
    # Use a context manager so the HTTP response is closed as soon as it has been read.
    with urllib.request.urlopen(url) as response:
        webcontent = response.read()
    soup = BeautifulSoup(webcontent, 'html.parser')
    links = soup.find_all('a')

    # District names come from the anchors' title attributes; keep those mentioning 'ISD'.
    link_titles = (link.get('title') for link in links)
    districts_list = [title for title in link_titles
                      if title is not None and 'ISD' in title]
    # Keep each district once so the join below cannot multiply rows.
    districts_df = pd.DataFrame(districts_list, columns=['title']).drop_duplicates()

    # Right join: keep exactly the districts listed by TEA, with whatever was scraped for them.
    combined_df = docs_df.merge(districts_df, how='right', on='title')

    # A district is missing when no laws were extracted ('[]') OR when it is new on the TEA
    # list and so has no scraped row at all (possible_laws is NaN after the right join).
    combined_laws = combined_df['possible_laws']
    missing_districts = combined_df.loc[
        combined_laws.isna() | (combined_laws == '[]'), ['title', 'link']
    ]

    print("Number of missing districts: ", len(missing_districts))

In [40]:
# Exact complement of the `if add_new_dois:` cell above, so one of the two cells always
# defines missing_districts, whatever falsy value the flag holds.
if not add_new_dois:
    # Save the district name and link for those districts that we don't have laws for
    missing_districts = docs_df.loc[docs_df['possible_laws'] == '[]', ['title', 'link']]
    print("Number of missing districts: ", len(missing_districts))

Number of missing districts:  276


In [41]:
# Preview the first rows of the districts for which no laws were extracted.
missing_districts.head()

Unnamed: 0,title,link
3,Zapata County ISD,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
5,Yoakum ISD,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
7,Wylie ISD (221912),http://www.wyliebulldogs.org/cms/One.aspx?port...
10,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...
11,Woodson ISD,https://sites.google.com/a/woodsonisd.net/wood...


## Address districts that are missing due to incorrect link.

### Where possible, incorporate previously (manually) collected links before editing

In [42]:
# Keep the scraped link under its own name so it can sit next to the manual fix.
missing_districts = missing_districts.rename(columns={'link': 'link_scraped'})

# Load the links that were corrected by hand on a previous pass.
corrected_links_path = os.path.join(data_path, 'missing_links_corrected.csv')
missing_links = pd.read_csv(corrected_links_path).rename(columns={'link': 'link_manual'})

# Left join: every currently-missing district, plus its manual link when one exists.
missing_links = missing_districts.merge(missing_links, how='left', on='title')
print("Number of missing links", len(missing_links))

# Prefer the manual link; where there is none, fall back to the scraped link.
missing_links['link'] = missing_links['link_manual'].fillna(missing_links['link_scraped'])
missing_links.tail(20)

Number of missing links 276


Unnamed: 0.1,title,link_scraped,Unnamed: 0,link_manual,link
256,Blue Ridge ISD,http://brisd.net/domain/355,258.0,https://docs.google.com/document/d/1jJw9_M2mC_...,https://docs.google.com/document/d/1jJw9_M2mC_...
257,Bland ISD,https://drive.google.com/file/d/0B0CSbOD5Zf1md...,259.0,https://drive.google.com/file/d/0B0CSbOD5Zf1md...,https://drive.google.com/file/d/0B0CSbOD5Zf1md...
258,Blackwell CISD,https://1.cdn.edl.io/b6KPkAODsZKMJ39nycXFAs2Gj...,260.0,https://1.cdn.edl.io/b6KPkAODsZKMJ39nycXFAs2Gj...,https://1.cdn.edl.io/b6KPkAODsZKMJ39nycXFAs2Gj...
259,Benjamin ISD,https://benjamin-isd.net/wp-content/uploads/20...,261.0,http://www.benjaminisd.net/vimages/shared/vnew...,http://www.benjaminisd.net/vimages/shared/vnew...
260,Benavides ISD,http://www.benavidesisd.net/Content2/Innovation,262.0,http://www.benavidesisd.net/userfiles/56/my%20...,http://www.benavidesisd.net/userfiles/56/my%20...
261,Belton ISD,http://www.bisd.net/cms/lib02/TX01001322/Centr...,263.0,http://www.bisd.net/cms/lib02/TX01001322/Centr...,http://www.bisd.net/cms/lib02/TX01001322/Centr...
262,Beeville ISD,https://s3.amazonaws.com/scschoolfiles/380/bis...,264.0,https://s3.amazonaws.com/scschoolfiles/380/bis...,https://s3.amazonaws.com/scschoolfiles/380/bis...
263,Beckville ISD,https://drive.google.com/embeddedfolderview?id...,265.0,https://drive.google.com/file/d/1bosBi0QjvukFz...,https://drive.google.com/file/d/1bosBi0QjvukFz...
264,Ballinger ISD,https://4.files.edl.io/0596/11/05/18/185313-05...,266.0,https://4.files.edl.io/0596/11/05/18/185313-05...,https://4.files.edl.io/0596/11/05/18/185313-05...
265,Austin ISD,https://www.austinisd.org/sites/default/files/...,267.0,https://www.austinisd.org/sites/default/files/...,https://www.austinisd.org/sites/default/files/...


In [43]:
# Keep only the columns that are edited by hand.
missing_links = missing_links[['title', 'link']]
# index=False: the positional index carries no information, and writing it out makes it
# come back as an extra 'Unnamed: 0' column every time the file is re-read and re-saved.
missing_links.to_csv(os.path.join(data_path, 'missing_links.csv'), index=False)

### Manually edit links in missing_links.csv where wrong. Save as missing_links_corrected.csv. Then upload.

In [44]:
# Reload the hand-corrected links and look at the last few rows.
corrected_links_path = os.path.join(data_path, 'missing_links_corrected.csv')
missing_links = pd.read_csv(corrected_links_path)
missing_links.tail(5)

Unnamed: 0.1,Unnamed: 0,title,link
273,273,Anahuac ISD,https://drive.google.com/file/d/11uTSTuAcBFTdq...
274,274,Amarillo ISD,http://www.amaisd.org/UserFiles/Servers/Server...
275,275,Alvarado ISD,http://www.alvaradoisd.net/UserFiles/Servers/S...
276,276,Alto ISD,https://resources.finalsite.net/images/v154092...
277,277,Aledo ISD,https://docs.google.com/viewerng/viewer?url=ht...


### Extract text from missing_links

In [45]:
# One get_plain_text call per link, in row order, so texts lines up with missing_links.
texts = [clean_documents.get_plain_text(link) for link in missing_links.link]

Current link: http://www.windthorstisd.net/Uploads/50/misc/f283384.pdfqSoIFMeyfvnuo.pdfoodville_ISD_DOI_Final_Plan.pdfAREAS%20%20OF%20%20INNOVATION.pdf

2019-06-30 14:44:42,016 [MainThread  ] [WARNI]  Tika server returned status: 422


error: HTTP Error 404: Not Found link: http://www.windthorstisd.net/Uploads/50/misc/f283384.pdf
Google Error: 'd' is not in listle.com/drive/folders/0B-ALJgmFKVDFZmxBb09rRE9aQ1UiewA_iIXKU5g/edit0Plan.pdff%20Innovation/Innovation%20Plan%20House%20Bill%201842.pdfExemptions-BoardApproved.pdf
Google Error: 'd' is not in listle.com/open?id=1kC4Rn_OgoK8K-s3kvUh4xBPTluPJdtpKtes/News/Documents/FINAL_DOI_PDF_FIle.pdf%20Plan_2017-2022_Adopted%2005.09.2017_Revised%2006.2018.docx
Google Error: 'd' is not in listle.com/open?id=1laYeq1xQHm2ZlQxMjVbDe2V2wdcCm-qFvP75DNtJO58ataid=50425&FileName=EDGEWOOD%20ISD%20Disrtict%20of%20Innovation%2010-05-17%20FINAL.pdf139420&response-cache-control=private%2C%20max-age%3D31536000&response-content-disposition=%3Bfilename%3D%22Adopted%2520DOI%2520V2%25204-2017.pdf%22&response-content-type=application%2Fpdf&Signature=UGyLWybPWLXoHZJnUXWxZQPKFgo%3D-fac0-412d-aa4f-100af8f54f2b
Google Error: 'd' is not in listle.com/drive/folders/0B3zKmpprcZY1OHRBcEw1d2VFcFE2E.pdfr%20

In [46]:
# Attach the extracted text row by row; links that produced no text get the 'No text' marker.
extracted_text = pd.Series(texts, index=missing_links.index)
missing_links['text'] = extracted_text.fillna('No text')
len(missing_links)

278

In [47]:
# Pass the text column through the project's remove_whitespace helper, then preview the tail.
missing_links = clean_documents.remove_whitespace(missing_links, text_col='text')
missing_links.tail(25)

Unnamed: 0.1,Unnamed: 0,title,link,text
253,253,Brownwood ISD,https://tx02000872.schoolwires.net//cms/lib/TX...,BROWNWOOD INDEPENDENT SCHOOL DISTRICT 2707 Sou...
254,254,Brownfield ISD,https://s3.amazonaws.com/scschoolfiles/1457/fi...,No text
255,255,Bronte ISD,http://www.bronteisd.net/storage/UserFileFolde...,No text
256,256,Brazosport ISD,https://www.brazosportisd.net/UserFiles/Server...,Brazosport Independent School District Local I...
257,257,Bowie ISD,http://bowie.esc11.net//cms/lib6/TX02218883/Ce...,No text
258,258,Blue Ridge ISD,https://docs.google.com/document/d/1jJw9_M2mC_...,Blue Ridge ISD Innovation Plan 2016-2021 HB 18...
259,259,Bland ISD,https://drive.google.com/file/d/0B0CSbOD5Zf1md...,1 | P a g e Introduction The 84th legislature ...
260,260,Blackwell CISD,https://1.cdn.edl.io/b6KPkAODsZKMJ39nycXFAs2Gj...,SCHOOL START DATE (EB LEGAL) (Texas Education ...
261,261,Benjamin ISD,http://www.benjaminisd.net/vimages/shared/vnew...,Page not found – Benjamin ISD About Us Welcome...
262,262,Benavides ISD,http://www.benavidesisd.net/userfiles/56/my%20...,No text


### Extract laws from missing_links

In [48]:
# Run the law matcher over every document's text.
missing_links['possible_laws'] = missing_links['text'].apply(extract_laws.get_matches)

In [49]:
# Row masks for the three reasons a document can end up without laws.
no_text = missing_links.text == 'No text'
unavailable = missing_links.text == 'UNAVAILABLE'
no_laws = missing_links['possible_laws'].astype(str) == '[]'

print('There are', int(no_text.sum()), 'documents without text (photocopies, most likely)')
print('There are', int(unavailable.sum()), 'documents which we cannot access')
# Combine the masks before counting: indexing a filtered frame with a full-length mask
# makes pandas reindex the mask and emit a UserWarning.
print('An additional', int((no_laws & ~no_text & ~unavailable).sum()), 'documents have text but do not contain laws.')

There are 62 documents without text (photocopies, most likely)
There are 1 documents which we cannot access
An additional 67 documents have text but do not contain laws.


  app.launch_new_instance()


### Save the now-corrected observations to `missing_links_corrected`. Save the districts that still have no laws to `missing_text`.

In [50]:
# Split on whether any laws were matched: rows with laws are fixed, the rest still need work.
links_without_laws = missing_links['possible_laws'].astype(str) == '[]'
missing_links_corrected = missing_links[~links_without_laws]
missing_text = missing_links[links_without_laws]
missing_text.head()

Unnamed: 0.1,Unnamed: 0,title,link,text,possible_laws
3,3,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text,[]
6,6,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...,[]
7,7,Windthorst ISD,http://www.windthorstisd.net/Uploads/50/misc/f...,UNAVAILABLE,[]
9,9,Willis ISD,https://tx50000123.schoolwires.net//cms/lib/TX...,"2 Approved by WISD School Board April 12, 2017...",[]
10,10,Whitney ISD,https://www.whitney.k12.tx.us/cms/lib3/TX01001...,No text,[]


In [51]:
# Number of districts that still have no extracted laws after the link fixes.
len(missing_text)

130

In [52]:
# Report how many districts the link fixes recovered and how many remain.
n_links_fixed = len(missing_links_corrected)
n_text_missing = len(missing_text)
print('We have added {} districts.'.format(n_links_fixed))
print('{} are still missing.'.format(n_text_missing))

We have added 148 districts.
130 are still missing.


## Address districts that are missing due to missing or incorrect text.

### Where possible, incorporate previously (manually) collected text before editing

In [53]:
# save as new
missing_text_new = missing_text.rename({'text': 'text_scraped'}, axis = 'columns')
# save old
missing_text_old = pd.read_csv(os.path.join(data_path, 'missing_text_corrected.csv'), encoding='latin-1')
missing_text_old = missing_text_old.rename({'text': 'text_manual'}, axis = 'columns')

# merge new and old
missing_text = missing_text_new[['title', 'link', 'text_scraped']].merge(missing_text_old[['title', 'text_manual']], how = 'left', left_on = 'title', right_on = 'title')
print("Number of missing texts", len(missing_text))

# set text equal to text manual
missing_text['text'] = missing_text.text_manual

# replace text to text scraped if text is missing
missing_text['text'] = missing_text['text'].fillna(missing_text['text_scraped'])

missing_text = missing_text[['title', 'link', 'text']]
missing_text.to_csv(os.path.join(data_path, 'missing_text.csv'))

Number of missing texts 130


##### Manually add correct text. If no text (photocopy), put 'No text' in cell. If you cannot find the document, put 'No document found'. Save as missing_text_corrected.csv.

In [54]:
# Reload the hand-corrected text and run it through the same whitespace helper as before.
corrected_text_path = os.path.join(data_path, 'missing_text_corrected.csv')
missing_texts = clean_documents.remove_whitespace(pd.read_csv(corrected_text_path), 'text')
missing_texts.head(20)

Unnamed: 0.1,Unnamed: 0,title,link,text
0,0,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text
1,1,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...
2,2,Windthorst ISD,http://www.windthorstisd.net/Uploads/50/misc/f...,No text
3,3,Willis ISD,https://tx50000123.schoolwires.net//cms/lib/TX...,"2 Approved by WISD School Board April 12, 2017..."
4,4,Whitney ISD,https://www.whitney.k12.tx.us/cms/lib3/TX01001...,No text
5,5,Whitharral ISD,https://docs.google.com/document/d/1QpBqSAmJNZ...,Whitharral ISD District of Innovation Ã ¢Comm...
6,6,White Oak ISD,https://docs.google.com/document/d/1otlNz4M2pp...,A DISTRICT PLAN FOR INNOVATION & LOCAL CONTROL...
7,7,West Sabine ISD,http://www.westsabineisd.net/page/open/921/0/2...,No text
8,8,Waco ISD,https://docs.google.com/viewerng/viewer?url=ht...,WACO INDEPENDENT SCHOOL DISTRICT DISTRICT OF I...
9,9,Valentine ISD,http://www.valentineisd.com/search?orgId=14e1e...,Valentine ISD District of Innovation Plan Augu...


### Extract laws

In [55]:
# Run the law matcher over the hand-corrected text.
missing_texts['possible_laws'] = missing_texts['text'].apply(extract_laws.get_matches)

In [56]:
# Preview the first rows, now including the possible_laws column.
missing_texts.head(20)

Unnamed: 0.1,Unnamed: 0,title,link,text,possible_laws
0,0,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text,[]
1,1,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...,[]
2,2,Windthorst ISD,http://www.windthorstisd.net/Uploads/50/misc/f...,No text,[]
3,3,Willis ISD,https://tx50000123.schoolwires.net//cms/lib/TX...,"2 Approved by WISD School Board April 12, 2017...",[]
4,4,Whitney ISD,https://www.whitney.k12.tx.us/cms/lib3/TX01001...,No text,[]
5,5,Whitharral ISD,https://docs.google.com/document/d/1QpBqSAmJNZ...,Whitharral ISD District of Innovation Ã ¢Comm...,"[21.057, 21.057, 39.054, 28.004, 21.203, 21.35..."
6,6,White Oak ISD,https://docs.google.com/document/d/1otlNz4M2pp...,A DISTRICT PLAN FOR INNOVATION & LOCAL CONTROL...,"[21.003, 21.003, 21.003, 25.0811, 25.092, 22.0..."
7,7,West Sabine ISD,http://www.westsabineisd.net/page/open/921/0/2...,No text,[]
8,8,Waco ISD,https://docs.google.com/viewerng/viewer?url=ht...,WACO INDEPENDENT SCHOOL DISTRICT DISTRICT OF I...,"[25.112, 25.085, 25.0915, 25.092, 25.094, 25.0..."
9,9,Valentine ISD,http://www.valentineisd.com/search?orgId=14e1e...,Valentine ISD District of Innovation Plan Augu...,"[21.354, 21.353, 25.036, 25.036, 25.081, 21.10..."


In [57]:
# Split on whether any laws were matched in the corrected text.
texts_without_laws = missing_texts['possible_laws'].astype(str) == '[]'
missing_text_corrected = missing_texts[~texts_without_laws]
missing_laws = missing_texts[texts_without_laws]

In [58]:
# Report how many districts the text fixes recovered and how many remain.
n_text_fixed = len(missing_text_corrected)
n_laws_missing = len(missing_laws)
print('We have added {} districts.'.format(n_text_fixed))
print('{} are still missing.'.format(n_laws_missing))

We have added 39 districts.
92 are still missing.


# Address districts that are photocopies

#### Incorporate previous edits

In [59]:
# Import old manual edits
missing_laws_old = pd.read_csv(os.path.join(data_path, 'missing_laws_corrected.csv'), encoding='latin-1')
missing_laws_old = missing_laws_old.rename({'possible_laws': 'laws_old'}, axis = 'columns')

# Merge with new missing obs
missing_laws = missing_laws.merge(missing_laws_old[['title',  'laws_old']], how = 'left', left_on = 'title', right_on = 'title')
print("Number of missing laws", len(missing_laws))

missing_laws['possible_laws'] = missing_laws.laws_old
missing_laws = missing_laws[['title', 'possible_laws','link']]
missing_laws.to_csv(os.path.join(data_path, 'missing_laws.csv'))
missing_laws.head()

Number of missing laws 92


Unnamed: 0,title,possible_laws,link
0,Woodville ISD,"[25.0811, 25.0812]",https://core-docs.s3.amazonaws.com/documents/a...
1,Wink-Loving ISD,"[25.0811, 21.003]",https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...
2,Windthorst ISD,"[21.401, 25.0811]",http://www.windthorstisd.net/Uploads/50/misc/f...
3,Willis ISD,"[25.0811, 21.003, 21.102]",https://tx50000123.schoolwires.net//cms/lib/TX...
4,Whitney ISD,[25.0811],https://www.whitney.k12.tx.us/cms/lib3/TX01001...


In [60]:
# Preview the districts whose laws have to be entered by hand.
missing_laws.head()

Unnamed: 0,title,possible_laws,link
0,Woodville ISD,"[25.0811, 25.0812]",https://core-docs.s3.amazonaws.com/documents/a...
1,Wink-Loving ISD,"[25.0811, 21.003]",https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...
2,Windthorst ISD,"[21.401, 25.0811]",http://www.windthorstisd.net/Uploads/50/misc/f...
3,Willis ISD,"[25.0811, 21.003, 21.102]",https://tx50000123.schoolwires.net//cms/lib/TX...
4,Whitney ISD,[25.0811],https://www.whitney.k12.tx.us/cms/lib3/TX01001...


##### Manually add correct laws. If you cannot find the document, leave possible_laws blank. Save as missing_laws_corrected.csv.

In [61]:
# Reload the hand-entered laws and look at the last few rows.
corrected_laws_path = os.path.join(data_path, 'missing_laws_corrected.csv')
missing_laws_corrected = pd.read_csv(corrected_laws_path)
missing_laws_corrected.tail()

Unnamed: 0.1,Unnamed: 0,title,possible_laws,link
89,89,Beckville ISD,"[25.092, 25.082, 25.0811, 25.0812, 25.081, 37....",https://drive.google.com/file/d/1bosBi0QjvukFz...
90,90,Aquilla ISD,"[25.0811, 25.083, 25.081, 25.082, 25.112, 25.1...",https://core-docs.s3.amazonaws.com/documents/a...
91,91,Anthony ISD,"[21.003, 25.0811]",http://www.anthonyisd.net/assets/aisd-district...
92,92,Amarillo ISD,"[25.0811, 25.0812, 25.081, 25.082]",http://www.amaisd.org/UserFiles/Servers/Server...
93,93,Alvarado ISD,,http://www.alvaradoisd.net/UserFiles/Servers/S...


# Update full dataset

In [62]:
# Index every frame by district title so that DataFrame.update can align rows on it.
docs_df = docs_df.set_index('title').sort_index()
missing_links_corrected = missing_links_corrected.set_index('title')
missing_text_corrected = missing_text_corrected.set_index('title')
missing_laws_corrected = missing_laws_corrected.set_index('title')

In [63]:
# Apply the corrections in order (links, then text, then laws); later frames overwrite
# earlier ones where they overlap.
for corrections in (missing_links_corrected, missing_text_corrected, missing_laws_corrected):
    docs_df.update(corrections)
docs_df.sample(10)

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,p_innovation,possible_laws
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Center ISD,695.0,312,312,312,312,First,pdf,http://www.centerisd.org/upload/page/0063/docs...,Districts of Innovation may be exempt from sta...,0.999955,"[25.082, 21.057, 21.003, 25.0811, 25.081]"
Hallettsville ISD,507.0,1156,1156,1156,1266,Second,pdf,https://1.cdn.edl.io/FvsO0HQMGiiTwMHhA1rbf3N0m...,HJISD District of Innovation 2017-2021 HALLETT...,0.999955,"[21.055, 25.081, 21.458, 21.003, 25.113, 21.05..."
Lamesa ISD,46.0,1983,1983,1983,2188,Second,pdf,https://1.cdn.edl.io/8gWq2fZYq15StydveMhAEuuFu...,Not a plan,0.024562,"[25.0811, 25.082, 21.003, 21.352, 21.3541, 21...."
Douglass ISD,609.0,15,15,15,15,First,google,https://drive.google.com/file/d/1_OzEr7LnngRcQ...,Douglass ISD District of Innovation Plan (HB 1...,0.999955,"[21.352, 21.3541, 21.102, 21.203, 25.0811, 21...."
Lovelady ISD,40.0,1179,1179,1179,1292,Second,pdf,http://www.loveladyisd.net/files/user/3/file/L...,Not a plan,0.007645,"[25.0811, 21.003, 21.401, 25.112, 21.102, 21.352]"
Mansfield ISD,350.0,986,986,986,1075,Second,pdf,https://www.mansfieldisd.org/uploaded/main/abo...,Roscoe Collegiate ISD District of Innovation P...,0.999936,"[25.0811, 25.111, 21.352, 21.003, 25.113, 25.112]"
Venus ISD,60.0,2848,2848,2848,3161,Second,pdf,http://www.venusisd.net/uploads/2/6/5/9/265913...,Venus Independent School District Local Innova...,0.999903,"[25.113, 25.111, 21.352, 21.003, 21.401, 21.10..."
Calhoun County ISD,713.0,3878,3878,3878,278,html,html,http://www.calcoisd.org/common/pages/DisplayFi...,CCISD District of Innovation Plan Committee Vo...,0.999955,"[25.082, 21.057, 21.003, 21.053, 25.081, 25.0811]"
San Saba ISD,163.0,291,291,291,291,First,pdf,https://www.san-saba.net/pdf/Revised_San_Saba_...,Microsoft Word - Preliminary Revision San Saba...,0.944715,"[25.0811, 21.0003, 21.102, 21.003]"
Royse City ISD,176.0,3810,3810,3810,209,html,html,https://www.rcisd.org/doi/,District of Innovation | Royse City ISD Home D...,0.999955,"[102.1307, 25.0811, 12.0522, 28.0216, 21.401, ..."


In [64]:
# Print the stored laws for Blum ISD after the update, one row per line.
blum_laws = docs_df.loc[docs_df.index == "Blum ISD", 'possible_laws']
for law in blum_laws:
    print(law)

[11.251, 21.352, 21.3541, 21.458, 21.353, 25.113, 25.082, 25.0811, 25.083, 25.092, 25.036, 28.0216, 29.0821, 25.087, 28.0214, 37.0012, 45.206, 25.112, 21.451, 21.354, 21.102, 21.401, 21.003, 25.084, 25.081]


In [65]:
# Count the districts that still have no laws after all corrections.
still_missing = docs_df['possible_laws'].astype(str) == '[]'
n_still_missing = len(docs_df[still_missing])
print("Only missing", n_still_missing, " of ", len(docs_df), "- unable to find plans.")

Only missing 13  of  824 - unable to find plans.


# Save

In [66]:
# Write the updated dataset; the index (district title) is written as the first column.
output_path = os.path.join(data_path, 'doi_exemptions_temp.csv')
docs_df.to_csv(output_path)