# Address missing data

In [35]:
import urllib
import urllib.request
import urllib.parse
import urllib.error
import requests
from bs4 import BeautifulSoup
import csv
import os
import pandas as pd
import clean_documents
from pathlib import Path
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from functools import reduce
import extract_dates
from start import data_path

In [36]:
# Load the previously scraped District of Innovation exemptions table.
docs_df = pd.read_csv(Path(data_path) / 'doi_exemptions_scraped.csv')

## Check TEA website for new districts of innovation

In [37]:
# Toggle: when True, the following cell re-scrapes the TEA website for the
# current list of districts; when False, only docs_df is used.
add_new_dois = False

In [38]:
if add_new_dois:
    # Scrape the TEA Districts of Innovation page for the current district list.
    url = "https://tea.texas.gov/Texas_Schools/District_Initiatives/Districts_of_Innovation/"
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        webcontent = response.read()
    soup = BeautifulSoup(webcontent, 'html.parser')
    links = soup.find_all('a')
    # District names come from anchor `title` attributes that contain 'ISD'.
    districts_list = [title for title in (link.get('title') for link in links)
                      if title is not None and 'ISD' in title]
    districts_df = pd.DataFrame(districts_list, columns=['title'])
    # A bare expression inside an `if` body is never displayed, so print it.
    print("Number of districts listed by TEA: ", len(districts_df))
    # Make a new DataFrame from a right join on dois with list of districts
    combined_df = docs_df.merge(districts_df, how='right', on='title')

    # Save the district name and link for those districts that we don't have laws for
    # NOTE(review): districts present only in the TEA list get NaN (not '[]')
    # for `possible_laws` after the right join, so this filter does not select
    # them -- confirm whether newly listed districts should be included here.
    missing_districts = combined_df.loc[combined_df['possible_laws'] == '[]', ['title', 'link']]

    print("Number of missing districts: ", len(missing_districts))

In [39]:
if not add_new_dois:
    # Save the district name and link for those districts that we don't have laws for
    # ('[]' is the string form of an empty law list in the scraped CSV).
    missing_districts = docs_df.loc[docs_df['possible_laws'] == '[]', ['title', 'link']]
    print("Number of missing districts: ", len(missing_districts))

Number of missing districts:  283


In [40]:
# Preview the first rows of the districts that still lack laws.
missing_districts.head(n=5)

Unnamed: 0,title,link
5,Yoakum ISD,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
7,Wylie ISD (221912),http://www.wyliebulldogs.org/cms/One.aspx?port...
10,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...
11,Woodson ISD,https://sites.google.com/a/woodsonisd.net/wood...
14,Woden ISD,http://www.wodenisd.org/home/district-announce...


## Address districts that are missing due to incorrect link.

### Where possible, incorporate previously (manually) collected links before editing

In [41]:
# Keep the scraped link under its own name so it can act as a fallback.
missing_districts = missing_districts.rename(columns={'link': 'link_scraped'})
# Links collected by hand in an earlier pass, keyed by district title.
missing_links = pd.read_csv(
    os.path.join(data_path, 'missing_links_corrected.csv')
).rename(columns={'link': 'link_manual'})
missing_links = pd.merge(missing_districts, missing_links, how='left', on='title')
print("Number of missing links", len(missing_links))
# Prefer the manual link; use the scraped one where no manual link exists.
missing_links['link'] = missing_links['link_manual'].fillna(missing_links['link_scraped'])
missing_links.head(n=5)

Number of missing links 283


Unnamed: 0.1,title,link_scraped,Unnamed: 0,link_manual,link
0,Yoakum ISD,http://www.yoakumisd.net/cms/lib3/TX01001553/C...,0.0,http://www.yoakumisd.net/cms/lib3/TX01001553/C...,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
1,Wylie ISD (221912),http://www.wyliebulldogs.org/cms/One.aspx?port...,1.0,http://www.wyliebulldogs.org/UserFiles/Servers...,http://www.wyliebulldogs.org/UserFiles/Servers...
2,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,2.0,https://core-docs.s3.amazonaws.com/documents/a...,https://core-docs.s3.amazonaws.com/documents/a...
3,Woodson ISD,https://sites.google.com/a/woodsonisd.net/wood...,3.0,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...
4,Woden ISD,http://www.wodenisd.org/home/district-announce...,4.0,https://drive.google.com/file/d/1j-lifioD9y4Z7...,https://drive.google.com/file/d/1j-lifioD9y4Z7...


In [42]:
# Keep only the district name and its best-known link, then write them out
# for manual review.
missing_links = missing_links.loc[:, ['title', 'link']]
missing_links.to_csv(Path(data_path) / 'missing_links.csv')

### Manually edit links in missing_links.csv where wrong. Save as missing_links_corrected.csv. Then upload.

In [43]:
# Reload the hand-corrected link table and show its last rows.
missing_links = pd.read_csv(Path(data_path) / 'missing_links_corrected.csv')
missing_links.tail(n=5)

Unnamed: 0.1,Unnamed: 0,title,link
272,272,Anahuac ISD,https://drive.google.com/file/d/11uTSTuAcBFTdq...
273,273,Amarillo ISD,http://www.amaisd.org/UserFiles/Servers/Server...
274,274,Alvarado ISD,http://www.alvaradoisd.net/UserFiles/Servers/S...
275,275,Alto ISD,https://resources.finalsite.net/images/v154092...
276,276,Aledo ISD,https://docs.google.com/viewerng/viewer?url=ht...


### Extract text from missing_links

In [44]:
# Fetch the plain text behind every corrected link, in row order.
texts = [clean_documents.get_plain_text(url) for url in missing_links['link']]

Current link: http://www.windthorstisd.net/Uploads/50/misc/f283384.pdfqSoIFMeyfvnuo.pdfoodville_ISD_DOI_Final_Plan.pdfAREAS%20%20OF%20%20INNOVATION.pdf

2019-06-29 17:31:55,456 [MainThread  ] [WARNI]  Tika server returned status: 422


error: HTTP Error 404: Not Found link: http://www.windthorstisd.net/Uploads/50/misc/f283384.pdf
Google Error: 'd' is not in listle.com/drive/folders/0B-ALJgmFKVDFZmxBb09rRE9aQ1Uiewategories/Documents/District_of_Innovation_Timeline_&_Plan16_PDF.pdfll%201842.pdfExemptions-BoardApproved.pdf
Google Error: 'd' is not in listle.com/open?id=1kC4Rn_OgoK8K-s3kvUh4xBPTluPJdtpKtes/News/Documents/FINAL_DOI_PDF_FIle.pdf20Final.pdf7-2022_Adopted%2005.09.2017_Revised%2006.2018.docx
error: <urlopen error [Errno 8] nodename nor servname provided, or not known> link: http://www.mumford.k12.tx.us/TEAReports/Innovation%20Plan%2008%2024%202017_2.pdfhru%202021.pdfproved.pdf2F2018%20Website%20%2D%20Required%2FDistrict%2Dof%2DInnovation%2DPlan%2D2016%2D2017%2DBoard%2DApproved%2Epdf&parent=%2Fpersonal%2Fmarcus%5Fdavis%5Fpottsboroisd%5Forg%2FDocuments%2F2018%20Website%20%2D%20Required&slrid=58be829e%2D30c5%2D6000%2D8997%2De55bfde53779
Google Error: 'd' is not in listle.com/drive/folders/169mMXgf5g3TeRrsjYPg7fp

2019-06-29 17:36:20,799 [MainThread  ] [WARNI]  Tika server returned status: 422


error: HTTP Error 404: File Not Found link: http://www.ezzellisd.org/apps/pages/index.jsp?uREC_ID=199567&type=d&pREC_ID=431083
Google Error: 'd' is not in listle.com/drive/folders/0B3zKmpprcZY1OHRBcEw1d2VFcFEISD/district%20of%20innovation/CLEAR%20CREEK%20INDEPENDENT%20SCHOOL%20DISTRICT%20PLAN%20FINAL.pdf
Current link: https://docs.google.com/viewerng/viewer?url=https://www.aledoisd.org//cms/lib/TX02205721/Centricity/Domain/2005/DOI_Plan.pdfnnovation%20Plan%20(FINAL).pdf

In [45]:
# Attach the extracted text; rows with no extracted text are labelled 'No text'.
missing_links['text'] = pd.Series(texts, index=missing_links.index).fillna('No text')
missing_links.shape[0]

277

In [46]:
# Run the project's whitespace cleaner over the `text` column
# (behaviour is defined in clean_documents.remove_whitespace).
missing_links = clean_documents.remove_whitespace(missing_links, text_col='text')
missing_links.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,link,text
0,0,Yoakum ISD,http://www.yoakumisd.net/cms/lib3/TX01001553/C...,DISTRICT OF INNOVATION DISTRICT OF INNOVATION ...
1,1,Wylie ISD (221912),http://www.wyliebulldogs.org/UserFiles/Servers...,District of Innovation Introduction HB 1842 wa...
2,2,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text
3,3,Woodson ISD,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...,Woodson Independent School District District o...
4,4,Woden ISD,https://drive.google.com/file/d/1j-lifioD9y4Z7...,DISTRICT OF INNOVATION PLAN Introduction The 8...


### Extract laws from missing_links

In [47]:
nlp = spacy.load('en_core_web_sm')

# spaCy token shapes for law citations: 1-3 digits, a dot, then 3-4 digits
# (e.g. 'dd.dddd').
law_shapes = [
    '{}.{}'.format('d' * n_before, 'd' * n_after)
    for n_before in range(1, 4)
    for n_after in range(3, 5)
]

# One pattern per shape: a number-like token of that shape, followed by a
# '%' token under the '!' operator (presumably to skip percentages).
# could add {'SHAPE':'§', 'OP':'*'},  and exceptions for laws in parentheses or followed by a hyphen
law_shape_patterns = [
    [{'LIKE_NUM': True, 'SHAPE': shape}, {'ORTH': '%', 'OP': '!'}]
    for shape in law_shapes
]

matcher = Matcher(nlp.vocab)
matcher.add("ExplicitLaw", None, *law_shape_patterns)

def get_matches(string):
    """Return one spaCy token per distinct law citation found in `string`.

    The text is parsed with the module-level `nlp` pipeline and scanned with
    the module-level `matcher`. The first token of every match is the
    citation itself.

    Citations are de-duplicated by their text. Putting the Token objects in
    a set does not do this, because tokens at different positions in the
    document are distinct objects even when their text is identical.

    Parameters
    ----------
    string : str
        Plain text of a document.

    Returns
    -------
    list
        spaCy Token objects, one per distinct citation text, in order of
        first appearance. Empty when nothing matches.
    """
    doc = nlp(string)
    first_token_by_text = {}
    for _match_id, start, _end in matcher(doc):
        token = doc[start]
        # Keep only the first occurrence of each citation text.
        first_token_by_text.setdefault(token.text, token)
    return list(first_token_by_text.values())

# Extract candidate law citations from each document's text.
missing_links['possible_laws'] = missing_links['text'].apply(get_matches)

In [48]:
# Row count of the corrected-links table.
missing_links.shape[0]

277

In [49]:
# Build each condition once, on the full frame, so the masks share one index.
# Filtering first and then applying a mask built on the unfiltered frame makes
# pandas reindex the mask and emit a UserWarning.
no_text_mask = missing_links['text'] == 'No text'
unavailable_mask = missing_links['text'] == 'UNAVAILABLE'
no_laws_mask = missing_links['possible_laws'].astype(str) == '[]'

print('There are', no_text_mask.sum(), 'documents without text (photocopies, most likely)')
print('There are', unavailable_mask.sum(), 'documents which we cannot access')
print('An additional', (no_laws_mask & ~no_text_mask & ~unavailable_mask).sum(), 'documents have text but do not contain laws.')

There are 58 documents without text (photocopies, most likely)
There are 3 documents which we cannot access
An additional 60 documents have text but do not contain laws.


  app.launch_new_instance()


### Save the now-corrected observations to `missing_links_corrected`. Save the districts that still have no laws to `missing_text`.

In [50]:
# Split on whether any law was extracted ('[]' is the string form of an empty list).
links_have_laws = missing_links['possible_laws'].astype(str) != '[]'
missing_links_corrected = missing_links[links_have_laws]
missing_text = missing_links[~links_have_laws]

In [51]:
# Number of districts still without extracted laws.
missing_text.shape[0]

121

In [52]:
# Report how many districts the link corrections recovered.
print('We have added {} districts.'.format(len(missing_links_corrected)))
print('{} are still missing.'.format(len(missing_text)))

We have added 156 districts.
121 are still missing.


## Address districts that are missing due to missing or incorrect text.

### Where possible, incorporate previously (manually) collected text before editing

In [53]:
# Keep the scraped text under its own name so it can act as a fallback.
missing_text_new = missing_text.rename({'text': 'text_scraped'}, axis = 'columns')
# Text collected by hand in an earlier pass.
# NOTE(review): this file is read with encoding='latin-1' here but with the
# default encoding in a later cell -- confirm which one is correct.
missing_text_old = pd.read_csv(os.path.join(data_path, 'missing_text_corrected.csv'), encoding='latin-1')
# Rename on the frame just loaded. Renaming `missing_links` here would discard
# the manual text and merge the scraped text back onto itself.
missing_text_old = missing_text_old.rename({'text': 'text_manual'}, axis = 'columns')
# Bring in only the manual text, so the other shared columns are not
# duplicated with _x/_y suffixes.
missing_text = missing_text_new.merge(missing_text_old[['title', 'text_manual']], how = 'left', on = 'title')
print("Number of missing texts", len(missing_text))
# Prefer the manual text; use the scraped text where no manual text exists.
missing_text['text'] = missing_text['text_manual'].fillna(missing_text['text_scraped'])
missing_text.to_csv(os.path.join(data_path, 'missing_text.csv'))

Number of missing texts 121


##### Manually add correct text. If no text (photocopy), put 'No text' in cell. If you cannot find the document, put 'No document found'. Save as missing_text_corrected.csv.

In [54]:
# Reload the hand-corrected text table.
missing_texts = pd.read_csv(Path(data_path) / 'missing_text_corrected.csv')

### Extract laws

In [55]:
# Extract candidate law citations from the manually corrected text.
missing_texts['possible_laws'] = missing_texts['text'].apply(get_matches)

In [56]:
# Preview the first rows after law extraction.
missing_texts.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,link,text,possible_laws
0,2,2,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text,[]
1,5,5,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...,[]
2,6,6,Windthorst ISD,http://www.windthorstisd.net/Uploads/50/misc/f...,No text,[]
3,8,8,Willis ISD,https://tx50000123.schoolwires.net//cms/lib/TX...,"2 Approved by WISD School Board April 12, 2017...",[]
4,9,9,Whitney ISD,https://www.whitney.k12.tx.us/cms/lib3/TX01001...,No text,[]


In [57]:
# Split on whether any law was extracted from the corrected text.
texts_have_laws = missing_texts['possible_laws'].astype(str) != '[]'
missing_text_corrected = missing_texts[texts_have_laws]
missing_laws = missing_texts[~texts_have_laws]

In [58]:
# Report how many districts the text corrections recovered.
print('We have added {} districts.'.format(len(missing_text_corrected)))
print('{} are still missing.'.format(len(missing_laws)))

We have added 28 districts.
89 are still missing.


# Address districts that are photocopies

In [59]:
# Write out the districts that still have no laws, for manual entry.
missing_laws.to_csv(Path(data_path) / 'missing_laws.csv')

##### Manually add correct laws. If you cannot find the document, leave possible_laws blank. Save as missing_laws_corrected.csv.

#### Incorporate previous edits

In [60]:
# Laws entered by hand in an earlier pass, renamed so they do not collide
# with the current `possible_laws` column.
missing_laws_old = pd.read_csv(
    os.path.join(data_path, 'missing_laws_corrected.csv'), encoding='latin-1'
).rename(columns={'possible_laws': 'laws_old'})
missing_laws = pd.merge(missing_laws, missing_laws_old[['title', 'laws_old']], how='left', on='title')
print("Number of missing laws", len(missing_laws))
# Replace the (empty) extracted laws with the previously entered ones.
missing_laws['possible_laws'] = missing_laws['laws_old']
missing_laws.loc[:, ['title', 'possible_laws', 'link']].to_csv(os.path.join(data_path, 'missing_laws_and_dates.csv'))
missing_laws.head(n=5)

Number of missing laws 89


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,link,text,possible_laws,laws_old
0,2,2,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text,"[25.0811, 25.0812]","[25.0811, 25.0812]"
1,5,5,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...,"[25.0811, 21.003]","[25.0811, 21.003]"
2,6,6,Windthorst ISD,http://www.windthorstisd.net/Uploads/50/misc/f...,No text,"[21.401, 25.0811]","[21.401, 25.0811]"
3,8,8,Willis ISD,https://tx50000123.schoolwires.net//cms/lib/TX...,"2 Approved by WISD School Board April 12, 2017...","[25.0811, 21.003, 21.102]","[25.0811, 21.003, 21.102]"
4,9,9,Whitney ISD,https://www.whitney.k12.tx.us/cms/lib3/TX01001...,No text,[25.0811],[25.0811]


In [61]:
# Reload the hand-corrected laws table and show its last rows.
missing_laws_corrected = pd.read_csv(Path(data_path) / 'missing_laws_corrected.csv')
missing_laws_corrected.tail(n=5)

Unnamed: 0.1,Unnamed: 0,title,possible_laws,date,link
84,84,Beeville ISD,"[21.003, 21.044, 25.0811, 25.0812, 21.203, 21....",2017,https://s3.amazonaws.com/scschoolfiles/380/bis...
85,85,Beckville ISD,"[25.092, 25.082, 25.0811, 25.0812, 25.081, 37....",2017,https://drive.google.com/file/d/1bosBi0QjvukFz...
86,86,Aquilla ISD,"[25.0811, 25.083, 25.081, 25.082, 25.112, 25.1...",2017,https://core-docs.s3.amazonaws.com/documents/a...
87,87,Anthony ISD,"[21.003, 25.0811]",2016,http://www.anthonyisd.net/assets/aisd-district...
88,88,Amarillo ISD,"[25.0811, 25.0812, 25.081, 25.082]",2016,http://www.amaisd.org/UserFiles/Servers/Server...


# Update full dataset

In [62]:
# Index every correction table by district title so that DataFrame.update
# can align them with docs_df.
missing_links_corrected, missing_text_corrected, missing_laws_corrected = (
    frame.set_index('title')
    for frame in (missing_links_corrected, missing_text_corrected, missing_laws_corrected)
)
docs_df = docs_df.set_index('title').sort_index()

In [63]:
# Apply the corrections in order: links, then text, then laws.
# Later tables overwrite earlier ones where both have values.
for corrections in (missing_links_corrected, missing_text_corrected, missing_laws_corrected):
    docs_df.update(corrections)

In [64]:
# Count the districts whose law list is still empty after all corrections.
n_without_laws = (docs_df['possible_laws'].astype(str) == '[]').sum()
print("Only missing", n_without_laws, " of ", len(docs_df),  "- unable to find plans.")

Only missing 17  of  824 - unable to find plans.


# Save

In [65]:
# Write the updated dataset to disk. DataFrame.to_csv returns None when it is
# given a path, so the result is not bound to a name (the old binding only
# ever held None).
docs_df.to_csv(os.path.join(data_path, 'doi_exemptions_temp.csv'))