# Address missing data

In [106]:
import urllib
import urllib.request
import urllib.parse
import urllib.error
import requests
from bs4 import BeautifulSoup
import csv
import os
import pandas as pd
import clean_documents
from pathlib import Path
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from functools import reduce

In [107]:
docs_df = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'exemptions_list.csv'))

## Check TEA website for new districts of innovation

In [108]:
add_new_dois = True

In [109]:
if add_new_dois:
    url = "https://tea.texas.gov/Texas_Schools/District_Initiatives/Districts_of_Innovation/"
    webcontent = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(webcontent, 'html.parser')
    links = soup.find_all('a')
    districts_list = [i for i in map(lambda x: x.get('title'), links) 
                      if i is not None and 'ISD' in i]
    districts_df = pd.DataFrame(districts_list, columns=['title'])
    len(districts_df)
    # Make a new DataFrame from a right join on dois with list of districts
    combined_df = docs_df.merge(districts_df, how='right', left_on='title', right_on='title')

    # Save the district name and link for those districts that we don't have laws for
    missing_districts = combined_df[(combined_df['p_innovation'].isna()) |
                       (combined_df['possible_laws'] == '[]')][['title', 'link']]

    print("Number of missing districts: ", len(missing_districts))
    missing_districts.head()

Number of missing districts:  147


In [110]:
if add_new_dois == False: 
    # Save the district name and link for those districts that we don't have laws for
    missing_districts = docs_df[(docs_df['p_innovation'].isna()) |
                       (docs_df['possible_laws'] == '[]')][['title', 'link']]
    print("Number of missing districts: ", len(missing_districts))
    missing_districts.head()

## Address districts that are missing due to incorrect link.

### Where possible, incorporate previously (manually) collected links before editing

In [111]:
missing_districts = missing_districts.rename({'link': 'link_scraped'}, axis = 'columns')
missing_links = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'links_nolaws_corrected.csv'))
missing_links = missing_links.rename({'link': 'link_manual'}, axis = 'columns')
missing_links = missing_districts.merge(missing_links, how = 'left', left_on = 'title', right_on = 'title')
print("Number of missing links", len(missing_links))
missing_links['link'] = missing_links.link_manual
missing_links['link'] = missing_links['link'].fillna(missing_links['link_scraped'])
missing_links.head()

Number of missing links 147


Unnamed: 0.1,title,link_scraped,Unnamed: 0,link_manual,link
0,Wylie ISD (221912),http://www.wyliebulldogs.org/cms/One.aspx?port...,0,http://www.wyliebulldogs.org/cms/One.aspx?port...,http://www.wyliebulldogs.org/cms/One.aspx?port...
1,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,1,https://core-docs.s3.amazonaws.com/documents/a...,https://core-docs.s3.amazonaws.com/documents/a...
2,Woodson ISD,https://sites.google.com/a/woodsonisd.net/wood...,2,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...
3,Woden ISD,http://www.wodenisd.org/home/district-announce...,3,https://drive.google.com/file/d/1j-lifioD9y4Z7...,https://drive.google.com/file/d/1j-lifioD9y4Z7...
4,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,4,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...


In [112]:
missing_links = missing_links[['title', 'link']]
missing_links.to_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'links_nolaws.csv'))

### Manually edit links in links_nolaws where wrong. Save as links_nolaws_corrected.csv. Then upload.

In [113]:
missing_links_corrected = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'links_nolaws_corrected.csv'))
missing_links.tail()

Unnamed: 0,title,link
142,Anahuac ISD,https://drive.google.com/file/d/11uTSTuAcBFTdq...
143,Amarillo ISD,http://www.amaisd.org/UserFiles/Servers/Server...
144,Garner ISD,https://docs.google.com/a/garnerisd.net/viewer...
145,Temple ISD,https://4.files.edl.io/6f40/08/17/18/230315-72...
146,Yantis ISD,http://www.yantisisd.net/users/2017-2018/Distr...


### Extract text from missing_links_corrected

In [114]:
texts = []
for link in missing_links_corrected.link:
    text = clean_documents.get_plain_text(link)
    texts.append(text)

Google Error: 'd' is not in listle.com/drive/folders/0B-ALJgmFKVDFZmxBb09rRE9aQ1UD%20Final%20DOI%20Plan.pdficity/Domain/6734/2017.03.16-WacoISD-DistrictofInnovation-Exemptions-BoardApproved.pdf
Google Error: 'd' is not in listle.com/open?id=1kC4Rn_OgoK8K-s3kvUh4xBPTluPJdtpKtes/News/Documents/FINAL_DOI_PDF_FIle.pdf20Final.pdf20Innovation%20Plan%20June%2012%202017.pdf
Google Error: 'd' is not in listle.com/drive/folders/169mMXgf5g3TeRrsjYPg7fpq25FI57Ce_?usp=sharing_2017.pdfe=News018-2023.pdfistrict+of+Innovation+Plan.pdf+2018.docx5Forg%2FDocuments%2F2018%20Website%20%2D%20Required%2FDistrict%2Dof%2DInnovation%2DPlan%2D2016%2D2017%2DBoard%2DApproved%2Epdf&parent=%2Fpersonal%2Fmarcus%5Fdavis%5Fpottsboroisd%5Forg%2FDocuments%2F2018%20Website%20%2D%20Required&slrid=58be829e%2D30c5%2D6000%2D8997%2De55bfde53779
error: <urlopen error [Errno 8] nodename nor servname provided, or not known> link: https://s3.amazonaws.com/scschoolfiles/759/hpisd_doi_plan_final_6-21-2017.pdf
error: <urlopen error [

In [115]:
missing_links_corrected['text'] = texts
missing_links_corrected['text'] = missing_links_corrected['text'].fillna('No text')
missing_links_corrected = clean_documents.remove_whitespace(missing_links_corrected, text_col = 'text')
missing_links_corrected.head()

Unnamed: 0.1,Unnamed: 0,title,link,text
0,0,Wylie ISD (221912),http://www.wyliebulldogs.org/cms/One.aspx?port...,District of Innovation - Wylie Independent Sch...
1,1,Woodville ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text
2,2,Woodson ISD,https://drive.google.com/file/d/0B5Y_iCwf_UMuT...,Woodson Independent School District District o...
3,3,Woden ISD,https://drive.google.com/file/d/1j-lifioD9y4Z7...,DISTRICT OF INNOVATION PLAN Introduction The 8...
4,4,Wink-Loving ISD,https://1.cdn.edl.io/vQwTZvpBxb8PX0K2KFrGbN9Y0...,Wink-Loving ISD WLISD - The Best in Texas: Bui...


### Extract laws from missing_links_corrected

In [116]:
nlp = spacy.load('en_core_web_sm')
law_shapes = [i*'d' + '.' + j*'d' for i in range(1, 4) for j in range(3,5)]
law_shape_patterns = [[{'LIKE_NUM':True, 'SHAPE':shape}, {'ORTH':'%', 'OP':'!'}] for shape in law_shapes] # could add {'SHAPE':'§', 'OP':'*'},  and exceptions for laws in parentheses or followed by a hyphen

matcher = Matcher(nlp.vocab)
matcher.add("ExplicitLaw", None, *law_shape_patterns)

def get_matches(string):
    doc = nlp(string)
    matches = matcher(doc)
    return list(set([doc[i[1]:i[2]][0] for i in matches]))

missing_links_corrected['possible_laws'] = missing_links_corrected.text.apply(get_matches)


In [117]:
print('There are', len(missing_links_corrected[missing_links_corrected.text == 'No text']), 'documents without text (photocopies, most likely)')
print('There are', len(missing_links_corrected[missing_links_corrected.text == 'UNAVAILABLE']), 'documents which we cannot access')
print('An additional', len(missing_links_corrected[missing_links_corrected['possible_laws'].astype(str) == '[]'][missing_links_corrected.text != 'No text']), 'documents have text but do not contain laws.')
missing_text.tail(10)

There are 26 documents without text (photocopies, most likely)
There are 59 documents which we cannot access
An additional 100 documents have text but do not contain laws.


  app.launch_new_instance()


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,link,text,possible_laws
106,131,131,Brenham ISD,http://bynumisd.net/UserFiles/Servers/Server_2...,No text,[]
107,133,133,Bowie ISD,http://bowie.esc11.net//cms/lib6/TX02218883/Ce...,No text,[]
108,135,135,Benjamin ISD,http://www.benjaminisd.net/vimages/shared/vnew...,No text,[]
109,136,136,Benavides ISD,http://www.benavidesisd.net/userfiles/56/my%20...,No text,[]
110,137,137,Belton ISD,http://www.bisd.net/cms/lib02/TX01001322/Centr...,No text,[]
111,138,138,Beeville ISD,https://s3.amazonaws.com/scschoolfiles/380/bis...,No text,[]
112,139,139,Beckville ISD,https://drive.google.com/file/d/1bosBi0QjvukFz...,No text,[]
113,140,140,Aquilla ISD,https://core-docs.s3.amazonaws.com/documents/a...,No text,[]
114,141,141,Anthony ISD,http://www.anthonyisd.net/assets/aisd-district...,District of Innovation A District of Innovatio...,[]
115,143,143,Amarillo ISD,http://www.amaisd.org/UserFiles/Servers/Server...,No text,[]


### Save now correct observations to 'corrected'. Save districts that are still incorrect to missing_text.

In [118]:
corrected = missing_links_corrected[missing_links_corrected['possible_laws'].astype(str) != '[]']
print('We have added', len(corrected), 'districts.')
missing_text = missing_links_corrected[missing_links_corrected['possible_laws'].astype(str) == '[]']
print(len(missing_text), 'are still missing.')

We have added 21 districts.
126 are still missing.


### Export districts with missing or incorrect text. 

In [119]:
missing_text.to_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_missing.csv'))

## Address districts that are missing due to missing or incorrect text.

### Manually add correct text. If no text (photocopy), put 'No text' in cell. If you cannot find the document, put 'No document found'. Save as missing_text_edited.csv.

In [120]:
missing_texts_corrected = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_missing_corrected.csv'))

### Extract laws

In [121]:
missing_texts_corrected['possible_laws'] = missing_texts_corrected.text.apply(get_matches)

In [122]:
print('We have added', len(missing_texts_corrected[missing_texts_corrected['possible_laws'].astype(str) != '[]']), 'districts.')
corrected = corrected.append(missing_texts_corrected[missing_texts_corrected['possible_laws'].astype(str) != '[]'])
missing_laws = missing_texts_corrected[missing_texts_corrected['possible_laws'].astype(str) == '[]']
print(len(missing_laws), 'are still missing.')

We have added 46 districts.
92 are still missing.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


### Address districts that are photocopies

In [156]:
missing_laws[missing_laws.title == 'Brenham ISD'].link.values

array(['http://bynumisd.net/UserFiles/Servers/Server_205399/File/DOIP.pdf'],
      dtype=object)

In [None]:
woodville = [25.0811, 25.0812]
wink_loving = [25.0811, 21.003]
windthorst = [21.401, 25.0811]
willis = [25.0811,21.003, 21.102]
whitney = [25.0811]
troup = [25.0811, 21.102]
trinidad = [25.0811, 21.003, 21.057, 25.081, 21.102]
sundown = [25.0811, 25.036, 21.003, 21.044]
stanton = [25.0811, 21.003, 25.092]
skidmore_tynan = [25.0811, 25.0812, 25.081, 21.003, 21.051, 21.102, 21.458]
simms = [25.0812, 21.102, 21.003, 21.053, 25.036]
shepherd = [25.0811, 37.0012, 25.111, 25.112, 21.401, 21.102]
sheldon = [25.0811, 37.0012, 25.111, 25.112, 21.401, 21.102]
seminole = [25.081, 25.0812, 21.003, 21.102, 25.082, 37.0012, 21.401, 25.036, 45.205, 45.206 ]
scurry_rosser = [25.092, 25.0811, 21.003, 21.401]
sam_rayburn = [25.111,25.112, 25.112, .21.401, 21.003, 21.102]
s_and_s = [25.0811, 21.401, 21.002, 21.102, 25.092, 25.111, 25.112, 25.113]
rice = [25.111, 25.112, 25.113, 21.003, 21.102, 37.007, 37.010, 21.401]
rankin = [25.0811, 21.003]
pottsboro = []
post = [21.003, 25.0811, 25.0812, 25.081]
paint_rock = [11.251, 11.252, 11.253, 21.003, 21.051, 21.057, 21.102, 21.404, 21.458, 55.0811, 25.0812, 25.083, 25.092, 25.112, 25.113, 25.114, 44.901, 44.903, 25.250, 25.206, 45.208, 37.005, 37.006, 37.008, 25.036]
ore_city = [25.0811, 25.0812, 21.003, 21.057, 21.102, 25.036]
olney = [21.003, 21.102, 21.401, 25.081, 25.082, 25.0811, 25.0812, 25.036]
oakwood = [21.003, 25.0811]
newcastle = [25.0811, 21.003, 21.053]
new_summerfield = [21.401, 25.0811, 21.053, 25.112, 25.113, 25.036]
motley_county = [21.401, 25.0811]
mildred = [25.0811, 25.01812, 25.111, 25.112, 21.003, 21.401, 21.203, 21.352, 37.0012, 25.092]
magnolia = [25.0811]
mabank = [25.0811, 25.112, 21.003]
lytle = []
lovelady = [25.0811, 21.003, 21.401, 25.112, 21.102, 21.352]
loraine = [25.0811, 25.08, 21.003, 21.044, 21.053, 21.057, 21.401, 11.253, 21.404, 21.451, 21.4513, 21.458]
lohn = []
lometa = [25.0811, 25.081, 21.003, 21.044, 21.053, 21.057, 21.401, 11.253, 21.404, 21.451, 21.4513, 21.458]
livingston = [25.0811, 25.081, 25.092, 21.102, 21.003, 21.352, 25.112, 25.113]  
lindale = [25.112, 25.113, 25.0811, 21.003, 21.057, 21.003, 21.057, 25.081, 25.082, 21.102, 21.404]
lefors = [21.003, 25.0811, 25.0812, 21.401]
lazbuddie = [21.002, 21.003, 25.0811, 25.0812, 21.401]
laneville = [21.003, 21.053, 21.352, 21.3541, 21.401, 21.451, 21.458, 25.0811, 25.083, 25.092, 25.113, 37.0012, 25.036, 25.081, 25.082]
lamesa = [25.0811, 25.082, 25.082, 21.003, 21.352, 21.3541, 21.203, 21.352, 25.092, 25.112, 25.113, 21.102, 37.007, 37.010]
la_grange = [25.0811]
knippa = [21.003, 21.053, 21.04, 21.005, 25.0811, 25.0812, 25.081, 21.401, 11.251, 28.004, 21.252, 21.353, 21.354]
kennard = [25.0811]
kenedy = [25.0811, 25.0812]
itasca = [25.0811, 21.003, 21.053, 21.057, 21.102, 37.0012, 21.401, 28.004, 21.352, 21.354, 21.3541]
industrial = [25.0811, 25.112, 25.113, 21.003, 21.0401, 25.082]
huntington = [25.111, 25.113, 25.0811, 25.0812, 25.081, 21.003, 25.092]
highland_park = [21.003, 21.401, 25.0811, 25.0812, 25.0812, 25.081]
higgins = [21.003, 21.053, 21.057, 25.0811, 25.081, 25.082, 21.352, 21.3541, 25.092, 25.087, 11.253, 28.004, 25.036]
henrietta = [25.0811, 25.081, 21.003, 21.044, 21.053, 21.057, 25.111, 25.112, 25.113, 25.114, 25.092, 21.401, 11.253, 21.404, 21.451, 21.4513, 21.458]
hawley = []
haskell = [25.0811, 25.081, 21.003, 21.044, 21.053, 21.057, 21.401]
harper = [25.0811, 25.0812, 25.081, 21.102, 21.003, 21.053, 21.057, 25.112, 25.113, 25.036]
harmony = [25.0811, 25.1111, 25.112, 25.113, 21.003]
gruver = [21.003, 25.0811, 25.0812, 25.111, 25.112]
groom = [21.003, 21.053, 21.057, 25.081,, 25.0811, 25.0812, 25.082, 21.401, 21.102, 21.458, 25.092, 45.205]
grapeland = [25.0811, 25.112, 21.003, 21.053, 21.057]
gause = []
fort_elliot = [25.0811, 25.0812, 21.003, 21.401]
forestburg = [25.112, 25.113, 25.0811, 25.082, 21.003]
fairfield = [25.0811, 21.102, 21.003, 21.053, 21.203, 21.351, 21.3541, 21.401]
fabens = [25.0811, 21.003, 21.053]
ezzell = []
evadale = [25.0811, 25.0812, 25.081, 25.112, 25.113, 21.003, 21.053, 21.055, 21.057, 21.102, 21.451, 21.458]
dayton = [25.111, 25.112, 25.113, 25.0811, 25.081, 21.003, 21.102]
daingerfield_lone_star = [25.0811, 21.102, 21.003, 21.053, 21.057, 25.036, 25.112]
crawford = [25.0811, 25.112, 25.113,21.203, 21.352, 21.354, 21.401, 21.003, 11.252, 21.102, 21.002]
corrigan_camden = [25.0811, 21.003, 21.002, 21.102, 25.112, 25.113, 25.081, 25.092, 21.401, 21.352, 21.3541, 21.451, 21.458, 37.0012, 28.004]
colmesneil = [25.0811, 25.0812, 25.112, 25.113, 21.003, 21.053, 21.057, 21.102, 21.451, 21.458]
cleveland = []
china_spring = [21.003, 21.053, 21.102, 25.0811, 25.112, 25.113]
childress = [21.003, 25.0811, 25.111, 25.112, 25.113]
celina = [21.003, 25.0811, 21.102, 25.112]
cayuga = [25.0811, 25.082, 25.081,  21.102,  21.401,  21.003, 21.053]
bynum = [21.003, 25.0811, 25.0812, 25.081, 25.082, 25.036 ]
bryson = [21.003, 25.0811]
bruceville_eddy = [37.0012, 25.0811, 21.003, 21.102, 28.004]
brownwood = [21.003, 25.0811]
brownfield = []
brookesmith = [25.0811, 25.0812, 21.203, 21.352, 21.354, 21.3541, 21.044, 21.003, 21.102, 25.036, 25.092, 21.401, 25.082]
bronte = [25.0811, 21.203, 21.352, 21.044, 21.003]
brenham = [21.003, 25.0812, 25.0811, 25.081, 25.082, 25.036]
bowie = [25.0811, 25.082, 21.102, 37.0012, 25.092, 21.003, 21.053, 21.057]
benjamin = [25.092, 25.0811, 25.0812, 21.102, 21.401, 25.081, 11.251, 28.004]
benavides = [25.0811, 25.0812, 21.003, 21.053, 21.401, 25.081, 25.092, 25.112, 25.113, 45.205]
belton = [21.003, 25.092, 25.0811]
beeville = [21.003, 21.044, 25.0811, 25.0812, 21.203, 21.352, 25.081, 25.092]
beckville = [25.092, 25.082, 25.0811, 25.0812, 25.081, 37.008, 37.0082, 37.005, 37.0012, 21.404, 21.003, 21.053, 21.102, 21.211]
aquilla = []
anthony = [21.003, 25.0811]
amarillo = [25.0811, 25.0812, 25.081, 25.082]