# Update DOIs

In [1]:
import urllib
import urllib.request
import urllib.parse
import urllib.error

import numpy as np
import pandas as pd
import os


import requests
from bs4 import BeautifulSoup
import csv

from pathlib import Path
import spacy

from start import data_path
import gather_documents
import clean_documents
import extract_laws
import extract_dates

In [2]:
# Load the cleaned exemptions/dates table, report its row count, and
# spot-check one district to confirm the file parsed as expected.
exemptions_csv = os.path.join(data_path, 'doi_exemptions_and_dates.csv')
docs_df = pd.read_csv(exemptions_csv)
print(len(docs_df))
docs_df.loc[docs_df.title == "South San Antonio ISD"]

824


Unnamed: 0.2,Unnamed: 0,title,Unnamed: 0_x,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,p_innovation,possible_laws,Unnamed: 0_y,doi_date
693,693,South San Antonio ISD,130.0,614,614,614,640,Second,pdf,https://www.southsanisd.net//cms/lib/TX0191831...,South San Antonio Independent School District ...,0.999955,"[25.0811, 21.003, 25.112, 25.1113, 21.051, 25....",130,2017-08-01


In [3]:
# Previously processed districts whose stored law list is empty.
# possible_laws comes back from the CSV as text, hence the comparison
# against the literal string '[]' rather than an empty list.
no_laws_mask = docs_df.possible_laws == '[]'
missing_laws = docs_df.loc[no_laws_mask]
missing_laws_list = missing_laws.title.tolist()
missing_laws_list

['Bronte\xa0ISD',
 'Burleson ISD',
 'Crosbyton CISD',
 'Dimmitt ISD',
 'Gregory-Portland ISD',
 'Henrietta ISD',
 'La Joya ISD',
 'Liberty Hill ISD',
 'Marathon ISD',
 'Midway ISD (161903) ']

In [4]:
# import cleaned dates

## Check TEA website for new districts of innovation

In [5]:
# Scrape the TEA Districts of Innovation page and collect the `title`
# attribute of every anchor whose title contains "ISD".
url = "https://tea.texas.gov/Texas_Schools/District_Initiatives/Districts_of_Innovation/"
# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read (the bare urlopen(...).read() left it open).
with urllib.request.urlopen(url) as response:
    webcontent = response.read()
soup = BeautifulSoup(webcontent, 'html.parser')
links = soup.find_all('a')
# Anchors without a title attribute yield None and are skipped.
link_titles = (anchor.get('title') for anchor in links)
districts_list = [title for title in link_titles
                  if title is not None and 'ISD' in title]
print("Number of districts on DOI website", len(districts_list))

Number of districts on DOI website 899


In [6]:
# Districts listed on the TEA page that are not yet in docs_df.
# Build the lookup set once instead of re-creating list(docs_df.title)
# for every scraped district.
# NOTE(review): matching is by exact string; stored titles such as
# 'Bronte\xa0ISD' contain non-breaking or trailing spaces, so confirm the
# scraped titles use the same spelling.
known_titles = set(docs_df.title)
new_districts_list = [dist for dist in districts_list if dist not in known_titles]

print("Number of new districts: ", len(new_districts_list))

Number of new districts:  78


In [7]:
# Districts to (re)process: the newly listed ones plus the existing ones
# whose law list was empty, gathered into a one-column frame of titles.
# NOTE(review): new_districts_list is rebound here, so re-running this cell
# without re-running the previous one appends missing_laws_list again.
new_districts_list = [*new_districts_list, *missing_laws_list]
new_df = pd.DataFrame(new_districts_list, columns=['title'])
print(new_df.shape[0])

88


# Gather

In [8]:
# Gather the document links found directly on the TEA page.
first_level_links = gather_documents.FirstLevelLinks(url, print_interim=False)
# Move the frame's index into a regular column named 'title'
# (presumably the index holds the district names -- confirm in gather_documents).
links_by_index = first_level_links.docs_df.reset_index()
first_level_df = links_by_index.rename(columns={'index': 'title'})
print(len(first_level_df))
first_level_df



  soup = BeautifulSoup(html)


355


Unnamed: 0,title,link,type
0,Denver City ISD,http://www.dcisd.org/cms/lib011/TX01917797/Cen...,docx
1,Marion ISD,http://www.marionisd.net/upload/page/0020/DofI...,docx
2,Post ISD,https://1.cdn.edl.io/I2a9qAWX4QHUGdG5HRdQP38Ja...,docx
3,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,pdf
4,Beeville ISD,https://s3.amazonaws.com/scschoolfiles/380/bis...,pdf
...,...,...,...
350,Red Lick ISD,https://drive.google.com/file/d/1frzMb5ZIGyOzk...,google
351,Roby CISD,https://docs.google.com/document/d/1H8XRcRNhFn...,google
352,Tidehaven ISD,https://drive.google.com/drive/folders/1qk4W6P...,google
353,Whitehouse ISD,https://docs.google.com/document/d/1lxtD2uHrmK...,google


In [9]:
# Attach the gathered link/type to each district that needs processing.
# The left join keeps every row of new_df; districts with no gathered
# document get NaN in the joined columns.
new_df = new_df.merge(first_level_df, how='left')
print(len(new_df))
# Save for manual correction. index=False keeps the positional index out of
# the file so it does not come back as an "Unnamed: 0" column when re-read.
new_df.to_csv(os.path.join(data_path, 'update_links.csv'), index=False)

88


In [10]:
# Load the corrected link list (presumably the hand-edited copy of
# update_links.csv) and keep only the district title and its link.
corrected_links_csv = os.path.join(data_path, 'update_links_corrected.csv')
updated_df = pd.read_csv(corrected_links_csv).loc[:, ['title', 'link']]

## Extract

### Clean Text

In [11]:
# Fetch the plain text behind every link. A link that is not a string
# (e.g. NaN from an empty CSV cell) is recorded as "no link" instead.
texts = [
    clean_documents.get_plain_text(link) if isinstance(link, str) else "no link"
    for link in updated_df.link
]
updated_df['text'] = texts
# Replace every remaining missing value with 'No text'.
# NOTE(review): fillna runs on the whole frame, so a missing link is also
# replaced by 'No text', not only a missing text.
updated_df = updated_df.fillna('No text')
updated_df

Current link: https://www.libertyhill.txed.net/domain/294ers/Server_420297/File/Henrietta%20Ind%20School%20District/District%20Information/Mandatory%20Postings/District%20of%20Innovation/District%20of%20Innovation.pdf

Unnamed: 0,title,link,text
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,Baird ISD,No text,no link
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text
...,...,...,...
81,Henrietta ISD,http://www.henrietta-isd.net/UserFiles/Servers...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
82,La Joya ISD,https://www.lajoyaisd.com/362783_3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
83,Liberty Hill ISD,https://www.libertyhill.txed.net/domain/294,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
84,Marathon ISD,No text,no link


In [12]:
# Normalise whitespace in the extracted text (judging by the helper's name;
# see clean_documents.remove_whitespace for the exact rules), then preview.
updated_df = clean_documents.remove_whitespace(updated_df, text_col='text')
updated_df.head(5)

Unnamed: 0,title,link,text
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...
3,Baird ISD,No text,no link
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text


## Extract laws

In [13]:
# Run the law extractor over each plan's text and store its result per district,
# then eyeball a random sample of ten rows.
updated_df['possible_laws'] = updated_df['text'].apply(extract_laws.get_laws)
updated_df.sample(n=10)

Unnamed: 0,title,link,text,possible_laws
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...,"[37.0012, 25.0811, 25.036, 21.003, 21.401, 25...."
3,Baird ISD,No text,no link,[]
79,Dimmitt ISD,https://core-docs.s3.amazonaws.com/documents/a...,District of Innovation Dimmitt ISD Local Innov...,"[25.0812, 21.003, 21.401, 25.0811]"
66,San Perlita ISD,https://s3.amazonaws.com/scschoolfiles/1769/do...,Microsoft Word - DOI Plan - cover San Perlita ...,"[11.251, 21.057, 21.003, 21.053, 21.401, 25.03..."
72,Southland ISD,https://img1.wsimg.com/blobby/go/d31ac773-fbc2...,Southland Independent School District District...,"[45.206, 45.205, 45.204, 21.102, 21.003, 25.08..."
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...,[]
83,Liberty Hill ISD,https://www.libertyhill.txed.net/domain/294,District of Innovation / District of Innovatio...,[]
46,Meridian ISD,https://core-docs.s3.amazonaws.com/documents/a...,Meridian ISD District of Innovation Plan Intro...,"[37.01, 37.007, 37.105, 25.087, 21.102, 21.003..."
43,Madisonville ISD,https://4.files.edl.io/4b72/07/15/20/194259-bb...,No text,[]
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text,[]


## Extract dates

### Term

In [14]:
# Directory of the saved spaCy pipeline used for the term-date step.
classifier_dir = os.path.join(data_path, 'date_term_classifier')
# Loading here confirms the pipeline can be read; the extraction call below
# is given classifier_dir rather than this nlp object.
nlp = spacy.load(classifier_dir)

In [15]:
# Run the term-date extractor once per plan; each call returns
# (year, month, phrase, probability).
term_results = [
    extract_dates.get_term_date_and_phrase(doc_text, classifier_dir)
    for doc_text in updated_df.text
]
# Split the per-document tuples into one list per output column.
start_dates = [year for (year, _month, _phrase, _p) in term_results]
months = [month for (_year, month, _phrase, _p) in term_results]
date_phrases = [phrase for (_year, _month, phrase, _p) in term_results]
p_terms = [p for (_year, _month, _phrase, p) in term_results]
updated_df['term_year'] = start_dates
updated_df['term_month'] = months
updated_df['term_phrase'] = date_phrases
updated_df['term_p'] = p_terms

In [16]:
# Inspect the frame now that the term_* columns have been added.
updated_df

Unnamed: 0,title,link,text,possible_laws,term_year,term_month,term_phrase,term_p
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...,"[37.0012, 25.0811, 25.036, 21.003, 21.401, 25....",2020,August,"is for five years, beginning August, 2020 and ...",0.999955
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text,[],-999,,,0.000000
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...,[],2019,,"of Innovation Plan 2019-2024 House Bill 1842, ...",0.999630
3,Baird ISD,No text,no link,[],-999,,,0.000000
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text,[],-999,,,0.000000
...,...,...,...,...,...,...,...,...
81,Henrietta ISD,http://www.henrietta-isd.net/UserFiles/Servers...,District of Innovation Resolution Board of Edu...,[],7515,,Assistant Secretary 720-7910(940) 720-7900 Fax...,0.595147
82,La Joya ISD,https://www.lajoyaisd.com/362783_3,La Joya ISD - Curriculum and Evaluation Skip t...,[],-999,,AEIS Public Hearing Report 2010-2011 AEIS Publ...,0.999955
83,Liberty Hill ISD,https://www.libertyhill.txed.net/domain/294,District of Innovation / District of Innovatio...,[],2020,,Updated) | Terms of Use Copyright © 2002-2020 ...,0.999955
84,Marathon ISD,No text,no link,[],-999,,,0.000000


## finalize date

In [17]:
# Directory of the saved spaCy pipeline used for the finalize-date step.
finalize_classifier = os.path.join(data_path, 'date_finalize_classifier')
# Rebinds nlp to the finalize pipeline; the extraction calls below are given
# the directory path rather than this object.
nlp = spacy.load(finalize_classifier)

In [18]:
# Smoke-test the finalize-date extractor on a single hand-written sentence.
sample_sentence = 'We will hold a school board meeting on May 5, 1991'
test = extract_dates.get_finalize_month_year_phrase(sample_sentence, finalize_classifier)
test

(-999, '', '', 0)

In [19]:
# Run the finalize-date extractor once per plan; each call returns
# (year, month, phrase, probability).
finalize_results = [
    extract_dates.get_finalize_month_year_phrase(doc_text, finalize_classifier)
    for doc_text in updated_df.text
]
# Split the per-document tuples into one list per output column.
finalize_years = [year for (year, _month, _phrase, _p) in finalize_results]
finalize_months = [month for (_year, month, _phrase, _p) in finalize_results]
finalize_phrases = [phrase for (_year, _month, phrase, _p) in finalize_results]
finalize_p = [p for (_year, _month, _phrase, p) in finalize_results]
updated_df['finalize_year'] = finalize_years
updated_df['finalize_month'] = finalize_months
updated_df['finalize_phrase'] = finalize_phrases
updated_df['finalize_p'] = finalize_p

In [20]:
# Inspect the frame with both the term_* and finalize_* columns.
updated_df

Unnamed: 0,title,link,text,possible_laws,term_year,term_month,term_phrase,term_p,finalize_year,finalize_month,finalize_phrase,finalize_p
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...,"[37.0012, 25.0811, 25.036, 21.003, 21.401, 25....",2020,August,"is for five years, beginning August, 2020 and ...",0.999955,-999,,for all allowable TEC requirements under the H...,0.966373
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text,[],-999,,,0.000000,-999,,,0.000000
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...,[],2019,,"of Innovation Plan 2019-2024 House Bill 1842, ...",0.999630,2019,,"a period of 5 years, from August 2019 to July ...",0.999716
3,Baird ISD,No text,no link,[],-999,,,0.000000,-999,,,0.000000
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text,[],-999,,,0.000000,-999,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
81,Henrietta ISD,http://www.henrietta-isd.net/UserFiles/Servers...,District of Innovation Resolution Board of Edu...,[],7515,,Assistant Secretary 720-7910(940) 720-7900 Fax...,0.595147,2016,December,". Adopted this 8th day of December, 2016, by t...",0.998921
82,La Joya ISD,https://www.lajoyaisd.com/362783_3,La Joya ISD - Curriculum and Evaluation Skip t...,[],-999,,AEIS Public Hearing Report 2010-2011 AEIS Publ...,0.999955,2018,,2008-2009 School Report Cards Report Cards 201...,0.981234
83,Liberty Hill ISD,https://www.libertyhill.txed.net/domain/294,District of Innovation / District of Innovatio...,[],2020,,Updated) | Terms of Use Copyright © 2002-2020 ...,0.999955,2017,February,"public hearing was held on February 1, 2017 to...",0.999955
84,Marathon ISD,No text,no link,[],-999,,,0.000000,-999,,,0.000000


In [21]:
# Pick one date per district: use the term-date extraction when its
# probability is at least 0.9, otherwise fall back to the finalize-date one.
term_is_confident = updated_df.term_p >= .9
term_is_doubtful = updated_df.term_p < .9

# (target column, term source, finalize source)
date_sources = [
    ('date_year', 'term_year', 'finalize_year'),
    ('date_month', 'term_month', 'finalize_month'),
    ('date_phrase', 'term_phrase', 'finalize_phrase'),
]

# Fill from the term columns first, then from the finalize columns.
for target_col, term_col, _ in date_sources:
    updated_df.loc[term_is_confident, target_col] = updated_df[term_col]

for target_col, _, finalize_col in date_sources:
    updated_df.loc[term_is_doubtful, target_col] = updated_df[finalize_col]

updated_df.head()

Unnamed: 0,title,link,text,possible_laws,term_year,term_month,term_phrase,term_p,finalize_year,finalize_month,finalize_phrase,finalize_p,date_year,date_month,date_phrase
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...,"[37.0012, 25.0811, 25.036, 21.003, 21.401, 25....",2020,August,"is for five years, beginning August, 2020 and ...",0.999955,-999,,for all allowable TEC requirements under the H...,0.966373,2020.0,August,"is for five years, beginning August, 2020 and ..."
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text,[],-999,,,0.0,-999,,,0.0,-999.0,,
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...,[],2019,,"of Innovation Plan 2019-2024 House Bill 1842, ...",0.99963,2019,,"a period of 5 years, from August 2019 to July ...",0.999716,2019.0,,"of Innovation Plan 2019-2024 House Bill 1842, ..."
3,Baird ISD,No text,no link,[],-999,,,0.0,-999,,,0.0,-999.0,,
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text,[],-999,,,0.0,-999,,,0.0,-999.0,,


In [22]:
# Keep only the columns that go into the review file; the probability and
# date_* columns are dropped here.
output_columns = [
    'title', 'link', 'text', 'possible_laws',
    'term_year', 'term_month', 'term_phrase',
    'finalize_year', 'finalize_month', 'finalize_phrase',
]
updated_df = updated_df.loc[:, output_columns]

### update dates and laws

In [23]:
# Save the extracted laws and dates for manual review.
# index=False keeps the positional index out of the file; writing it is what
# produces an extra "Unnamed: 0" column each time the CSV is read back in.
updated_df.to_csv(os.path.join(data_path, 'update_dates_and_laws.csv'), index=False)

In [None]:
# select dates

In [None]:
# Derive doi_year: take term_year where present, otherwise finalize_year.
# NOTE(review): fixed_dates is not defined in any visible cell, so this cell
# raises NameError on a fresh kernel -- confirm where it should be loaded.
has_term_year = pd.notnull(fixed_dates.term_year)
fixed_dates['doi_year'] = np.nan
fixed_dates.loc[has_term_year, 'doi_year'] = fixed_dates.term_year
fixed_dates.loc[~has_term_year, 'doi_year'] = fixed_dates.finalize_year
fixed_dates.sample(10)

Open `update_dates_and_laws.csv` and save a copy as `update_dates_and_laws_corrected.csv`. In the copy, correct the term month and year, the finalize month and year, and the laws. If a value is not available in the plan, delete the `-999` placeholder and leave the cell blank (for dates) or leave an empty list (for laws).

# import and append

In [10]:
# Load the manually corrected laws/dates for the (re)processed districts.
new_df = pd.read_csv(os.path.join(data_path,'update_dates_and_laws_corrected.csv'))
# Drop the old rows for districts that were re-processed because their law
# list was empty, so they are not duplicated by the corrected rows.
old_df = docs_df.loc[~docs_df.title.isin(missing_laws_list)]
# DataFrame.append was deprecated and then removed in pandas 2.0;
# pd.concat with default arguments stacks the rows the same way.
final_df = pd.concat([old_df, new_df])
final_df.sample(5)

Unnamed: 0.2,Unnamed: 0,title,Unnamed: 0_x,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,...,p_innovation,possible_laws,Unnamed: 0_y,doi_date,term_year,term_month,term_phrase,finalize_year,finalize_month,finalize_phrase
453,453,Lometa ISD,140.0,19.0,19.0,19.0,19.0,First,google,https://drive.google.com/file/d/1DqPY5G88CnmpB...,...,0.999955,"[25.114, 25.111, 25.081, 21.102, 21.354, 21.00...",370.0,2018-01-01,,,,,,
798,798,Whitharral ISD,5.0,282.0,282.0,282.0,282.0,First,google,https://docs.google.com/document/d/1QpBqSAmJNZ...,...,0.999795,"[37.008, 25.081, 25.036, 21.003, 21.3541, 21.2...",25.0,2018-05-01,,,,,,
1,1,Abernathy ISD,822.0,820.0,820.0,820.0,878.0,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,...,0.999904,"[25.113, 37.0012, 28.214, 21.003, 21.057, 25.0...",822.0,2017-01-01,,,,,,
74,74,Van Vleck ISD,,,,,,,,https://core-docs.s3.amazonaws.com/documents/a...,...,,"[25.0811, 28.0214, 21.057, 21.053, 25.092, 21....",,,2019.0,August,"for five years, beginning August 1, 2019 and e...",2019.0,March,"January 25, 2019 Approval by VVISD Board: Marc..."
406,406,Knippa ISD,60.0,7.0,7.0,7.0,7.0,First,google,https://docs.google.com/viewer?a=v&pid=sites&s...,...,4.5e-05,"[21.003, 21.053, 21.04, 21.005, 25.0811, 25.08...",417.0,2016-08-01,,,,,,


In [25]:
# Stack the retained old rows and the corrected new rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
final_df = pd.concat([old_df, new_df])

In [26]:
# Row count of the combined frame.
len(final_df)

900