# Update DOIs

In [1]:
import urllib
import urllib.request
import urllib.parse
import urllib.error

import numpy as np
import pandas as pd
import os


import requests
from bs4 import BeautifulSoup
import csv

from pathlib import Path
import spacy

from start import data_path
import gather_documents
import clean_documents
import extract_laws
import extract_dates

In [2]:
# import cleaned exemptions list
docs_df = pd.read_csv(os.path.join(data_path,'doi_exemptions_list.csv'))
print(len(docs_df))
docs_df[docs_df.title == "South San Antonio ISD"]

824


Unnamed: 0.2,title,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,p_innovation,possible_laws
693,South San Antonio ISD,130.0,614,614,614,640,Second,pdf,https://www.southsanisd.net//cms/lib/TX0191831...,South San Antonio Independent School District ...,0.999955,"[25.0811, 21.003, 25.112, 25.1113, 21.051, 25...."


In [3]:
# old districts with missing laws
missing_laws = docs_df[docs_df.possible_laws == '[]']
len(missing_laws)
missing_laws_list = list(missing_laws.title)
missing_laws_list

['Bronte\xa0ISD',
 'Burleson ISD',
 'Crosbyton CISD',
 'Dimmitt ISD',
 'Gregory-Portland ISD',
 'Henrietta ISD',
 'La Joya ISD',
 'Liberty Hill ISD',
 'Marathon ISD',
 'Midway ISD (161903) ']

In [4]:
# import cleaned dates

## Check TEA website for new districts of innovation

In [5]:
url = "https://tea.texas.gov/Texas_Schools/District_Initiatives/Districts_of_Innovation/"
webcontent = urllib.request.urlopen(url).read()
soup = BeautifulSoup(webcontent, 'html.parser')
links = soup.find_all('a')
districts_list = [i for i in map(lambda x: x.get('title'), links) 
                  if i is not None and 'ISD' in i]
print("Number of districts on DOI website", len(districts_list))

Number of districts on DOI website 898


In [6]:
new_districts_list = []
for dist in districts_list:
    if dist not in list(docs_df.title):
        new_districts_list.append(dist)

print("Number of new districts: ", len(new_districts_list))

Number of new districts:  78


In [7]:
new_districts_list = new_districts_list + missing_laws_list
new_df = pd.DataFrame(new_districts_list, columns=['title'])
print(len(new_df))

88


# Gather

In [9]:
first_level_links = gather_documents.FirstLevelLinks(url, print_interim=False)
first_level_df = first_level_links.docs_df.reset_index().rename(columns={'index': 'title'})
print(len(first_level_df))
first_level_df



  soup = BeautifulSoup(html)


391


Unnamed: 0,title,link,type
0,Denver City ISD,http://www.dcisd.org/cms/lib011/TX01917797/Cen...,docx
1,Marion ISD,http://www.marionisd.net/upload/page/0020/DofI...,docx
2,Post ISD,https://1.cdn.edl.io/I2a9qAWX4QHUGdG5HRdQP38Ja...,docx
3,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,pdf
4,Beeville ISD,https://s3.amazonaws.com/scschoolfiles/380/bis...,pdf
...,...,...,...
386,Valentine ISD,https://drive.google.com/file/d/1xmFIMZZfvcs4u...,google
387,Vega ISD,https://drive.google.com/file/d/1ecypqBHsR1b4-...,google
388,Whitehouse ISD,https://docs.google.com/document/d/1lxtD2uHrmK...,google
389,Whitharral ISD,https://docs.google.com/document/d/1QpBqSAmJNZ...,google


In [10]:
new_df = new_df.merge(first_level_df, how = 'left')
print(len(new_df))
new_df
new_df.to_csv(os.path.join(data_path, 'update_links.csv'))

88


In [2]:
updated_df = pd.read_csv(os.path.join(data_path,'update_links_corrected.csv'))
updated_df = updated_df[['title', 'link']]

## Extract

### Clean Text

In [8]:
texts = []
for link in updated_df.link:
    if not isinstance(link, str):
        texts.append("no link")
    elif isinstance(link, str):
        text = clean_documents.get_plain_text(link)
        texts.append(text)
updated_df['text'] = texts
updated_df = updated_df.fillna('No text') # replace None with 'None'
updated_df

Current link: https://www.libertyhill.txed.net/domain/294ers/Server_420297/File/Henrietta%20Ind%20School%20District/District%20Information/Mandatory%20Postings/District%20of%20Innovation/District%20of%20Innovation.pdf

Unnamed: 0,title,link,text
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,Baird ISD,,no link
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,
...,...,...,...
81,Henrietta ISD,http://www.henrietta-isd.net/UserFiles/Servers...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
82,La Joya ISD,https://www.lajoyaisd.com/362783_3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
83,Liberty Hill ISD,https://www.libertyhill.txed.net/domain/294,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
84,Marathon ISD,,no link


In [11]:
updated_df = clean_documents.remove_whitespace(updated_df, text_col = 'text')
updated_df.head()

Unnamed: 0,title,link,text
0,Alba-Golden ISD,https://core-docs.s3.amazonaws.com/documents/a...,ALBA-GOLDEN ISD District of Innovation Plan In...
1,Amherst ISD,https://irp-cdn.multiscreensite.com/c65082d6/f...,No text
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...
3,Baird ISD,No text,no link
4,Bartlett ISD,http://www.bartlett.txed.net/UserFiles/Servers...,No text


## Extract laws

In [13]:
updated_df['possible_laws'] = updated_df.text.apply(extract_laws.get_laws)
updated_df.sample(10)

Unnamed: 0,title,link,text,possible_laws
5,Bellville ISD,http://www.bellvilleisd.org/UserFiles/Servers/...,Bellville ISD Bellville ISD INNOVATION PLAN Au...,"[11.252, 11.251, 11.253, 25.111, 25.036, 21.00..."
51,Plains ISD,http://plainsisd.ss8.sharpschool.com/UserFiles...,SKM_C45819042310060,[]
43,Madisonville ISD,https://4.files.edl.io/4b72/07/15/20/194259-bb...,No text,[]
82,La Joya ISD,https://www.lajoyaisd.com/362783_3,La Joya ISD - Curriculum and Evaluation Skip t...,[]
63,San Benito CISD,https://4.files.edl.io/8647/04/12/19/202300-83...,SAN BENITO CISD DISTRICT OF INNOVATION PLAN Ap...,"[25.0811, 21.102]"
2,Anderson-Shiro CISD,https://drive.google.com/file/d/1MkjdvYCX6GN6l...,Anderson-Shiro CISD District of Innovation Pla...,[]
33,Ingleside ISD,https://core-docs.s3.amazonaws.com/documents/a...,Microsoft Word - District of Innovation Ingles...,"[37.007, 37.0012, 25.0811, 44.902, 25.036, 21...."
21,Driscoll ISD,http://images.pcmac.org/Uploads/DriscollISD/Dr...,Driscoll Independent School District Driscoll ...,"[25.113, 37.0012, 37.008, 11.251, 11.253, 45.2..."
50,Orange Grove ISD,https://4.files.edl.io/2fe3/04/01/19/135556-fa...,No text,[]
71,Somerset ISD,https://4.files.edl.io/1c8f/04/15/20/141606-5c...,Somerset ISD DOI Plan 2020-2025 1 | P a g e SO...,"[45.206, 45.205, 45.204, 28.0214, 21.102, 21.0..."


## Extract dates

### Term

In [2]:
classifier_dir = os.path.join(data_path, 'date_term_classifier')
nlp = spacy.load(classifier_dir)

error: bad escape \p at position 275

In [16]:
classifier_dir

'../../data/plans/date_term_classifier'