# Apply document classifier

This script applies the trained classifier to scraped documents.

In [17]:
import pandas as pd
import os
import spacy
from spacy.util import minibatch, compounding

## Import texts

In [18]:
texts_scraped = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_scraped.csv'))
texts = texts_scraped['text']
texts.head()

0    Sands CISD District of Innovation Plan 2018-20...
1    Ricardo Independent School District Final Dist...
2                                              No text
3    Gold-Burg Independent School District District...
4    Joaquin Independent School District District o...
Name: text, dtype: object

## Import and apply model

In [19]:
# Load model
model_dir = os.path.join(os.getcwd(),  '..', '..', 'data', 'document classifier')
# apply the saved model
print("Loading from", model_dir)
nlp = spacy.load(model_dir)
categories = []
for text in texts:
    doc = nlp(text)
    categories.append(doc.cats)

Loading from /Users/kylieleblancKylie/dofis/code/exemptions/../../data/document classifier
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797

{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOV

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INN

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATI

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 

{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}


{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATI

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{

{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOV

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATIO

{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.879

{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INN

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INN

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOV

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.8797447085380554}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION':

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 4.539787187241018e-05}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.4975038766860962}
{

{'INNOVATION': 0.4975038766860962}
{'INNOVATION': 0.11815857142210007}
{'INNOVATION': 0.9999545812606812}
{'INNOVATION': 0.4975038766860962}


In [30]:
p_innovation = [x['INNOVATION'] for x in categories]

In [31]:
texts_scraped_p = texts_scraped
texts_scraped_p['p_innovation'] = p_innovation
texts_scraped_p.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,level,type,link,text,p_innovation
3731,3731,390,Kerrville ISD,html,html,https://www.kerrvilleisd.net/domain/41,District of Innovation / District of Innovatio...,0.879745
3732,3732,391,Mineola ISD,html,html,http://www.mineolaisd.net/page/District-of-Inn...,Mineola Independent School District Skip Navig...,0.497504
3733,3733,392,Leary ISD,html,html,http://www.learyisd.net/6253_1,Leary ISD - Public Info Skip to main content W...,0.118159
3734,3734,393,Goliad ISD,html,html,http://www.goliadisd.org/apps/pages/index.jsp?...,District of Innovation – Plans – Goliad Indepe...,0.999955
3735,3735,394,Anahuac ISD,html,html,https://sites.google.com/aisdpanthers.com/anah...,Anahuac ISD - District of Innovation Search th...,0.497504


In [38]:
texts_scraped_p.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,level,type,link,text,p_innovation
0,0,0,Sands CISD,First,pdf,http://sands.esc17.net/upload/page/0019/docs/S...,Sands CISD District of Innovation Plan 2018-20...,0.497504
1,1,1,Ricardo ISD,First,pdf,http://www.ricardoisd.us/UserFiles/Servers/Ser...,Ricardo Independent School District Final Dist...,4.5e-05
2,2,2,Stanton ISD,First,pdf,http://www.stanton.esc18.net/site/handlers/fil...,No text,0.118159
3,3,3,Gold-Burg ISD,First,pdf,http://images.pcmac.org/SiSFiles/Schools/TX/Go...,Gold-Burg Independent School District District...,0.497504
4,4,4,Joaquin ISD,First,pdf,http://www.joaquinisd.net/upload/page/0025/Joa...,Joaquin Independent School District District o...,0.497504


In [39]:
nlp(texts_scraped_p.head()['text'][1]).cats

{'INNOVATION': 4.539787187241018e-05}

In [22]:
texts_scraped_p.to_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_scraped_p.csv'))

## Descriptive statistics

In [23]:
print("Predicted probability of DOI plan for first level documents:")
texts_scraped_p[['p_innovation']][texts_scraped_p.level == 'First'].mean()

Predicted probability of DOI plan for first level documents:


p_innovation    0.465132
dtype: float64

In [24]:
print("Predicted probability of DOI plan for second level and HTML documents:")
texts_scraped_p[['p_innovation']][texts_scraped_p.level != 'First'].mean()

Predicted probability of DOI plan for second level and HTML documents:


p_innovation    0.481242
dtype: float64

# Narrow Documents

Narrow documents to most likely DOI plan candidates. 

In [25]:
text_narrowed = texts_scraped_p.sort_values(['title', 'p_innovation'], ascending=False).drop_duplicates(subset=["title"], keep="first")

In [26]:
text_narrowed.to_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_narrowed.csv'))

## Test performance

sample 30, label, what p are true doi plans?

In [27]:
retest_performance = False

In [28]:
if retest_performance:
    test = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_narrowed.csv'))
    test = test.sample(n=30)

In [29]:
test.head(30)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,level,type,link,text,p_innovation
117,213,213,213,Spring Hill ISD,First,pdf,http://shisd.net/documents/announcements/2017d...,Spring Hill ISD District of Innovation Plan 20...,0.880113
266,514,514,514,North Lamar ISD,Second,pdf,/upload/page/0017/docs/district/HS%20Sum.pdf,UNAVAILABLE,0.498376
349,781,781,781,Lovelady ISD,Second,pdf,http://www.loveladyisd.net/files/user/3/file/L...,LISD_Child_Nutrition_Discrimination_Complaint_...,0.999955
43,206,206,206,Wellington ISD,First,pdf,https://s3.amazonaws.com/scschoolfiles/1189/di...,WELLINGTON INDEPENDENT SCHOOL DISTRICT Distric...,0.880113
235,5,5,5,Pecos-Barstow-Toyah ISD,First,pdf,http://www.pbtisd.esc18.net/upload/page/0013/d...,PECOS­BARSTOW­TOYAH INDEPENDENT SCHOOL DISTRIC...,0.498376
455,1080,1080,1080,Holliday ISD,Second,pdf,https://s3.amazonaws.com/scschoolfiles/1648/ho...,Holliday ISD District of Innovation Plan I. Te...,0.999955
290,1113,1113,1113,Mt. Vernon ISD,Second,pdf,http://www.mtvernonisd.net/UserFiles/Servers/S...,Microsoft Word - 2016-2017 TAPR - Public Notic...,0.498376
80,338,338,338,Tioga ISD,First,pdf,http://www.tiogaisd.net/vimages/shared/vnews/s...,Tioga ISD DISTRICT OF INNOVATION Current Schoo...,0.498376
164,91,91,91,Sabinal ISD,First,pdf,http://www.sabinalstorage.tech/requiredposting...,Microsoft Word - LIP 2016-2017 (2) (1).docx SA...,0.999955
770,3406,3406,56,Alief ISD,html,html,http://www.aliefisd.net/Page/8915,Alief District of Innovation / District of Inn...,0.999955


In [8]:
# Make first 30 documents easy to read.
for i, v in enumerate(test.loc[:,:]['text'][0:30]):
    print('\n')
    print('-'*30)
    print('DISTRICT {}'.format(i))
    print('-'*30)
    print('\n')
    print(v[0:4000])



------------------------------
DISTRICT 0
------------------------------


District of Innovation / District of Innovation Skip to Main Content District Home Select a School... Select a School Lake Travis High School Lake Travis Middle School Hudson Bend Middle School Bee Cave Elementary School Lake Travis Elementary School Lake Pointe Elementary School Lakeway Elementary School Serene Hills Elementary School West Cypress Hills Elementary School Sign In Search Our Site Search About " Accountability Bell Schedules Calendars Connect With Us District Improvement Plan District of Innovation District Profile Legislative Priorities Mission & Goals Organizational Chart Staff Directory Leadership " Board of Trustees Superintendent Departments " Athletics Business & Financial Services Communications, Media & Community Relations Learning & Teaching Services Development & Corporate Relations Facilities & Construction Fine Arts & Academic Enrichment Food and Nutrition Services Health & Social Em

In [None]:
ground_truth = [
    False, False, True, False, True,
    True, False, True
    True, True, False, False,
    True, False, True, False, True,
    True, True, False, False, True,
    True, False, False, False, False,
    False, True, True, True, True,
    True
    
]