# Extract dates from text

In [1]:
import os
import pandas as pd
import extract_dates
import spacy
from start import data_path

In [2]:
# Read text_narrowed.csv from data_path. The `text` column of this frame is what
# the date-extraction cells below iterate over.
narrowed_text_path = os.path.join(data_path, 'text_narrowed.csv')
docs = pd.read_csv(narrowed_text_path)
docs

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,level,type,link,text,p_innovation
0,1323,1323,1451,Lake Travis ISD,Second,pdf,https://www.ltisdschools.org//cms/lib/Tx018000...,LTISD Plan of Innovation LAKE TRAVIS INDEPENDE...,0.999890
1,1913,1913,2108,Zephyr ISD,Second,docx,http://zephyrisd.net/wp-content/uploads/2014/0...,Zephyr ISD District of Innovation Plan Introdu...,0.999922
2,1895,1895,2089,Zavalla ISD,Second,pdf,https://s3.amazonaws.com/scschoolfiles/1772/za...,Zavalla ISD District of Innovation Plan (HB 18...,0.999619
3,577,577,596,Zapata County ISD,Second,pdf,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...,Microsoft Word - Document1 ZAPATA COUNTY INDEP...,0.452118
4,1784,1784,1967,Yorktown ISD,Second,pdf,http://www.yisd.org/userfiles/57/my%20files/fi...,Yorktown Independent School District Final Dis...,0.999877
5,163,163,163,Yoakum ISD,First,pdf,http://www.yoakumisd.net/cms/lib3/TX01001553/C...,UNAVAILABLE,0.005025
6,2173,2173,2410,Yantis ISD,Second,pdf,http://www.yantisisd.net/users/2017-2018/Distr...,Yantis ISD District of Innovation Plan (HB1842...,0.998937
7,3713,3713,109,Wylie ISD (221912),html,html,http://www.wyliebulldogs.org/cms/One.aspx?port...,District of Innovation - Wylie Independent Sch...,0.001331
8,32,32,32,Wylie ISD (043914),First,pdf,http://www.wylieisd.net/cms/lib09/TX01918453/C...,Wylie ISD INNOVATION PLAN INTRODUCTION House B...,0.999891
9,1659,1659,1828,Wortham ISD,Second,docx,https://s3.amazonaws.com/scschoolfiles/888/wis...,W Wortham ISD District of Innovation Plan 2017...,0.997949


## Term Dates
Identify the most likely term-date phrase in each text using the classifier, then extract the earliest year (and month, if available) from that phrase.

In [3]:
# Location (under data_path) of the term-date classifier model directory.
# `output_dir` is read by the extraction cell that follows, so the name must stay bound.
output_dir = os.path.join(data_path, 'date_term_classifier')

# NOTE(review): `nlp` is not referenced by any cell visible in this notebook; the
# extraction helper is given the directory path instead. Confirm whether this load
# is still needed.
nlp = spacy.load(name=output_dir)

In [4]:
# Resolve the term-date classifier directory inside this cell instead of reading the
# shared `output_dir` global. That name is rebound to the finalize classifier further
# down, so re-running this cell after that point would silently use the wrong model.
term_model_dir = os.path.join(data_path, 'date_term_classifier')

# One (year, month, phrase, p) result per document, in the row order of `docs`.
term_results = [
    extract_dates.get_term_date_and_phrase(text, term_model_dir)
    for text in docs.text
]

# Split the per-document results into the parallel lists that the next cell
# attaches to `docs` as term_year / term_month / term_phrase / term_p.
start_dates = [year for year, month, phrase, p in term_results]
months = [month for year, month, phrase, p in term_results]
date_phrases = [phrase for year, month, phrase, p in term_results]
p_terms = [p for year, month, phrase, p in term_results]

In [5]:
# Attach the term-date extraction results to `docs`, one new column per result list.
# Dict order is the column order: term_year, term_month, term_phrase, term_p.
term_columns = {
    'term_year': start_dates,
    'term_month': months,
    'term_phrase': date_phrases,
    'term_p': p_terms,
}
for column_name, column_values in term_columns.items():
    docs[column_name] = column_values

## Finalize plan date
Identify the most likely finalize-date phrase in each text using the classifier, and use the latest date in that phrase (in terms of both year and month) as the date of finalization.

In [6]:
# Location (under data_path) of the finalize-date classifier model directory.
# This rebinds `output_dir`, which previously pointed at the term-date classifier.
output_dir = os.path.join(data_path, 'date_finalize_classifier')

# NOTE(review): `nlp` is not referenced by any cell visible in this notebook; the
# extraction helper is given the directory path instead. Confirm whether this load
# is still needed.
nlp = spacy.load(name=output_dir)

In [7]:
# Resolve the finalize-date classifier directory inside this cell instead of reading
# the shared `output_dir` global. That name is also bound to the term classifier by an
# earlier cell, so an out-of-order re-run would silently use the wrong model.
finalize_model_dir = os.path.join(data_path, 'date_finalize_classifier')

# One (year, month, phrase, p) result per document, in the row order of `docs`.
finalize_results = [
    extract_dates.get_finalize_month_year_phrase(text, finalize_model_dir)
    for text in docs.text
]

# Split the per-document results into the parallel lists that the next cell
# attaches to `docs` as finalize_year / finalize_month / finalize_phrase / finalize_p.
finalize_years = [year for year, month, phrase, p in finalize_results]
finalize_months = [month for year, month, phrase, p in finalize_results]
finalize_phrases = [phrase for year, month, phrase, p in finalize_results]
finalize_p = [p for year, month, phrase, p in finalize_results]

In [8]:
# Attach the finalize-date extraction results to `docs`, one new column per result list.
# Dict order is the column order: finalize_year, finalize_month, finalize_phrase, finalize_p.
finalize_columns = {
    'finalize_year': finalize_years,
    'finalize_month': finalize_months,
    'finalize_phrase': finalize_phrases,
    'finalize_p': finalize_p,
}
for column_name, column_values in finalize_columns.items():
    docs[column_name] = column_values

# Create year and month column
For now, the term date is prioritized. If there is a likely term phrase (probability of at least .9), use its year as the date year. If the phrase includes a month, use that as well; if it does not, we assume the term refers to an academic year.

If no term phrase reaches that probability, use the finalize phrase instead.

In [9]:
# Minimum classifier probability at which the term phrase is trusted over the
# finalize phrase.
TERM_P_THRESHOLD = .9

# Coerce to numeric so a missing or non-numeric probability becomes NaN rather than
# raising in the comparison below.
term_p_numeric = pd.to_numeric(docs.term_p, errors='coerce')
use_term = term_p_numeric >= TERM_P_THRESHOLD
# Take the complement instead of testing `< threshold`: a NaN probability fails both
# comparisons, which previously left such rows with no date at all instead of falling
# back to the finalize phrase.
use_finalize = ~use_term

# Build date_year, date_month and date_phrase (in that order) from whichever source
# was selected for each row. The right-hand side is the full column; .loc aligns it
# on the index and only writes the masked rows.
for field in ('year', 'month', 'phrase'):
    docs.loc[use_term, 'date_' + field] = docs['term_' + field]
    docs.loc[use_finalize, 'date_' + field] = docs['finalize_' + field]

docs.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,level,type,link,text,p_innovation,term_year,term_month,term_phrase,term_p,finalize_year,finalize_month,finalize_phrase,finalize_p,date_year,date_month,date_phrase
0,1323,1323,1451,Lake Travis ISD,Second,pdf,https://www.ltisdschools.org//cms/lib/Tx018000...,LTISD Plan of Innovation LAKE TRAVIS INDEPENDE...,0.99989,2017,"(999, )",: 1. Beginning with the 2017-2018 academic yea...,0.999955,2016,December,"hold a public meeting on December 13, 2016 to ...",0.996723,2017.0,"(999, )",: 1. Beginning with the 2017-2018 academic yea...
1,1913,1913,2108,Zephyr ISD,Second,docx,http://zephyrisd.net/wp-content/uploads/2014/0...,Zephyr ISD District of Innovation Plan Introdu...,0.999922,2018,"(2018, September)",Term The District of Innovation Plan will beco...,0.999955,2018,April,"go to the Board on April 16th, 2018. Term The ...",0.889274,2018.0,"(2018, September)",Term The District of Innovation Plan will beco...
2,1895,1895,2089,Zavalla ISD,Second,pdf,https://s3.amazonaws.com/scschoolfiles/1772/za...,Zavalla ISD District of Innovation Plan (HB 18...,0.999619,2017,"(999, )",This plan will be in effect for the 2017-2018 ...,0.999955,2017,February,"27,2017 Final version plan posted January 27, ...",0.999955,2017.0,"(999, )",This plan will be in effect for the 2017-2018 ...
3,577,577,596,Zapata County ISD,Second,pdf,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...,Microsoft Word - Document1 ZAPATA COUNTY INDEP...,0.452118,2017,"(2017, December)","a District of Innovation on Tuesday, December ...",0.000187,2017,December,"a District of Innovation on Tuesday, December ...",0.999884,2017.0,December,"a District of Innovation on Tuesday, December ..."
4,1784,1784,1967,Yorktown ISD,Second,pdf,http://www.yisd.org/userfiles/57/my%20files/fi...,Yorktown Independent School District Final Dis...,0.999877,2017,"(999, )",Yorktown Independent School District Final Dis...,0.999955,2017,May,"of the letter. On May 15, 2017 the District Co...",0.972258,2017.0,"(999, )",Yorktown Independent School District Final Dis...


## Create academic year column
2017 refers to the 2017-18 academic year (ay), 2016 to the 2016-17 academic year, etc. Standardized testing generally occurs in April, so a district is considered 'treated' for the current academic year if its implementation occurs before April.

In [10]:
# Dates in these months are counted toward the academic year that began the previous
# calendar year, so their academic year is date_year - 1.
# NOTE(review): the recorded output above shows term-derived date_month values such as
# `(2018, September)` / `(999, )` rather than plain month names. Those would never
# match this list. Confirm the format returned by the term extractor.
months_before_testing = ['January', 'February', 'March']
shift_back = docs.date_month.isin(months_before_testing)

# Keep date_year where no shift applies, otherwise use the previous year.
docs['date_ay'] = docs.date_year.where(~shift_back, docs.date_year - 1)

# Save

In [11]:
# Write the full frame, including all extracted date columns, to data_path.
# NOTE(review): the index is written as an unnamed column. The input file already
# carries several `Unnamed: 0*` columns, which looks like the result of repeated
# save/reload cycles. Consider index=False once downstream readers are checked.
scraped_dates_path = os.path.join(data_path, 'doi_dates_scraped.csv')
docs.to_csv(scraped_dates_path)

# Descriptives

### Export to csv to manually note whether year and month are correct.

In [12]:
# Restrict to rows with p_innovation above .5, then draw a fixed-seed sample of 30
# so the same rows are selected on every run.
likely_plans = docs[docs.p_innovation > .5]

# Empty columns to be filled in by hand after export.
test_docs = likely_plans.sample(n=30, random_state=5).assign(
    correct_month='',
    correct_year='',
    correct_ay='',
)

In [13]:
# Export the sample for manual review. The index is written too, which keeps each
# row's original position in `docs` alongside it.
sample_path = os.path.join(data_path, 'sample_dates.csv')
test_docs.to_csv(sample_path)

### Fill in TRUE or FALSE in correct_month, correct_year, and correct_ay to record whether the correct month, year, and academic year were extracted. If the text contains no year or month and none was extracted, consider the month and/or year correct.

In [14]:
# TODO: change the test to check academic year instead of month and year separately

In [15]:
# Load the manually annotated copy of the sample (correct_* columns filled in by hand).
filled_in_path = os.path.join(data_path, 'sample_dates_filledin.csv')
test_docs_correct = pd.read_csv(filled_in_path)

# Map any 'TRUE'/'FALSE' text that read_csv left as strings to real booleans.
# The previous mapping produced the strings 'True'/'False', which are still strings
# and cannot be averaged by the accuracy summary that follows.
test_docs_correct = (
    test_docs_correct
    .replace({'FALSE': False, 'TRUE': True})
    .infer_objects()  # let all-boolean object columns become bool dtype
)

In [16]:
# Report the share of TRUE annotations per checked column. The mean of a boolean
# column is the fraction of rows marked correct.
accuracy_reports = (
    ('Approximately {0:.0%} percent of extracted years are correct.', 'correct_year'),
    ('Approximately {0:.0%} percent of extracted months are correct.', 'correct_month'),
    ('Approximately {0:.0%} percent of extracted academic years are correct.', 'correct_ay'),
)
for message, column in accuracy_reports:
    print(message.format(test_docs_correct[column].mean()))

Approximately 93% percent of extracted years are correct.
Approximately 81% percent of extracted months are correct.
Approximately 93% percent of extracted academic years are correct.
