# Extract dates from text

In [1]:
import os
import pandas as pd
import extract_dates
import spacy
from start import data_path

In [7]:
# Load the narrowed document table; later cells read its `text` column.
narrowed_csv_path = os.path.join(data_path, 'text_narrowed.csv')
docs = pd.read_csv(narrowed_csv_path)

## Term Dates
Identify the most likely term-date phrase in each document's text using the classifier, then extract the earliest year (and month, if available) from that phrase.

In [9]:
# Directory of the term-date classifier model, located under data_path.
# NOTE(review): `output_dir` is rebound to the finalize classifier further
# down, so re-running the term cells out of order would pick up the wrong model.
term_model_name = 'date_term_classifier'
output_dir = os.path.join(data_path, term_model_name)

# Load the spaCy pipeline from that directory. `nlp` is not referenced again in
# the visible cells -- the extractor below is handed `output_dir` instead.
nlp = spacy.load(output_dir)

In [10]:
# Run the term-date extractor on every document and collect its four return
# values (year, month, phrase, probability) in parallel lists, one entry per row.
# NOTE(review): the extractor is given the model directory, not the loaded
# `nlp` object -- confirm it does not reload the model on every call.
start_dates, date_phrases, p_terms, months = [], [], [], []
for doc_text in docs.text:
    extracted = extract_dates.get_term_date_and_phrase(doc_text, output_dir)
    year, month, phrase, p = extracted
    p_terms.append(p)
    date_phrases.append(phrase)
    months.append(month)
    start_dates.append(year)

In [11]:
# Attach the term-date extraction results to the frame, one column per list.
term_columns = {
    'term_year': start_dates,
    'term_month': months,
    'term_phrase': date_phrases,
    'term_p': p_terms,
}
for column_name, column_values in term_columns.items():
    docs[column_name] = column_values

## Finalize plan date
Identify the most likely finalize-date phrase in each document's text using the classifier, and use the latest date in that phrase (in terms of both year and month) as the date of finalization.

In [12]:
# Directory of the finalize-date classifier model, located under data_path.
# NOTE(review): this rebinds `output_dir`, which earlier pointed at the term
# classifier; cells that use it depend on being run in notebook order.
finalize_model_name = 'date_finalize_classifier'
output_dir = os.path.join(data_path, finalize_model_name)

# Load the spaCy pipeline from that directory. `nlp` is not referenced again in
# the visible cells -- the extractor below is handed `output_dir` instead.
nlp = spacy.load(output_dir)

In [13]:
# Run the finalize-date extractor on every document and collect its four return
# values (year, month, phrase, probability) in parallel lists, one entry per row.
# NOTE(review): the extractor is given the model directory, not the loaded
# `nlp` object -- confirm it does not reload the model on every call.
finalize_years, finalize_months, finalize_phrases, finalize_p = [], [], [], []
for doc_text in docs.text:
    extracted = extract_dates.get_finalize_month_year_phrase(doc_text, output_dir)
    year, month, phrase, p = extracted
    finalize_p.append(p)
    finalize_phrases.append(phrase)
    finalize_months.append(month)
    finalize_years.append(year)

In [14]:
# Attach the finalize-date extraction results to the frame, one column per list.
finalize_columns = {
    'finalize_year': finalize_years,
    'finalize_month': finalize_months,
    'finalize_phrase': finalize_phrases,
    'finalize_p': finalize_p,
}
for column_name, column_values in finalize_columns.items():
    docs[column_name] = column_values

# Create year and month columns
For now, the term date is prioritized. If there is a likely term phrase (probability of at least 0.9), use its year as the document's date year (`date_year`). If the phrase also has a month, use that as well (`date_month`). If there is no month, we assume the term refers to an academic year.

If there is no term phrase that reaches that probability, use the finalize year, month, and phrase instead.

In [31]:
# Minimum classifier probability for a term phrase to be used as the date.
TERM_P_THRESHOLD = .9

# Evaluate the test once. Rows that fail it fall back to the finalize values;
# using the complement (~use_term) rather than a separate `< threshold` test
# also covers rows whose term_p is missing, because NaN compares False both
# ways and such rows would otherwise be left without any date.
use_term = docs.term_p >= TERM_P_THRESHOLD

# Fill date_year, date_month and date_phrase from the matching term_* column
# where the term phrase is trusted, and from finalize_* everywhere else.
for field in ('year', 'month', 'phrase'):
    docs.loc[use_term, 'date_' + field] = docs['term_' + field]
    docs.loc[~use_term, 'date_' + field] = docs['finalize_' + field]

docs.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,title,level,type,link,text,p_innovation,...,finalize_month,finalize_phrase,finalize_p,date_year,date_month,date_phrase,date_ay,test,test.x,test.y
0,1323,1323,1323,1451,Lake Travis ISD,Second,pdf,https://www.ltisdschools.org//cms/lib/Tx018000...,LTISD Plan of Innovation LAKE TRAVIS INDEPENDE...,0.999945,...,December,"hold a public meeting on December 13, 2016 to ...",0.996723,2017.0,,: 1. Beginning with the 2017-2018 academic yea...,2017.0,<zip object at 0x11f833048>,999,
1,1913,1913,1913,2108,Zephyr ISD,Second,docx,http://zephyrisd.net/wp-content/uploads/2014/0...,Zephyr ISD District of Innovation Plan Introdu...,0.999955,...,April,"go to the Board on April 16th, 2018. Term The ...",0.889274,2018.0,September,Term The District of Innovation Plan will beco...,2018.0,<zip object at 0x11f833048>,2018,September
2,1895,1895,1895,2089,Zavalla ISD,Second,pdf,https://s3.amazonaws.com/scschoolfiles/1772/za...,Zavalla ISD District of Innovation Plan (HB 18...,0.999955,...,February,"27,2017 Final version plan posted January 27, ...",0.999955,2017.0,,This plan will be in effect for the 2017-2018 ...,2017.0,<zip object at 0x11f833048>,999,
3,577,577,577,596,Zapata County ISD,Second,pdf,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...,Not a plan,0.249567,...,,,0.0,-999.0,,,-999.0,<zip object at 0x11f833048>,999,
4,1784,1784,1784,1967,Yorktown ISD,Second,pdf,http://www.yisd.org/userfiles/57/my%20files/fi...,Yorktown Independent School District Final Dis...,0.999955,...,May,"of the letter. On May 15, 2017 the District Co...",0.972258,2017.0,,Yorktown Independent School District Final Dis...,2017.0,<zip object at 0x11f833048>,999,


# Save

In [19]:
# Write the result without the row index. By default to_csv emits the index as
# an unnamed first column, which read_csv then loads as another 'Unnamed: 0*'
# column; the head() output above already shows several of these, presumably
# left over from earlier save/load round trips.
docs.to_csv(os.path.join(data_path, 'doi_dates_scraped.csv'), index=False)

In [20]:
# Number of documents (rows) in the saved frame.
docs.shape[0]

824