# Train classifer on date phrases to find plan start date

In [5]:
import csv
import pandas as pd
import os
import classify
import extract_dates
import random
from start import data_path

## Create training data

In [6]:
docs = pd.read_csv(os.path.join(data_path, 'text_narrowed.csv'))

### Randomly sample ten plans

Do not overwrite labelled document set unless you want to relabel. relabel_docs = False uses previously randomly sample plans.

In [7]:
relabel = False
if relabel:
    docs_to_label = docs[docs.p_innovation >.5].sample(n = 10)
    docs_to_label.index

In [8]:
if not relabel:
    docs_to_label = docs[docs.title == "Ector County ISD"]
    docs_to_label = docs_to_label.append(docs[docs.title == "Dimmitt ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Sunnyvale ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Sunray ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Douglass ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Dodd City ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Snyder ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Mart ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Rivercrest ISD"])

In [9]:
docs_to_label

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,title,level,type,link,text,p_innovation
599,328,328,328,328,Ector County ISD,First,pdf,http://www.ectorcountyisd.org/cms/lib011/tx010...,ECISD District of Innovation Plan 1 I. Introdu...,0.999729
611,3819,3819,3819,218,Dimmitt ISD,html,html,http://www.dimmittisd.net/vnews/display.v/ART/...,Not a plan,0.000618
107,2067,2067,2067,2282,Sunnyvale ISD,Second,pdf,http://www.sunnyvaleisd.com/site/handlers/file...,Sunnyvale ISD District of Innovation Plan Spri...,0.999955
106,172,172,172,172,Sunray ISD,First,pdf,https://1.cdn.edl.io/Rb3eNoIDEma2NVfd5LdmKpqLB...,Not a plan,4.5e-05
609,15,15,15,15,Douglass ISD,First,google,https://drive.google.com/file/d/1_OzEr7LnngRcQ...,Douglass ISD District of Innovation Plan (HB 1...,0.999955
610,28,28,28,28,Dodd City ISD,First,pdf,https://s3.amazonaws.com/scschoolfiles/1212/dc...,Dodd City Independent School District District...,0.999955
132,356,356,356,356,Snyder ISD,First,pdf,https://1.cdn.edl.io/7b21ktMGjSuFIU9LdTLlk0PHz...,SISD Local Innovation Plan (1) Snyder ISD Dist...,0.999955
346,178,178,178,178,Mart ISD,First,pdf,http://www.martisd.org/upload/page/0001/docs/D...,Not a plan,4.5e-05
194,197,197,197,197,Rivercrest ISD,First,pdf,http://s3.amazonaws.com/scschoolfiles/760/inno...,Not a plan,4.5e-05


## Collect phrases surrounding date match. Currently, function collects 8 words before and 6 words after but more if an entity is cut off.

In [10]:
date_phrases = []
for text in docs_to_label.text:
    phrases = extract_dates.get_phrase_list(text = text)
    for phrase in phrases:
        date_phrases.append(phrase)
date_phrases = [str(item) for item in date_phrases]

## Export to manually label. 
#### Term date refers to a phrase in the plan which specifies when the DOI plan begins and ends. Here, we refer to finalize date as any step in the DOI process (board votes, plan drafted, commissioner notified). Later,  we will treat only the latest 'finalize date' as the date the plan was finalized. 

In [11]:
phrase_df = pd.DataFrame({'text':date_phrases, 'term' : '', 'finalize': ''})
phrase_df.to_csv(os.path.join(data_path, 'dates_to_label.csv'))

## Save relabelled as dates_to_label_labelled.csv. Upload.

In [12]:
labelled = pd.read_csv(os.path.join(data_path, 'dates_to_label_labelled.csv'))
labelled = labelled.replace({'FALSE': 'False', 'TRUE': 'True'})
labelled.head()

Unnamed: 0.1,Unnamed: 0,text,term,finalize
0,0,of Innovation Plan 1 I. Introduction House Bil...,False,False
1,1,"the Texas Education Code. On February 21, 2017...",False,True
2,2,"and the community. On February 28, 2017, the B...",False,True
3,3,"March 2, 6, and 10, 2017, to discuss and draft",False,True
4,4,term of the Plan will begin with the 2017-2018...,True,False


In [13]:
phrase_df['finalize'] = list(labelled.finalize)
phrase_df['term'] = list(labelled.term)
phrase_df.head()

ValueError: Length of values does not match length of index

In [10]:
phrase_df = phrase_df.sample(frac = 1)

In [11]:
print('The first n-30 of documents are used for training. Term date training dataset statistics:')
split = int(len(phrase_df) - 30)
print(phrase_df.head(split)['term'].value_counts())

print('The last 30 of documents are used for testing. Term date testing dataset statistics:')
print(phrase_df.tail(30)['term'].value_counts())

print('The first n-30 of documents are used for training. Finalize date training dataset statistics:')
split = int(len(phrase_df) - 30)
print(phrase_df.head(split)['finalize'].value_counts())

print('The last 30 of documents are used for testing. Finalize date testing dataset statistics:')
print(phrase_df.tail(30)['finalize'].value_counts())

The first n-30 of documents are used for training. Term date training dataset statistics:
False    101
True      26
Name: term, dtype: int64
The last 30 of documents are used for testing. Term date testing dataset statistics:
False    23
True      7
Name: term, dtype: int64
The first n-30 of documents are used for training. Finalize date training dataset statistics:
True     75
False    52
Name: finalize, dtype: int64
The last 30 of documents are used for testing. Finalize date testing dataset statistics:
True     19
False    11
Name: finalize, dtype: int64


# Train term classifier

In [12]:
texts, cats = classify.reformat_cat_from_df(df = phrase_df,
                                      text_col = 'text',
                                      label_col = 'term',
                                      category = 'TERM')
output_dir = os.path.join(data_path, 'date_finalize_classifier')

In [13]:
classify.train_classifier_and_evaluate(texts=texts, 
                                        cats=cats,
                                        n_test = 30,
                                        model=None, 
                                        output_dir=output_dir, 
                                        n_iter=10, 
                                        categories=['TERM'])

Created blank 'en' model
Using 157 examples (127 training, 30 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
6.285	0.500	0.143	0.222
4.461	0.667	0.571	0.615
3.254	0.857	0.857	0.857
1.734	0.857	0.857	0.857
1.148	0.833	0.714	0.769
1.469	1.000	0.714	0.833
1.426	1.000	0.714	0.833
0.416	1.000	0.714	0.833
0.241	0.833	0.714	0.769
0.270	0.833	0.714	0.769
No text {'TERM': 4.539787187241018e-05}
Saved model to /Users/kylieleblancKylie/dofis/data/plans/date_finalize_classifier
Loading from /Users/kylieleblancKylie/dofis/data/plans/date_finalize_classifier
No text {'TERM': 4.539787187241018e-05}


<spacy.pipeline.TextCategorizer at 0x10f798048>

# Train finalize classifier

In [14]:
texts, cats = classify.reformat_cat_from_df(df = phrase_df,
                                      text_col = 'text',
                                      label_col = 'finalize',
                                      category = 'FINALIZE')
output_dir = os.path.join(data_path, 'date_finalize_classifier')

In [15]:
classify.train_classifier_and_evaluate(texts=texts, 
                                        cats=cats,
                                        n_test = 30,
                                        model=None, 
                                        output_dir=output_dir, 
                                        n_iter=10, 
                                        categories=['FINALIZE'])

Created blank 'en' model
Using 157 examples (127 training, 30 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
10.032	0.760	1.000	0.864
5.598	0.895	0.895	0.895
3.119	0.905	1.000	0.950
3.151	1.000	1.000	1.000
3.314	0.905	1.000	0.950
1.359	1.000	0.947	0.973
0.793	0.900	0.947	0.923
0.804	0.950	1.000	0.974
0.753	0.950	1.000	0.974
0.966	0.950	1.000	0.974
No text {'FINALIZE': 0.8246667981147766}
Saved model to /Users/kylieleblancKylie/dofis/data/plans/date_finalize_classifier
Loading from /Users/kylieleblancKylie/dofis/data/plans/date_finalize_classifier
No text {'FINALIZE': 0.8246667981147766}


<spacy.pipeline.TextCategorizer at 0x12272e438>