# Train classifer on date phrases to find plan start date

In [64]:
import csv
import pandas as pd
import os
import classify
import extract_dates
import random
from start import data_path

## Create training data

In [65]:
docs = pd.read_csv(os.path.join(data_path, 'text_narrowed.csv'))

### Randomly sample ten plans

Do not overwrite labelled document set unless you want to relabel. relabel_docs = False uses previously randomly sample plans.

In [66]:
relabel = False
if relabel:
    docs_to_label = docs[docs.p_innovation >.5].sample(n = 10)
    docs_to_label.index

In [67]:
if not relabel:
    docs_to_label = docs[docs.title == "Ector County ISD"]
    docs_to_label = docs_to_label.append(docs[docs.title == "Dimmitt ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Sunnyvale ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Sunray ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Douglass ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Dodd City ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Snyder ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Mart ISD"])
    docs_to_label = docs_to_label.append(docs[docs.title == "Rivercrest ISD"])

In [68]:
docs_to_label

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,level,type,link,text,p_innovation
568,270,270,270,Ector County ISD,First,pdf,http://www.ectorcountyisd.org/cms/lib011/tx010...,ECISD District of Innovation Plan 1 I. Introdu...,0.999667
579,3631,3631,288,Dimmitt ISD,html,html,http://www.dimmittisd.net/vnews/display.v/ART/...,Dimmitt ISD - District of Innovation Quick Lin...,0.986358
101,3331,3331,3331,Sunnyvale ISD,Second,pdf,http://www.sunnyvaleisd.com/site/handlers/file...,Sunnyvale ISD District of Innovation Plan Spri...,0.999955
100,98,98,98,Sunray ISD,First,pdf,https://1.cdn.edl.io/Rb3eNoIDEma2NVfd5LdmKpqLB...,SUNRAY ISD District of Innovation Plan 2 Distr...,0.999829
577,291,291,291,Douglass ISD,First,google,https://drive.google.com/file/d/1_OzEr7LnngRcQ...,Douglass ISD District of Innovation Plan (HB 1...,0.999721
578,237,237,237,Dodd City ISD,First,pdf,http://toolbox1.s3-website-us-west-2.amazonaws...,Microsoft Word - DCInnovationPlan (2).docx Dod...,0.999735
126,56,56,56,Snyder ISD,First,pdf,https://1.cdn.edl.io/7b21ktMGjSuFIU9LdTLlk0PHz...,SISD Local Innovation Plan (1) Snyder ISD Dist...,0.999931
332,382,382,382,Mart ISD,First,pdf,http://www.martisd.org/upload/page/0001/docs/D...,Mart ISD District of Innovation Plan In Januar...,0.998699
186,320,320,320,Rivercrest ISD,First,pdf,http://s3.amazonaws.com/scschoolfiles/760/inno...,1 Rivercrest Independent School District Distr...,0.999955


## Collect phrases surrounding date match. Currently, function collects 8 words before and 6 words after but more if an entity is cut off.

In [69]:
date_phrases = []
for text in docs_to_label.text:
    phrases = extract_dates.get_phrase_list(text = text)
    for phrase in phrases:
        date_phrases.append(phrase)
date_phrases = [str(item) for item in date_phrases]

### Export and label. Rename as dates_to_label_labelled.csv

In [70]:
phrase_df = pd.DataFrame({'text':date_phrases, 'term' : '', 'finalize': ''})
phrase_df.to_csv(os.path.join(data_path, 'dates_to_label.csv'))

In [71]:
labelled = pd.read_csv(os.path.join(data_path, 'dates_to_label_labelled.csv'))
labelled.head()

Unnamed: 0.1,Unnamed: 0,text,term,finalize
0,0,of Innovation Plan 1 I. Introduction House Bil...,False,False
1,1,"the Texas Education Code. On February 21, 2017...",False,True
2,2,"and the community. On February 28, 2017, the B...",False,True
3,3,"March 2, 6, and 10, 2017, to discuss and draft",False,True
4,4,term of the Plan will begin with the 2017-2018...,True,False


In [72]:
labelled = labelled.replace({'FALSE': 'False', 'TRUE': 'True'})
phrase_df['finalize'] = list(labelled.finalize)
phrase_df['term'] = list(labelled.term)
phrase_df.head()

Unnamed: 0,text,term,finalize
0,of Innovation Plan 1 I. Introduction House Bil...,False,False
1,"the Texas Education Code. On February 21, 2017...",False,True
2,"and the community. On February 28, 2017, the B...",False,True
3,"March 2, 6, and 10, 2017, to discuss and draft",False,True
4,term of the Plan will begin with the 2017-2018...,True,False


In [73]:
phrase_df = phrase_df.sample(frac = 1)

In [74]:
print('The first n-30 of documents are used for training. Term date training dataset statistics:')
split = int(len(phrase_df) - 30)
print(phrase_df.head(split)['term'].value_counts())

print('The last 30 of documents are used for testing. Term date testing dataset statistics:')
print(phrase_df.tail(30)['term'].value_counts())

print('The first n-30 of documents are used for training. Finalize date training dataset statistics:')
split = int(len(phrase_df) - 30)
print(phrase_df.head(split)['finalize'].value_counts())

print('The last 30 of documents are used for testing. Finalize date testing dataset statistics:')
print(phrase_df.tail(30)['finalize'].value_counts())

The first n-30 of documents are used for training. Term date training dataset statistics:
False    101
True      26
Name: term, dtype: int64
The last 30 of documents are used for testing. Term date testing dataset statistics:
False    23
True      7
Name: term, dtype: int64
The first n-30 of documents are used for training. Finalize date training dataset statistics:
True     78
False    49
Name: finalize, dtype: int64
The last 30 of documents are used for testing. Finalize date testing dataset statistics:
True     19
False    11
Name: finalize, dtype: int64


# Train term classifier

In [75]:
texts, cats = classify.reformat_cat_from_df(df = phrase_df,
                                      text_col = 'text',
                                      label_col = 'term',
                                      category = 'TERM')
output_dir = os.path.join(data_path, 'date_finalize_classifier')

In [79]:
classify.train_classifier_and_evaluate(texts=texts, 
                                        cats=cats,
                                        n_test = 30,
                                        model=None, 
                                        output_dir=output_dir, 
                                        n_iter=10, 
                                        categories=['TERM'])

Created blank 'en' model
Using 157 examples (127 training, 30 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
7.101	0.500	0.000	0.000
4.617	0.857	0.857	0.857
3.408	0.857	0.857	0.857
2.898	0.700	1.000	0.824
2.558	0.778	1.000	0.875
1.515	0.778	1.000	0.875
1.948	0.875	1.000	0.933
0.943	0.778	1.000	0.875
1.539	0.778	1.000	0.875
2.309	0.778	1.000	0.875
No text {'TERM': 0.018151946365833282}
Saved model to /Users/kylieleblancKylie/dofis/code/exemptions/../../data/plans/date_finalize_classifier
Loading from /Users/kylieleblancKylie/dofis/code/exemptions/../../data/plans/date_finalize_classifier
No text {'TERM': 0.018151946365833282}


<spacy.pipeline.TextCategorizer at 0x11f6437b8>

# Train finalize classifier

In [80]:
texts, cats = classify.reformat_cat_from_df(df = phrase_df,
                                      text_col = 'text',
                                      label_col = 'finalize',
                                      category = 'FINALIZE')
output_dir = os.path.join(data_path, 'date_finalize_classifier')

In [81]:
classify.train_classifier_and_evaluate(texts=texts, 
                                        cats=cats,
                                        n_test = 30,
                                        model=None, 
                                        output_dir=output_dir, 
                                        n_iter=10, 
                                        categories=['FINALIZE'])

Created blank 'en' model
Using 157 examples (127 training, 30 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
5.526	0.864	1.000	0.927
3.900	1.000	1.000	1.000
2.099	1.000	1.000	1.000
1.455	1.000	0.947	0.973
0.836	1.000	0.947	0.973
0.731	1.000	1.000	1.000
0.661	1.000	0.947	0.973
0.552	1.000	0.895	0.944
0.367	1.000	0.947	0.973
0.299	1.000	0.947	0.973
No text {'FINALIZE': 0.6745890974998474}
Saved model to /Users/kylieleblancKylie/dofis/code/exemptions/../../data/plans/date_finalize_classifier
Loading from /Users/kylieleblancKylie/dofis/code/exemptions/../../data/plans/date_finalize_classifier
No text {'FINALIZE': 0.6745890974998474}


<spacy.pipeline.TextCategorizer at 0x11f642860>