# Train relevance classifier

This notebook trains the relevance classifier that identifies DOI plans.

In [17]:
import pandas as pd
import os
import classify

## Label documents

Do not overwrite the labelled document set unless you intend to relabel it: `relabel_docs = False` protects the existing labelled dataset. The sample size is currently 140. To add additional DOI plans to the training data, set `add_extra_trues = True`; `num_new_true` controls how many are added.

In [2]:
# Labelling controls: while relabel_docs is False the sampling cell is skipped,
# so the saved sample of sample_size documents is left untouched.
sample_size = 140
relabel_docs = False

# Augmentation controls: whether to add extra positive examples, and how many.
num_new_true = 120
add_extra_trues = True

In [3]:
if relabel_docs:
    # Draw a fresh sample of scraped documents to label by hand and save it,
    # overwriting any previously saved sample.
    # NOTE: the labels assigned in the next cell are a hard-coded, positional
    # list; after drawing a new sample they must be re-entered to match it.
    data_dir = os.path.join(os.getcwd(), '..', '..', 'data')
    text_scraped = pd.read_csv(os.path.join(data_dir, 'text_scraped.csv'))
    # Fixed seed: re-running this cell on the same text_scraped.csv reproduces
    # the same sample in the same order, keeping positional labels aligned.
    docs_to_label = text_scraped.sample(n=sample_size, random_state=0)
    # Name the index column so the source row number is kept under a real
    # header instead of becoming another 'Unnamed: 0' column on read-back.
    docs_to_label.to_csv(os.path.join(data_dir, 'docs_to_label.csv'),
                         index_label='text_scraped_row')

In [4]:
# Load the saved sample of documents and attach the hand-assigned labels.
labeled_docs = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'docs_to_label.csv'))
# One label per row of docs_to_label.csv, in file order (True = innovation plan).
# The list is positional: it holds 7 groups of 20 values (140 in total) and is
# only valid for the sample currently saved in docs_to_label.csv.
labeled_docs['innovation_plan'] = [
         False, False, False, True, False,
         False, True, False, False, False,
         False, False, False, False, False,
         False, False, False, False, False,
    
         True, True, False, True, False,
         False, True, True, True, False,
         False, False, False, True, False,
         False, False, False, False, False,
    
         False, False, False, False, False,
         False, False, False, False, False,
         False, False, False, False, False,
         False, False, False, True, False,
    
         False, True, False, False, True, 
         False, False, False, False, True,
         False, False, False, False, False,
         False, False, False, False, False, 
        
         True, True, False, False, False, 
         True, False, False, True, False, 
         False, False, False, False, False, 
         False, True, True, False, True,

         False, True, False, False, False, 
         False, False, True, True, False, 
         False, True, True, False, False, 
         False, False, False, False, False, 

         False, False, True, False, False, 
         False, True, True, False, False, 
         False, False, False, True, False, 
         False, False, True, True, False]
# Show the class balance of the hand-labelled sample, then preview the frame.
print(labeled_docs['innovation_plan'].value_counts())
labeled_docs.head(5)

False    109
True      31
Name: innovation_plan, dtype: int64


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,level,link,text,title,type,innovation_plan
0,0,2683,2683,2683.0,Second,https://www.celinaisd.com/wp-documents/Require...,Date Run: Program: FIN1750 Cnty Dist: Page: 1 ...,Celina ISD,pdf,False
1,1,1910,1910,1910.0,Second,https://www.celinaisd.com/wp-documents/Require...,BUDGET RECEIVED TO DATE REMAINING PERCENT REMA...,Celina ISD,pdf,False
2,2,455,455,455.0,Second,https://1.cdn.edl.io/BoIQrMqDBRiGKjWEV3Sjf2tgY...,No text,West ISD,pdf,False
3,3,271,271,271.0,First,http://images.pcmac.org/Uploads/CalallenISD/Ca...,Calallen ISD: Put Us to the Test; We Are the B...,Calallen ISD,pdf,True
4,4,1020,1020,1020.0,Second,https://4.files.edl.io/436e/05/11/18/191230-4e...,Microsoft Word - Alto ISD Calendar 2018-2019 J...,Alto ISD,pdf,False


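This adds extra positive examples (documents labelled `True`) to the data used for training, but not to the data used for testing: the extra rows are placed at the front of the dataset, and the test set is taken from the last 30 rows.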

In [5]:
if add_extra_trues:
    # Treat first-level documents with usable text as additional positive
    # examples.
    # NOTE(review): these rows come from the same text_scraped.csv that the
    # hand-labelled sample was drawn from, so a document may appear in both
    # sets (possibly with a different label) -- confirm whether that matters.
    scraped = pd.read_csv(os.path.join(os.getcwd(), '..', '..', 'data', 'text_scraped.csv'))
    usable = (
        (scraped.level == 'First')
        # Rows without text cannot serve as training examples; dropping them
        # here also keeps the substring test below free of missing values.
        & scraped.text.notna()
        & ~scraped.text.isin(['UNAVAILABLE', 'No text'])
        # Literal substring match; na=False keeps the mask strictly boolean.
        & ~scraped.text.str.contains('Page Not Found', regex=False, na=False)
    )
    # Fixed seed so the same extra documents are chosen on every run.
    docs_true = scraped[usable].sample(n=num_new_true, random_state=0)
    docs_true = docs_true.assign(innovation_plan=True)

    # Extra positives go first: the test rows are taken from the end of the
    # frame, so every added row lands in the training portion.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # NOTE: this rebinds labeled_docs, so re-running this cell without first
    # re-running the labelling cell prepends another batch of rows.
    labeled_docs = pd.concat([docs_true, labeled_docs], ignore_index=True, sort=False)
    print('New number of labelled documents = ', len(labeled_docs))

New number of labelled documents =  260


In [6]:
# Report the class balance on each side of the positional train/test split:
# everything except the last 30 rows versus the last 30 rows.
split = len(labeled_docs) - 30
train_counts = labeled_docs.iloc[:split]['innovation_plan'].value_counts()
test_counts = labeled_docs.iloc[-30:]['innovation_plan'].value_counts()

print('The first n-30 of documents are used for training. Training dataset statistics:')
print(train_counts)

print('The last 30 of documents are used for testing. Testing dataset statistics:')
print(test_counts)

The first n-30 of documents are used for training. Training dataset statistics:
True     143
False     87
Name: innovation_plan, dtype: int64
The last 30 of documents are used for testing. Testing dataset statistics:
False    22
True      8
Name: innovation_plan, dtype: int64


## Train Classifier and Save

In [7]:
# Hand the labelled frame to the project helper, naming the text column, the
# label column and the category; it yields the texts/cats pair passed to the
# trainer below. (Exact output format is defined in classify -- not shown here.)
texts, cats = classify.reformat_cat_from_df(
    df=labeled_docs,
    text_col='text',
    label_col='innovation_plan',
    category='INNOVATION',
)

In [8]:
# Location handed to the trainer as its output directory, built relative to
# the current working directory.
output_dir = os.path.join(
    os.getcwd(), '..', '..', 'data', 'document_classifier'
)

In [9]:
# Train and evaluate a classifier for the single 'INNOVATION' category over
# 10 iterations, passing output_dir as the place to store the result.
# model=None: the recorded run output reports "Created blank 'en' model".
classify.train_classifier_and_evaluate(
    texts=texts,
    cats=cats,
    model=None,
    output_dir=output_dir,
    n_iter=10,
    categories=['INNOVATION'],
)

  return f(*args, **kwds)
  return f(*args, **kwds)


Created blank 'en' model
Using 260 examples (230 training, 30 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
15.060	0.500	1.000	0.667
7.529	0.583	0.875	0.700
4.777	0.615	1.000	0.762
4.970	0.571	1.000	0.727
1.724	0.727	1.000	0.842
1.070	0.667	1.000	0.800
1.055	0.667	1.000	0.800
0.871	0.667	1.000	0.800
0.795	0.667	1.000	0.800
0.814	0.727	1.000	0.842
No text {'INNOVATION': 0.001259499229490757}
Saved model to /Users/kylieleblancKylie/dofis/code/exemptions/../../data/document_classifier
Loading from /Users/kylieleblancKylie/dofis/code/exemptions/../../data/document_classifier
No text {'INNOVATION': 0.001259499229490757}


<spacy.pipeline.TextCategorizer at 0x11f332080>