In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from path import Path, getcwdu

import glob
import os
from pathlib import PurePath
import copy
from bs4 import BeautifulSoup as bs


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, recall_score

In [2]:
train_file_list = [PurePath(file).name for file in glob.iglob('data/nc_training_filings/*')]
train_path_list = [PurePath(os.getcwd()).joinpath(file).as_posix() for file in glob.iglob('data/nc_training_filings/*')]
train_accession_ids = [PurePath(file).stem for file in train_file_list]
train_cik_nbrs = [x.split(sep='-')[0] for x in train_accession_ids]

In [3]:
val_accession_ids = train_accession_ids[300:]

Helper function for viewing paragraph text in training dataframe

In [5]:
def print_row_detail(df, nrow=10, header_list = ['ticker', 'accession_number' ],
                    detail_list = ['data_key_friendly_name', 'text', 'paragraph_text'],
                    sortby=['accession_number', 'data_key_friendly_name'], ascending=True):
    df_sorted = df.sort_values(sortby, ascending=ascending).reset_index()
    nrow = min(len(df_sorted), nrow)
    for i in range(0, nrow):
        for h in header_list:
            print('-'*35  + ' ' +  str(df_sorted[h][i]) + ' ' + '-'*35)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d][i]))
            print('')

### Functions for extracting candidate documents from an html file

Function to create and index a dataframe from an html table. Empty cells are removed.

In [6]:
def df_from_html_tbl(table_tag):
    data_re = re.compile(r'[a-zA-Z0-9$().]+')
    repl = re.compile(r'[()$]')
    rows = []
    for row in table_tag.findChildren('tr'):
        row_list = []
        for s in row.strings:
            s = re.sub(repl, '', s.strip())
            if len(s) > 0:
                row_list.append(s)
        if len(row_list) > 1:
            rows.append(row_list)
    tbl_df = pd.DataFrame.from_records(rows)
    try:
        tbl_df = tbl_df.set_index( tbl_df.applymap(len).max().values.argmax()) 
    except: tbl_df = tbl_df.set_index(0)
    return tbl_df

### Test and refine regex patterns for flagging likely relevant documents

In [7]:
def check_regex_match(pattern, text_list):
    for idx, s in enumerate(text_list):
        mo = re.search(pattern, s)
        if mo:
            ms = mo.span()[1]
            print("------    " + str(idx) + "   Matched!    -----")
            print('str length  :' + str(len(s)) + '    match span  :' + str(ms))
            print(s[:ms])
            print('')
            print(s[ms:])
            print(re.search(pattern, s))
        else:
            print("------    " + str(idx) + "  NO MATCH    -----")
            print(s)

Patterns used to filter after initial regex

In [8]:
nonum_pats = [r"^[^\d]*$",
    r"^[^\d]*\d{1,2}[^\d]*$", 
    r"^([^\d]+(\d{1,2}[^\d]{2,}){1,5}\d{1,3}[^\d]*)$",
    r"^((\d{1,3}[^\d]{2,}){1,4}\d{1,2}[^\d]*)$"]
year_and_num = re.compile(r"20[0-2][0-9].*[0-9]{1,3}.*|[0-9]{1,3}.*20[0-2][0-9]", re.I)
nonum_regs = [re.compile(x) for x in nonum_pats]

In [15]:
paragraph_input_df = pd.read_csv('data/paragraph_input_df.csv', index_col=0)
paragraph_input_df['split'] = paragraph_input_df['split'].astype('category')

In [16]:
paragraph_input_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4259 entries, 0 to 4258
Data columns (total 5 columns):
acc_id       4259 non-null object
para_text    4259 non-null object
len          4259 non-null int64
split        4259 non-null category
label        4259 non-null int64
dtypes: category(1), int64(2), object(2)
memory usage: 170.6+ KB


In [17]:
paragraph_input_df.head(2)

Unnamed: 0,acc_id,para_text,len,split,label
0,0001193125-17-319357,"During 1998, we announced a program permitting...",943,train,1
1,0001193125-17-319357,"Subsequent to August 26, 2017, we have repurch...",288,train,1


In [49]:
# After reading back in, the html needs to be parsed again
tbl_html_df = pd.read_csv('tbl_html_df.csv')
tbl_html_df['tbl_html'] = tbl_html_df.tbl_html.apply(lambda x: bs(x, 'lxml'))

### Assign documents to positive and negative lists based on regex; also clean text

In [19]:
pos_list = paragraph_input_df.para_text[paragraph_input_df.label == 1].values.tolist()
neg_list = paragraph_input_df.para_text[paragraph_input_df.label == 0].values.tolist()
pos_key_list = paragraph_input_df.acc_id[paragraph_input_df.label == 1].values.tolist()
neg_key_list = paragraph_input_df.acc_id[paragraph_input_df.label == 0].values.tolist()

pos_list_train = [pos_list[i] for i,v in enumerate(pos_key_list) if v not in val_accession_ids]
pos_list_val = [pos_list[i] for i,v in enumerate(pos_key_list) if v in val_accession_ids]
neg_list_train = [neg_list[i] for i,v in enumerate(neg_key_list) if v not in val_accession_ids]
neg_list_val = [neg_list[i] for i,v in enumerate(neg_key_list) if v in val_accession_ids]

In [21]:
pos_tup_train = [(pos_key_list[i], pos_list[i]) for i,v in enumerate(pos_key_list) if v not in val_accession_ids]
pos_tup_val = [(pos_key_list[i], pos_list[i]) for i,v in enumerate(pos_key_list) if v in val_accession_ids]
neg_tup_train = [(neg_key_list[i], neg_list[i]) for i,v in enumerate(neg_key_list) if v not in val_accession_ids]
neg_tup_val = [(neg_key_list[i], neg_list[i]) for i,v in enumerate(neg_key_list) if v  in val_accession_ids]

In [22]:
neg_key_list_train, neg_list_train = ([k[0] for k in neg_tup_train], [k[1] for k in neg_tup_train])
neg_key_list_val, neg_list_val = ([k[0] for k in neg_tup_val], [k[1] for k in neg_tup_val])
pos_key_list_train, pos_list_train = ([k[0] for k in pos_tup_train], [k[1] for k in pos_tup_train])
pos_key_list_val, pos_list_val = ([k[0] for k in pos_tup_val], [k[1] for k in pos_tup_val])

54 keys keys missed out of 300 covered  (11 if table texts are included) (2 only had table paragraphs)

In [23]:
print('total paragraphs with positive hits: ' + str(len(pos_list)))
print('training paragraphs with positive hits: ' + str(len(pos_list_train)))
print('training keys with positive hits: ' + str(len(set(pos_key_list_train))) + ' out of 300')
print('validation keys with positive hits: ' + str(len(set(pos_key_list_val))) + ' out of 135')

total paragraphs with positive hits: 1098
training paragraphs with positive hits: 764
training keys with positive hits: 246 out of 300
validation keys with positive hits: 112 out of 135


In [24]:
print('total paragraphs with no regex match: ' + str(len(neg_list)))
print('training paragraphs with no regex match: ' + str(len(neg_list_train)))
print('training keys with paragraphs labeled negative: ' + str(len(set(neg_key_list_train))) + ' out of 300')
print('validation keys with paragraphs labeled negative: ' + str(len(set(neg_key_list_val))) + ' out of 135')

total paragraphs with no regex match: 3161
training paragraphs with no regex match: 2240
training keys with paragraphs labeled negative: 294 out of 300
validation keys with paragraphs labeled negative: 133 out of 135


List of keys with no paragraphs flagged as relevant by the regex

In [26]:
missed_keys = [key for key in neg_key_list if key not in pos_key_list ]

Used for manually labeling documents as relevant or not

In [27]:
def print_docs_from_list(key_list: list=neg_key_list, doc_list: list=neg_list, start: int=0, ndocs: int=50):
    end = start + ndocs
    for idx, tup in enumerate(zip(key_list[start:end], doc_list[start:end])):
        print(str(idx + start) + '   ------   ' + str(tup[0]))
        print(tup[1])

In [28]:
false_pos_indices = []
false_neg_indices = []

In [31]:
# After reading back in, the html needs to be parsed again
tbl_html_df = pd.read_csv('data/tbl_html_df.csv')
tbl_html_df['tbl_html'] = tbl_html_df.tbl_html.apply(lambda x: bs(x, 'lxml'))

In [32]:
false_pos = []; false_neg = [];
false_pos = [pos_list[:200][i] for i in false_pos_indices  ]
false_neg = [neg_list[:200][i] for i in false_neg_indices  ]

In [33]:
pos_labeled = [x for x in pos_list[:200] if x not in false_pos] + false_neg
neg_labeled = [x for x in neg_list[:200] if x not in false_neg] + false_pos
train_labeled = pos_labeled + neg_labeled

## Building labeled training set for first document classifier

### Find the tokens that best identify misses from the regex

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer

from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer

import numpy as np

Tokenize dollar values as dollar tokens, numerics as numeric tokens, but leave years as they are

In [35]:
year_pat = re.compile(r"([^0-9])((?:20|19)[0,1,2,9][0-9])([^0-9])")
dollar_pat = re.compile(r"[$]([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}")
num_pat = re.compile(
r"(y(?:20|19)[0,1,2,9][0-9])|(?:(?P<bound1>[\s,.])(?:(?:[0-9]{1,3}[,])*(?:[0-9]{1,3}))(?:[.][0-9]{1,4})?[%]?(?P<bound2>[\s,.]))")
year_fix_pat = re.compile(r"y((?:20|19)[0,1,2,9][0-9])num_tok")

def replace_numeric_toks(s):
    s1 = re.sub(year_pat, r'\1y\2\3', s )
    s2 = re.sub(dollar_pat, r'dollar_tok', s1)
    s3 = re.sub(num_pat, r'\1\g<bound1>num_tok\g<bound2>', s2)
    s4 = re.sub(year_fix_pat, r'\1', s3)
    return s4

In [64]:
#train_labeled_tok_ready = [replace_numeric_toks(x) for x in train_labeled]
#y = np.array(list(np.repeat(1, len(pos_labeled))) + list(np.repeat(0, len(neg_labeled))))

In [36]:
X = [replace_numeric_toks(x) for x in pos_list + neg_list]
X_train = [replace_numeric_toks(x) for x in pos_list_train + neg_list_train]
X_val = [replace_numeric_toks(x) for x in pos_list_val + neg_list_val]
y = np.array(list(np.repeat(1, len(pos_list))) + list(np.repeat(0, len(neg_list))))
y_train = np.array(list(np.repeat(1, len(pos_list_train))) + list(np.repeat(0, len(neg_list_train))))
y_val = np.array(list(np.repeat(1, len(pos_list_val))) + list(np.repeat(0, len(neg_list_val))))

Function to cross-validate models

In [66]:
def cv_acc(model, X, y, cv=5, scoring='accuracy'):
    cv_dict = {}
    cvs = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    cv_dict['cv_mean'] = np.mean(cvs)
    cv_dict['cvs'] = cvs
    return cv_dict

In [67]:
model_params_dict = {}
model_cvs = {}

In [68]:
count_vec = CountVectorizer(ngram_range=(1,6), max_df=0.6, min_df=.025, max_features=5000)
bin_vec = CountVectorizer(ngram_range=(1,6), max_df=0.2, min_df=.01, binary=True, max_features=5000)
tfidf_vec = TfidfVectorizer(ngram_range=(1,6), max_df=0.62, min_df=.025, max_features=5000)

# Define a pipeline combining a text feature extractor with a simple
# classifier

# Logistic Regression 
lr_tfidf_pl = Pipeline([
    ('vec', tfidf_vec),
    ('lr', LogisticRegression(random_state=14, max_iter=1000))
])

lr_bin_pl = Pipeline([
    ('vec', bin_vec),
    ('lr', LogisticRegression(random_state=14, max_iter=1000))
])

In [69]:
param_grid_bin = {
    'vec__max_df': (0.15, .2, .25),
    'vec__min_df': (.005, .01, .015),
    'vec__ngram_range' : [(1,6)]
}

In [70]:
param_grid_tfidf = {
    'vec__max_df': (.4, 0.5, .6, .7,),
    'vec__min_df': (.005, .01, .015, .025),
    'vec__ngram_range' : [(1,6)]
}

In [71]:
lr_tfidf_gs = GridSearchCV(lr_tfidf_pl, param_grid=param_grid_tfidf, cv=4, return_train_score=True)
lr_bin_gs = GridSearchCV(lr_bin_pl, param_grid=param_grid_bin, cv=4, return_train_score=True)

In [72]:
lr_bin_gs.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=5000, min_df=0.01,
        ngram_range=(1, 6), preprocessor=None, stop_words=None,
        stri...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__max_df': (0.15, 0.2, 0.25), 'vec__min_df': (0.005, 0.01, 0.015), 'vec__ngram_range': [(1, 6)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [73]:
lr_tfidf_gs.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.62, max_features=5000, min_df=0.025,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__max_df': (0.4, 0.5, 0.6, 0.7), 'vec__min_df': (0.005, 0.01, 0.015, 0.025), 'vec__ngram_range': [(1, 6), (1, 7)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [74]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
model_params_dict['lr_bin'] = lr_bin_gs.best_params_
lr_bin_gs.best_params_

{'vec__max_df': 0.15, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 6)}

In [76]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
model_params_dict['lr_tfidf'] = lr_tfidf_gs.best_params_
lr_tfidf_gs.best_params_

{'vec__max_df': 0.7, 'vec__min_df': 0.01, 'vec__ngram_range': (1, 6)}

In [77]:
lr_tfidf_gs.cv_results_['mean_test_score'].max()

0.8868175765645806

In [78]:
lr_bin_gs.cv_results_['mean_test_score'].max()

0.9171105193075899

In [184]:
# Manually entering the parameters already discovered via GridsearchCV
model_params_dict['lr_bin'] = {'vec__max_df': 0.15, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 6)}

Set the model params using the dictionary. The `**` unpacks the dictionary into the key-value pairs - essentially, it removes the curly braces.

In [80]:
lr_bin_pl.set_params(**model_params_dict['lr_bin'])
lr_bin_pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.15, max_features=5000, min_df=0.005,
        ngram_range=(1, 6), preprocessor=None, stop_words=None,
        st...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [82]:
lr_tfidf_pl.set_params(**model_params_dict['lr_tfidf'])
lr_tfidf_pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=5000, min_df=0.01,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

Get and print model cross validation scores:

In [84]:
model_cvs['lr_bin'] = cv_acc(lr_bin_pl, X_train, y_train)
print(model_cvs['lr_bin'])

{'cv_mean': 0.9187692734331669, 'cvs': array([0.92678869, 0.90682196, 0.93178037, 0.92678869, 0.90166667])}


Against the unseen holdout/validation set: Basically the same as the CV folds

In [83]:
lr_bin_pl.score(X_val, y_val)

0.9051792828685259

In [86]:
model_cvs['lr_tfidf'] = cv_acc(lr_tfidf_pl, X_train, y_train)
print(model_cvs['lr_tfidf'])

{'cv_mean': 0.8888053244592345, 'cvs': array([0.91181364, 0.86189684, 0.90682196, 0.90349418, 0.86      ])}


In [87]:
y_pred = lr_bin_pl.predict(X)
y_val_pred = lr_bin_pl.predict(X_val)
y_proba = lr_bin_pl.predict_proba(X)

In [88]:
misses = y_pred != y
#X_test_misses = [X_test[i] for i in range(0, len(misses)) if misses[i] == True]
X_misses = [X[i] for i in range(0, len(misses)) if misses[i] == True]
len(X_misses)

120

Second element in each row has the probability of y=1, according to the LR model

In [191]:
y_proba[:,1].tolist()[1]

0.7600372060407212

In [89]:
for idx, tup in enumerate(zip(X_misses, y_proba[misses][1].tolist(), y[misses])):
    print("--------------------    " + str(idx) + "    --------------------" )
    print("Labeled value :  " + str(tup[2]))
    print("Predicted probability :  " + str(tup[1]))
    print(tup[0])
    if idx > 9:
        break

--------------------    0    --------------------
True value :  1
Predicted probability :  0.8921753830869105
Snap-on has undertaken repurchases of Snap-on common stock from time to time to offset dilution created by shares issued for employee and franchisee stock purchase plans, stock awards and other corporate purposes. Snap-on repurchased num_tok shares, num_tok shares and num_tok shares in 2017, 2016 and 2015, respectively. As of 2017 year end, Snap-on has remaining availability to repurchase up to an additional dollar_tok million in common stock pursuant to Board authorizations. The purchase of Snap-on common stock is at the company's discretion, subject to prevailing financial and market conditions.
--------------------    1    --------------------
True value :  1
Predicted probability :  0.10782461691308949
On November num_tok, 2016, we announced plans to purchase up to dollar_tok billion of our common stock through 2019. On March num_tok, 2017, we announced plans to double our 

In [90]:
print(classification_report(y_val, y_val_pred))

             precision    recall  f1-score   support

          0       0.92      0.96      0.94       921
          1       0.86      0.76      0.81       334

avg / total       0.90      0.91      0.90      1255



In [93]:
coef_tfidf_df = pd.DataFrame.from_records(list(zip(lr_tfidf_pl.named_steps.vec.get_feature_names(), 
                                             lr_tfidf_pl.named_steps.lr.coef_.tolist()[0])),
                                   columns = ['tfidf_token', 'tfidf_coef']).sort_values('tfidf_coef', ascending=False).reset_index(drop=True)

In [94]:
coef_bin_df = pd.DataFrame.from_records(list(zip(lr_bin_pl.named_steps.vec.get_feature_names(), 
                                             lr_bin_pl.named_steps.lr.coef_.tolist()[0])),
                                   columns = ['token', 'coef']).sort_values('coef', ascending=False).reset_index(drop=True)

In [95]:
pd.concat([coef_tfidf_df.head(20), coef_bin_df.head(20)], axis=1)

Unnamed: 0,tfidf_token,tfidf_coef,token,coef
0,repurchased num_tok,3.032666,accelerated share,2.183093
1,accelerated share,2.97167,accelerated share repurchase,1.975529
2,accelerated share repurchase,2.772415,repurchased num_tok,1.936068
3,accelerated,2.380541,authorized share repurchase,1.421375
4,repurchased num_tok million,2.228513,repurchased num_tok million shares,1.242539
5,repurchased num_tok million shares,2.047673,accelerated,1.171833
6,num_tok million shares,1.948307,repurchased num_tok shares,1.13184
7,million shares,1.946904,repurchased num_tok million,1.125678
8,we repurchased num_tok,1.751198,authorized share,1.096774
9,authorized share repurchase,1.744254,authorized the repurchase,1.053928


In [96]:
pd.concat([coef_tfidf_df.tail(20).reset_index(drop=True), coef_bin_df.tail(20).reset_index(drop=True)], axis=1)

Unnamed: 0,tfidf_token,tfidf_coef,token,coef
0,issued,-0.62143,over,-0.388198
1,following,-0.623518,at the,-0.389273
2,income,-0.639322,transaction,-0.404682
3,on the,-0.650178,the following,-0.407363
4,november num_tok 2017,-0.654391,discretion,-0.409595
5,and dollar_tok,-0.666065,november num_tok 2017,-0.434913
6,table,-0.677647,million of common,-0.44184
7,by the,-0.679719,dollar_tok million of common,-0.444438
8,million remaining,-0.680569,2017 and,-0.455317
9,capacity,-0.699376,of shares,-0.470325


In [98]:
paragraph_input_df['rel_proba'] = y_proba[:,1].tolist()

In [99]:
paragraph_input_df.head()

Unnamed: 0,acc_id,para_text,len,split,label,rel_proba
0,0000002969-17-000039,"On 15 September 2011, the Board of Directors a...",654,train,1,0.999732
1,0000002969-17-000039,"On 15 September 2011, the Board of Directors a...",282,train,1,0.998451
2,0000002969-17-000039,"On 15 September 2011, the Board of Directors a...",442,train,1,0.997926
3,0000004127-17-000033,(1)The share repurchase program approved by th...,369,train,1,0.996196
4,0000004127-17-000033,"(3) 600,000 shares were repurchased at an aver...",235,train,1,0.920757


# Tokenize into sentences and classify

In [218]:
srp_df[srp_df.data_key_friendly_name == 'Share Repurchase Authorization Date'].text.apply(replace_numeric_toks).value_counts().sort_index()

15 September 2011           1
1986                        1
1999                        1
2001                        1
2004                        1
2009                        1
2012                        1
2013                        2
2015                        1
2016                        3
2017                        2
April 2016                  1
April 2017                  7
April num_tok, 2014         1
April num_tok, 2015         4
April num_tok, 2016         4
April num_tok, 2017         4
August 2004                 2
August 2015                 2
August 2016                 2
August 2017                 3
August num_tok, 2000        1
August num_tok, 2015        2
August num_tok, 2016        2
August num_tok, 2017        3
December 2013               2
December 2014               2
December 2015               1
December 2016               6
December 2017              14
                           ..
November 2016               8
November 2017               7
November n

In [108]:
auth_date_pat_list = [
r"(jan\w+|feb\w+|march|april|may|june|july|aug\w+|sept\w+|octob\w+|nov\w+|decem\w+) (20)?[0-3]?[0-9],( 20[1-2][0-9][,])?",
#r"[0-3][0-9][,] (jan\w+|feb\w+|march|april|may|june|july|aug\w+|sept\w+|nov\w+|decem\w+)( 20[1-2][0-9])?",
r"(20|19)[9,0,1][0-9][,]?((\w+|[()-,])* )*((announc\w*|author\w*|approv\w*) )((\w+|[()-,])* )*repurchase ((\w+|[()-,])* )*((our|common|outstanding) (shar(e|es)|stock))" 
    #r"((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion )?", 
]

auth_date_regs = [re.compile(x, re.I) for x in auth_date_pat_list]

In [100]:
para_list = paragraph_input_df.para_text.values.tolist()

In [107]:
def month_tok(s):
    """Replace month names with month_tok. """
    month_pat = re.compile(r"jan\w+|feb\w+|march|april|june|july|aug\w+|sept\w+|octob\w+|nov\w+|decem\w+", re.I)
    may_pat = re.compile(r"\WM(ay|AY)\W")
    s_tok = re.sub(month_pat, "month_tok", s)
    s_tok = re.sub(may_pat, "month_tok", s_tok)
    return s_tok


In [111]:
type(auth_date_regs[0])

_sre.SRE_Pattern

In [None]:
%time len([any(reg.search(x) for reg in auth_date_regs) for x in  para_list[:100]])

In [None]:
%time len(x if any(reg.search(x))  for x in  para_list[:1] for reg in auth_date_regs)

In [220]:

len(paragraph_input_df.groupby('acc_id').first())

434

In [None]:
paragraph_input_df['auth_date_hit'] = paragraph_input_df.para_text.apply(lambda x: any(reg.search(x) for reg in auth_date_regs))

In [None]:
reg_bs = paragraph_input_df.para_text.apply(lambda x: any(reg.search(x) for reg in auth_date_regs))
len(paragraph_input_df[reg_bs].groupby('acc_id').first())

#### Labels vs. model output

In [106]:
print_row_detail(df=paragraph_input_df, nrow=20, header_list = ['acc_id'],
                    detail_list = ['label', 'rel_proba', 'para_text'],
                    sortby=['acc_id', 'rel_proba'], ascending=True)

----------------------------------- 0000002969-17-000039 -----------------------------------
label  :1

rel_proba  :0.9979259632276545

para_text  :On 15 September 2011, the Board of Directors authorized the repurchase of up to $1,000 of our outstanding common stock. We repurchase shares pursuant to Rules 10b5-1 and 10b-18 under the Securities Exchange Act of 1934, as amended, through repurchase agreements established with several brokers. We did not purchase any of our outstanding shares during fiscal year 2017. At 30 September 2017, $485.3 in share repurchase authorization remains.

----------------------------------- 0000002969-17-000039 -----------------------------------
label  :1

rel_proba  :0.998451479822388

para_text  :On 15 September 2011, the Board of Directors authorized the repurchase of up to $1,000 of our outstanding common stock. We did not purchase any of our outstanding shares during fiscal years 2017, 2016 or 2015 . At 30 September 2017, $485.3 in share repurchase a

In [None]:
sent_pat_list = [r"Board( of Directors)?( has)? authorized the repurchase", 
r"(on|in|at|as of) (jan\w+|feb\w+|march|april|may|june|july|august|sept\w+|novem\w+|decem\w+) (20)?[0-9]{1,2}, (20[1-2][0-9][,] )?((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion )?", 
r"((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion)?",       
r"Board( of Directors)?( has)? (approved|authorized) a( new)? (share|stock) repurchase program", 
r"Board( of Directors)? approved an( additional)? increase in the stock repurchase",
r"share repurchase authorization by the board",
r"shares (rep|p)urchased as Part of Public", 
r"accelerated share repurchase",     
r"(re)?purchase(d)?[,]? (up to )?(an aggregate |in aggregate, |a )?(total )?(of )?((up to|approximately) )?(([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?)(and (([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?))?(shares )?of ((our|its|the|/w+[']s) ){1,2}common stock",

r"(authorized|approved) a share repurchase program", 
r"authorized the repurchase of (shares|up to)", 
r"authorized repurchases of up to ([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )shares",
r"authorized share repurchase program", 
r"authorization replace(d|s) (the|all|any) prior repurchase authorization", 
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion)? shares were repurchased", 
r"(approximately )?[$](([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)? to repurchase (approximately )?[$]?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )?(shares )?of (\w+([']s)? )?Common Stock",
r"repurchased ([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) (million )?(common )?shares", 
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )? stock repurchase",
r"(Company|we) (repurchased|purchased) (approximately )?([$])?([0-9]{,4}[.]?[0-9]{,4}) million shares", 
r"we did not repurchase any shares", 

r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?(of our common stock )?remain((s|ing|ed) )?under ((the|our|publicly announced) ){0,3}((authoriz/w+|program(s)?|share|repurchase)\s?){1,4}",
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?(of our common stock )?remain((s|ing|ed) )?to be repurchase",
r"million common shares remaining under the", 
r"remai(n|ned|ning) under (our|the) share repurchase (authorization|program)", 
r"The Company currently plans to (rep|p)urchase ([$]?)([0-9]{,4}[.]?[0-9]{,4}) ((m|b)illion )?(to ([$]?)([0-9]{,4}[.]?[0-9]{,4}) (m|b)illion )?(shares|of its common stock)",
r"(approximately )?([$]?)([0-9]{1,4}[.]?[0-9]{0,4}) (m|b)illion ((shares|of|our|common|stock) ){0,5}remai(ned|n) (available|under the authorization|authorized)",
r"ha(d|s) (approximately )([$]?)([0-9]{1,4}[.]?[0-9]{0,4}) (m|b)illion remaining under ((the|this )?repurchase authorization",
        
           ]

r"(shares|amount) ((available|remaining) )((for|under|the|share) ){1,5}repurchase ((\w+)[,]? ){0,4}share repurchase((\w+)[,]? ){1,6}([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)?( share(s)?)?",            
r"share repurchase authorization remain|remained|by the board)",
r"Shares (remaining )?that May Yet Be Purchased", 
remain_reg_list = [re.compile(x, re.I) for x in pat_list]