In [1]:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [9]:
import sys
if sys.platform == "linux":
    sys.path.append('../')

    MODEL_ROOT = '../../models/'
    NLP_MODEL_ROOT = '../../nlp_models/'

In [11]:
pwd

'/media/swimmers3/ferrari_06/repo/billuminate/src/notebooks'

In [8]:
from

[0m[34;42mconda_envs[0m/                 [01;32mdump.sql[0m*    [01;32mREADME.md[0m*
[01;32mcongress_bills_schema.sql[0m*  [34;42mnlp_models[0m/  [34;42msrc[0m/


In [4]:
import os

import sqlalchemy
import sqlalchemy_utils

import numpy as np
import pandas as pd

import re
import ast
import json
import string
import xml.etree.ElementTree as ET

import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

In [5]:
import nltk

import spacy
nlp = spacy.load('en_core_web_lg')

### [File Schema Explanation](https://github.com/usgpo/bill-status/blob/master/BILLSTATUS-XML_User_User-Guide.md#3.-Action-Code-Element-Possible-Values)

### Helpful functions.

In [None]:
from

In [1]:
def select_random_rows(df, n_rows):
    ixs = np.random.choice(df.index.values, n_rows)
    df = df.reindex(ixs)
    return df

## Connect to Database.

In [4]:
# Connect to db wiht sqlalchemy
dbname = 'congressional_bills'
username = 'melissaferrari'
engine = sqlalchemy.create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://melissaferrari@localhost/congressional_bills


### Retrieve data.

In [5]:
bills_info = pd.read_sql_table('bills', con=engine)

In [6]:
bills_info.head(1)

Unnamed: 0,id,official_title,popular_title,url,bill_type,status_at,by_request,sponsor,updated_at,status,number,subjects_top_term,bill_id,introduced_at,congress,short_title
0,5529,To require the Securities and Exchange Commiss...,,https://www.gpo.gov/fdsys/bulkdata/BILLSTATUS/...,0,2015-03-03,False,153,2017-12-11 19:28:52,4.0,1216,Finance and financial sector,hr1216-114,2015-03-03,114,Maker-Taker Conflict of Interest Reform Act of...


In [7]:
query = """
        SELECT 
        bi.bill_id,
        sb.subject,
        bi.subjects_top_term,
        bi.official_title,
        bi.short_title,
        sb.bill_ix
        FROM subjects sb
        INNER JOIN bills bi
        ON sb.bill_ix=bi.id
        ;
        """
subject_table = pd.read_sql_query(query, engine)



In [8]:
subject_table.head(1)

Unnamed: 0,bill_id,subject,subjects_top_term,official_title,short_title,bill_ix
0,hr4764-114,Armed forces and national security,Armed forces and national security,To direct the Secretary of Veterans Affairs to...,Puppies Assisting Wounded Servicemembers (PAWS...,5530


In [9]:
query = """
        SELECT 
        bi.bill_id,
        sm.text AS summary_text,
        bt.text AS full_text,

        bi.subjects_top_term,
        bi.official_title,
        bi.short_title,

        bv.code,
        sm.as as summary_as,
        sm.date as summary_date,
        sm.bill_ix
        FROM summaries sm
        
        INNER JOIN bill_text bt
        ON sm.bill_ix=bt.bill_ix
        
        INNER JOIN bill_versions bv
        ON bv.id=bt.bill_version_id
        
        INNER JOIN bills bi
        ON sm.bill_ix=bi.id
        ;
        """
bill_join = pd.read_sql_query(query, engine)

In [10]:
bill_join.head(1)

Unnamed: 0,bill_id,summary_text,full_text,subjects_top_term,official_title,short_title,code,summary_as,summary_date,bill_ix
0,hr1617-113,Emergency Jobs to Restore the American Dream A...,"<?xml-stylesheet type=""text/xsl"" href=""billres...",Labor and employment,To create an emergency jobs program that will ...,Emergency Jobs to Restore the American Dream Act,IH,Introduced in House,2013-04-18,27079


In [11]:
len(bill_join)

30669

### Filtering correct bill version.
For each `bill_id` there are multiple versions of bill `full_text` indicated by the `code`, yet only one `summary_text` for each. <br>
For simplicitly, I would only like to consider the most recent bill text because that is likely to correspond to the version of the summary text provided. I have tried to determine the ordering of the codes from **most recent** to **least recent** so that I can select the most recent bill.

In [12]:
print('There are {} unique bills being analyzed and {} rows in the table'.format(bill_join.bill_id.nunique(), len(bill_join)))

There are 23795 unique bills being analyzed and 30669 rows in the table


In [13]:
def _return_correct_bill_version(df_bills,
                                 as_dict=False,
                                 code_order=['ENR', 'EAS', 'EAH', 'RS', 'ES', 'PCS', 'EH', 'RH', 'IS', 'IH']):
    # To create a 1-to-1 mapping of bill text and summaries
    # by choosing most recent bill text version
    
    num_rows = len(df_bills)
    if num_rows == 0:
        raise Exception('Oh no! This bill is not in the database.')
    elif num_rows > 1:
        code = next(i for i in code_order if i in df_bills['code'].unique())
        df_bills = df_bills[df_bills['code'] == code]
    if as_dict:
        return df_bills.iloc[0].to_dict()
    else:
        return df_bills

In [14]:
bill_join = bill_join.groupby('bill_id', group_keys=False).apply(lambda x: _return_correct_bill_version(x))

In [15]:
print('There are {} unique bills being analyzed and {} rows in the table'.format(bill_join.bill_id.nunique(), len(bill_join)))

There are 23795 unique bills being analyzed and 23825 rows in the table


#### ERROR:
For some reason some of the bill texts do not match their version number and they are not removed from the filtering. <br>
**FIX IN DATABASE**

In [16]:
group_sizes = bill_join.groupby('bill_id').size()
duplicates = bill_join[bill_join.bill_id.isin(group_sizes[group_sizes > 1].index)]

In [17]:
print('There are {} duplicates'.format(int(len(duplicates)/2)))

There are 30 duplicates


The error is in the `full_text` column.

In [18]:
duplicates.head(4)

Unnamed: 0,bill_id,summary_text,full_text,subjects_top_term,official_title,short_title,code,summary_as,summary_date,bill_ix
29563,hr1026-115,North Country National Scenic Trail Route Adju...,"<?xml-stylesheet type=""text/xsl"" href=""billres...",Public lands and natural resources,To revise the authorized route of the North Co...,North Country National Scenic Trail Route Adju...,PCS,Reported to House with amendment(s),2018-05-10,23945
29564,hr1026-115,North Country National Scenic Trail Route Adju...,"<?xml-stylesheet type=""text/xsl"" href=""billres...",Public lands and natural resources,To revise the authorized route of the North Co...,North Country National Scenic Trail Route Adju...,PCS,Reported to House with amendment(s),2018-05-10,23945
14170,hr1117-115,(This measure has not been amended since it wa...,"<?xml-stylesheet type=""text/xsl"" href=""billres...",Emergency management,To require the Administrator of the Federal Em...,,ENR,Public Law,2017-10-19,22163
14171,hr1117-115,(This measure has not been amended since it wa...,"<?xml-stylesheet type=""text/xsl"" href=""billres...",Emergency management,To require the Administrator of the Federal Em...,,ENR,Public Law,2017-10-19,22163


SyntaxError: unexpected EOF while parsing (<ipython-input-657-3be6a1f998ce>, line 1)

In [18]:
dup = duplicates[['full_text', 'code']].reset_index().values
dup[:,1] = list(map(lambda x: x.split('bill-stage="')[1].split('"')[0][0], dup[:,1]))
dup[:,2] = list(map(lambda x: x[0], dup[:,2]))

In [19]:
bad_ixs = dup[:,0][dup[:,1] != dup[:,2]]
good_ixs = dup[:,0][dup[:,1] == dup[:,2]]

In [20]:
bad_ixs = dup[:,0][dup[:,1] != dup[:,2]]
good_ixs = dup[:,0][dup[:,1] == dup[:,2]]
bill_join = bill_join[~bill_join.index.isin(bad_ixs)]
print('There are {} unique bills being analyzed and {} rows in the table'.format(bill_join.bill_id.nunique(), len(bill_join)))

There are 23795 unique bills being analyzed and 23795 rows in the table


### Subjects

In [151]:
bill = bill_join[bill_join.bill_id == 'hr664-115'].iloc[0].to_dict()

In [152]:
bill_id = bill['bill_id']
summary_text = bill['summary_text']
full_xml = bill['full_text']
subject = bill['subjects_top_term']
official_title = bill['official_title']
short_title = bill['short_title']
code = bill['code']

In [153]:
bill_id

'hr664-115'

In [1789]:
print(summary_text)

Stem the Tide of Overdose Prevalence from Opiate Drugs Act of 2017 or as the STOP OD Act of 2017

This bill permits the Centers for Disease Control and Prevention (CDC) to award grants: (1) to expand educational efforts to prevent abuse of opioids, which are drugs with effects similar to opium, such as heroin; (2) to promote treatment of persons who abuse opioids; and (3) to promote understanding of addiction.

The Department of Health and Human Services (HHS) may award grants to: (1) support first responders carrying and administering naloxone, which is a prescription drug used to rapidly reverse an opioid overdose; (2) establish processes for referral to treatment for opioid abuse; and (3) reimburse for testing for fentanyl in opioid overdoses and reporting the results to the CDC.

This bill amends the Controlled Substances Act to impose a fee on persons convicted of drug offenses. Collected amounts are made available for the HHS grants in this bill.

Specified agencies must submit t

### NOTES:
The bill `hr664-115` is a shady bill! It has the words "and other purposes" in the title.

In [423]:
bill_sub = list(subject_table[subject_table.bill_id == 'hr664-115'].subject.values)

In [1790]:
bill_sub = list(subject_table[subject_table.bill_id == bill_id].subject.values)

In [1791]:
print('Official title: \n{}'.format(official_title))
print('\nShort title: \n{}'.format(short_title))
print('\nMain subject: \n{}'.format(subject))
print('\nSub-subjects:')
print(*bill_sub, sep='\n')

Official title: 
To prevent the abuse of opiates, to improve response and treatment for the abuse of opiates and related overdoses, and for other purposes.

Short title: 
Stem the Tide of Overdose Prevalence from Opiate Drugs Act of 2017

Main subject: 
Health

Sub-subjects:
Civil actions and liability
Computer security and identity theft
Computers and information technology
Congressional oversight
Criminal procedure and sentencing
Drug therapy
Drug trafficking and controlled substances
Drug, alcohol, tobacco use
Emergency medical services and trauma care
Executive agency funding and structure
First responders and emergency personnel
Government information and archives
Government studies and investigations
Health
Health personnel
Health programs administration and funding
Health promotion and preventive care
Performance measurement


### Analyze full text structure.

#### Understanding xml tags.
`section` <br>
`subsection == (a), (b), ...`  **OR** `paragraph == (1), (2), ...`  
`subparagraph == (A), (B), ...`  <br>
`clause == (i), (ii), ...`  <br>
`subclause == (I), (II), ...`  <br>
`item == (aa), (bb), ...` <br>
`subitem == (AA), (BB), ...` <br>
`subsubitem == (aaa), (bbb), ...` <br>

In [660]:
if not duplicates.empty:
    print('yes')

yes


In [21]:
tag_rankings = {'bill':0, 'title': 1, 'section':2, 'subsection':3, 'paragraph':4, 
                'subparagraph':5, 'clause':6, 'subclause':7, 'item':8, 'subitem':9, 'subsubitem':10}

In [22]:
def _remove_whitespace(sentence_list):    
    white_space = list(string.whitespace)[1:]
    for ix in range(len(sentence_list)):
        for bad_string in white_space:
            if bad_string in sentence_list[ix]:
                sentence_list[ix] = sentence_list[ix].replace(bad_string, "")
    return sentence_list

In [23]:
def bill_from_xml(xml_string):
    
    xml_string = _remove_whitespace([xml_string])[0]

    match = re.search(r'<external-xref(.*?)>', xml_string)
    while match:
        start, end = (match.start(), match.end())
        xml_string = xml_string.replace(xml_string[start:end], "")
        match = re.search(r'<external-xref(.*?)>', xml_string)
    
    xml_string = xml_string.replace("</external-xref>", "")
    
    split_xml = xml_string.split("<legis-body")
    xml_root = split_xml[0] + "<legis-body"
    xml_string = split_xml[-1]
    
    # Close text tag before external-xref and term to avoid loss of information
    #xml_string = xml_string.replace("<external-xref", "</text><external-xref")
    #xml_string = xml_string.replace("</external-xref>", "</external-xref><text>")
    #xml_string = xml_string.replace("<term>", "</text><term>")
    #xml_string = xml_string.replace("</term>", "</term><text>")

    # Closing text tag didn't work because they weren't always embedded in <text>
    # Need to go back to this when I understand XML trees better. 
    xml_string = xml_string.replace("<quote>", "")
    xml_string = xml_string.replace("</quote>", "")
    xml_string = xml_string.replace("<term>", "")
    xml_string = xml_string.replace("</term>", "")
    
    xml_string = xml_root + xml_string
    txt_tree = ET.ElementTree(ET.fromstring(xml_string))
    txt_root = txt_tree.getroot()

    txt_extract = [[ix, elem.tag, elem.text] for ix, elem
                   in enumerate(txt_root.iter())]
    
    return txt_extract

In [145]:
def _clean_extracted_list(txt_extract,
                          tag_rankings=None):
    
    if not tag_rankings:
         tag_rankings = {'bill':0, 'title': 1, 'section':2, 'subsection':3, 'paragraph':4, 
                'subparagraph':5, 'clause':6, 'subclause':7, 'item':8, 'subitem':9, 'subsubitem':10}

    txt_df = pd.DataFrame(txt_extract)
    txt_df.columns = ['loc_ix', 'tag', 'text']    
    txt_df['tag_rank']  = txt_df['tag'].map(tag_rankings)

    # Drop pagebreak tag bc it causes errors
    txt_df = txt_df[txt_df.tag != 'pagebreak']

    # Drop header section and titles
    ix_min = txt_df[txt_df['tag']=='legis-body'].index.values[0]+1
    txt_df = txt_df.drop(txt_df.index[np.arange(ix_min)])

    # Add enumeration to front of each list item
    num_ixs = txt_df[txt_df['tag']=='enum']['loc_ix'].values
    for ix in num_ixs:
        txt_df.loc[ix+1, 'text'] = txt_df.reindex(range(ix,ix+2))['text'].str.cat(sep=' ')
        txt_df = txt_df.drop(ix)

    ## Concat the quote-blocks
    min_ixs = txt_df[txt_df.tag == 'quoted-block']['loc_ix'].values
    max_ixs = txt_df[txt_df.tag == 'after-quoted-block']['loc_ix'].values

    # Catch quote blocks in quote blocks
    if any(min_ixs[1:] < max_ixs[:-1]):
        for ix in range(len(min_ixs)-1):
            if min_ixs[ix+1] < max_ixs[ix]:
                min_ixs = np.delete(min_ixs, ix+1)
                max_ixs = np.delete(max_ixs, ix)

    for ix_loc in range(len(min_ixs)):
        txt_df.loc[min_ixs[ix_loc], 'text'] = txt_df.reindex(range(min_ixs[ix_loc]+1,max_ixs[ix_loc]+1))['text'].str.cat(sep=' ')
        txt_df = txt_df.drop(np.arange(min_ixs[ix_loc]+1,max_ixs[ix_loc]+1), errors='ignore')

    section_ix = txt_df[txt_df['tag'] == 'section']['loc_ix'].values
    # Collapse section text
    try:
        assert all(txt_df.reindex(section_ix)['tag'] == 'section')
        assert all(txt_df.reindex(section_ix+2)['tag'] == 'header')
        txt_df.loc[section_ix, 'text'] = txt_df.reindex(section_ix+2)['text'].values
        drop_list = np.append(section_ix + 1, section_ix + 2)
        txt_df = txt_df.drop(drop_list, errors='ignore')
    except: 
        diff = 1
        while section_ix.size !=0:
            inds = txt_df.reindex(section_ix+diff).dropna(subset=['loc_ix']).index.values
            if inds.size !=0:
                txt_df.loc[inds - diff, 'text'] = txt_df.reindex(inds)['text'].values
                txt_df = txt_df.drop(inds)
                rm_ix = txt_df.reindex(inds-diff)['loc_ix'].values
                section_ix = np.array(list(filter(lambda x: x not in rm_ix, section_ix)))
            diff += 1

    # Collapse subsection text
    subsection_ix = txt_df[txt_df['tag'] == 'subsection']['loc_ix'].values
    try:
        assert all(txt_df.reindex(subsection_ix)['tag'] == 'subsection')
        assert all(txt_df.reindex(subsection_ix+2)['tag'] == 'header')
        txt_df.loc[subsection_ix, 'text'] = txt_df.reindex(subsection_ix+2)['text'].values
        drop_list = np.append(subsection_ix + 1, subsection_ix + 2)
        txt_df = txt_df.drop(drop_list, errors='ignore')
    except: 
        diff = 1
        while subsection_ix.size !=0:
            inds = txt_df.reindex(subsection_ix+diff).dropna(subset=['loc_ix']).index.values
            if inds.size !=0:
                txt_df.loc[inds - diff, 'text'] = txt_df.reindex(inds)['text'].values
                txt_df = txt_df.drop(inds)
                rm_ix = txt_df.reindex(inds-diff)['loc_ix'].values
                subsection_ix = np.array(list(filter(lambda x: x not in rm_ix, subsection_ix)))
            diff += 1

    # Concat text between ranked tags
    ranked_tags = txt_df.dropna(subset=['tag_rank']).index.values
    ranked_tags = np.append(ranked_tags, max(txt_df.index)+1)
    for ix in range(len(ranked_tags)-1):
        txt_df.loc[ranked_tags[ix], 'text'] = txt_df.reindex(range(ranked_tags[ix],ranked_tags[ix+1]))['text'].str.cat(sep=' ')
        txt_df = txt_df.drop(np.arange(ranked_tags[ix]+1,ranked_tags[ix+1]), errors='ignore')

    # Remove short title if first section
    if 'short title' in txt_df.iloc[0]['text'].lower():
        txt_df = txt_df.drop(txt_df.iloc[0]['loc_ix'])

    return txt_df

#### Running the full text parser.
Make better except message.

In [146]:
# Select random rows to analyze
df = select_random_rows(bill_join, 100)

In [147]:
errors = []
for ix, bill in df.iterrows(): 
    xml_string = bill['full_text']
    try:
        txt_extract = bill_from_xml(xml_string)
        txt_df = _clean_extracted_list(txt_extract, tag_rankings=None)
    except:
        print(bill['bill_id'])
        errors.append(bill['bill_id'])

hr3939-113


## Analyze summary text.

In [233]:
def _tokenize_sentences(txt_string, nlp):
    doc = nlp(txt_string)
    txt_sent = [sent.string.strip() for sent in doc.sents]
    tokens = [token.text for token in doc if not token.is_stop]
    return doc, txt_sent

def tokenize_summ(summary_text, nlp, short_title):
    _, summ_sent = _tokenize_sentences(summary_text, nlp)
    if short_title.lower() in summ_sent[0].lower():
        summ_sent = summ_sent[1:]
    return summ_sent

def _apply_text_cleaning(sent):

    sent_clean = [_general_text_cleaning(s) for s in sent]
    sent_clean = _remove_punct(sent_clean)
    sent_clean = _make_lowercase(sent_clean)

    return sent_clean

def _remove_punct(sentences):
    # remove punctuations and special characters
    regex = re.compile(r"[^a-zA-Z0-9]")
    return [regex.sub(" ", s) for s in sentences]

def _make_lowercase(sentences):
    return [s.lower() for s in sentences]

def _general_text_cleaning(text):

    text = re.sub("\'s", "", text)
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)

    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)

    # remove comma between numbers    
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    
    # for hyphenated
    text = re.sub("[a-zA-Z0-9\-]*-[a-zA-Z0-9\-]*", "".join(text.split("-")) , text)
    
    # the single 's' in this stage is 99% of not clean text, just kill it
    text = re.sub(' s ', " ", text)
    
    # reduce extra spaces into single spaces
    text = re.sub('[\s]+', " ", text)
    
    return text

In [259]:
summ_sent = tokenize_summ(summary_text, nlp, short_title)
summ_sent_clean = _apply_text_cleaning(summ_sent)

### Compare summary and full text.

In [242]:
bill_id

'hr664-115'

In [241]:
xml_string = bill['full_text']

In [243]:
txt_extract = bill_from_xml(xml_string)

In [409]:
txt_extract = bill_from_xml(xml_string)
txt_df = _clean_extracted_list(txt_extract, tag_rankings=None)
txt_dflow = pd.DataFrame(txt_df[txt_df['tag_rank']>2]['text'], index=txt_df.index)
full_sent = list(txt_dflow.replace(np.nan, '', regex=True)['text'].values)
full_sent_clean = _apply_text_cleaning(full_sent)

### Try ROUGE and Fuzzy.

In [288]:
from fuzzywuzzy import fuzz

def _create_fuzzy_mat(sent_list_1, sent_list_2):
    fuzzy_mat = np.zeros([len(sent_list_1), len(sent_list_2)])
    for i in range(len(sent_list_1)):
        for j in range(len(sent_list_2)):
            fuzzy_mat[i][j] = fuzz.ratio(sent_list_1[i], sent_list_2[j])
    return fuzzy_mat*0.01

In [364]:
def _create_rouge_mat(sent_list_1, sent_list_2):
    metrics=['rouge-n']
    max_n=1
    weight_factor=1.2
    stemming=True
    evaluator = rouge.Rouge(metrics=metrics,
                            max_n=max_n,
                            limit_length=False,
                            alpha=0.5, # Default F1_score
                            weight_factor=weight_factor,
                            stemming=stemming)
    rouge_mat = np.zeros([len(sent_list_1), len(sent_list_2)])
    for i in range(len(sent_list_1)):
        for j in range(len(sent_list_2)):
            rouge_mat[i][j] = evaluator.get_scores(sent_list_2[j], sent_list_1[i])['rouge-1']['f']
    return rouge_mat

In [437]:
def _create_sim_mat(sent_vecs_1, sent_vecs_2,
                    embedding_size):
    sim_mat = np.zeros([len(sent_vecs_1), len(sent_vecs_2)])
    vlen = embedding_size
    for i in range(len(sent_vecs_1)):
        for j in range(len(sent_vecs_2)):
            sim_mat[i][j] = metrics.pairwise.cosine_similarity(sent_vecs_1[i].
                                                               reshape(1,
                                                                       vlen),
                                                               sent_vecs_2[j].
                                                               reshape(1,
                                                                       vlen))[0,
                                                                              0]
    return sim_mat

In [645]:
rouge_mat = _create_rouge_mat(full_sent_clean, summ_sent_clean)

In [646]:
rouge_mat.shape

(132, 9)

In [431]:
ix_match = _get_closest_sent_match_ix(rouge_mat)
ix_match

array([11, 14, 65, 13, 99, 76, 95, 45, 39])

In [422]:
ix_match = _get_closest_sent_match_ix(rouge_mat)
ix_match

array([11, 13, 14, 39, 45, 65, 76, 95, 99])

In [436]:
summ_sent[2]

'This bill amends the Controlled Substances Act to impose a fee on persons convicted of drug offenses.'

In [435]:
[full_sent[i] for i in ix_match][2]

'(II) provide updates to the Administrator, on a quarterly basis, of—'

In [388]:
[full_sent[i] for i in ix_match]

['(2) Promotion of treatment and recovery of persons who abuse such substances.',
 '(1) A State, with grants first being awarded to States with laws in effect that provide for immunity from civil liability for first responders and health professionals who administer naloxone in the course of their duty to counteract opiate overdoses.',
 '4. Grants for naloxone, training in the administration of naloxone, and testing for fentanyl',
 '(ii) a multiyear strategy to achieve the consolidation and optimization of the data centers inventoried under clause (i), that includes—',
 '(III) year-by-year calculations of investment and cost savings for the period beginning on the date of the enactment of this Act and ending on the date set forth in subsection (e), broken down by each year, including a description of any initial costs for data center consolidation and optimization and life cycle cost savings and other improvements, with an emphasis on—',
 '(II) provide updates to the Administrator, on 

In [439]:
full_sent_clean[:4]

['',
 ' 1  the increase in  1  the increase in fentanylrelated unintentional overdose fatalities presents another lifethreatening scenario for its victims and threatens firstresponders  unintentional overdose fatalities presents another  1  the increase in fentanylrelated unintentional overdose fatalities presents another lifethreatening scenario for its victims and threatens firstresponders  scenario for its victims and threatens  1  the increase in fentanylrelated unintentional overdose fatalities presents another lifethreatening scenario for its victims and threatens firstresponders  ',
 ' 2  the u s  sentencing commission ',
 ' a  ought to consider the presence of fentanyl in connection to the illicit distribution of an illicit substance  as a cutting agent  and']

In [624]:
embedding_size = 300
path_to_embedding = '../nlp_models/glove.6B/glove.6B.{}d.txt'.format(embedding_size)

In [625]:
glove_embeddings, embedding_size = _extract_embeddings(path_to_embedding)

'Health'

In [444]:
def _extract_embeddings(
        path_to_embedding='../nlp_models/glove.6B/glove.6B.300d.txt'):
    f = open(path_to_embedding, encoding='utf-8')
    word_embeddings = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()
    embedding_size = list(coefs.shape)[0]
    return word_embeddings, embedding_size

In [627]:
full_vec = _get_full_text_vectors(full_sent_clean, glove_embeddings, embedding_size, True)

In [628]:
summ_vec = _get_summary_text_vectors(summ_sent_clean, glove_embeddings, embedding_size, True)

In [456]:
from sklearn import metrics

In [620]:
def _get_closest_sent_match_ix(sim_mat, num_rows=1):

    ix_sort = (-sim_mat).argsort(axis=0)
    ix_match = ix_sort[0:num_rows, :]
    ix_match = np.sort(ix_match.flatten())
    ix_match = np.unique(ix_match)
    return ix_match  # closest_match


In [629]:
sim_mat = _create_sim_mat(full_vec, summ_vec, embedding_size=embedding_size)

In [647]:
rouge_mat.shape

(132, 9)

In [644]:
sim_mat.shape

(132, 9)

In [653]:
ix_good

array([  5,   9,  10,  14,  21,  23,  25,  27,  29,  30,  38,  39,  40,
        44,  45,  49,  50,  61,  67,  68,  70,  74,  83,  85,  87,  90,
        93,  96,  97,  99, 125])

In [655]:
len(ix_good)

31

In [654]:
len(ix_rouge)

45

In [652]:
set(ix_rouge) - set(ix_good)

{11,
 12,
 13,
 24,
 31,
 42,
 46,
 47,
 53,
 57,
 59,
 65,
 72,
 76,
 78,
 92,
 95,
 103,
 109,
 114,
 126,
 131}

In [651]:
set(ix_good) - set(ix_rouge)

{29, 30, 44, 49, 61, 68, 93, 97}

In [650]:
ix_rouge = set_important_label(rouge_mat)

In [639]:
def set_important_label(sim_mat, set_percentile=95):
    ix_good = []
    if len(summ_vec) < len(full_vec):
        while len(ix_good) < len(summ_vec):
            sim_mat_mask = np.where(sim_mat>=np.percentile(sim_mat, set_percentile), 1, 0)
            ix_good = np.unique(np.argwhere(sim_mat_mask!=0)[:,0])
            set_percentile -= 5
    else:
        print('summary length {} > {} bill length}'.format(len(summ_vec), len(full_vec)))    
        _, ix_good = _get_closest_sent_match_ix(sim_mat, 1)
    return ix_good

In [449]:
def _get_full_text_vectors(full_sent_clean, word_embeddings, embedding_size, not_leglove):
    full_vec = [_calc_embedding(s, word_embeddings, embedding_size, not_leglove)
                for s in full_sent_clean]
    return full_vec

In [450]:
def _get_summary_text_vectors(summ_sent_clean, word_embeddings,
                              embedding_size, not_leglove):
    summ_vec = [_calc_embedding(s, word_embeddings, embedding_size, not_leglove)
                for s in summ_sent_clean]
    return summ_vec

In [448]:
def _calc_embedding(sen, word_embeddings, embedding_size, not_leglove=True):
    if embedding_size is None:
        embedding_size = random.choice(list(word_embeddings.values())).shape
    if len(sen) != 0:
        if not_leglove:
            vector = sum([word_embeddings.get(w, np.zeros(embedding_size))
                        for w in sen.split()])/(len(sen.split())+0.001)
        else:
            sen_emb = []
            for w in sen.split():
                try: 
                    e = word_embeddings['word_vectors'][word_embeddings['dictionary'][w]]
                except: 
                    e = np.zeros((100,)).shape
                sen_emb.append(e)
                vector = sum(sen_emb)/(len(sen.split())+0.001)   

    else:
        vector = np.zeros(embedding_size)
    return vector


In [None]:

    print('loading spacy en_core_web_lg')
    start_time = time.time()
    nlp = spacy.load('en_core_web_lg')
    print("--- That took a {} seconds ---".format(time.time() - start_time))

    # filter_bills = filter_bills[:50]
    # print(filter_bills.columns)
    all_data, all_embed_data = aggregate_training_data(filter_bills,
                                                       version=version,
                                                       path_to_embedding=path_to_embedding,
                                                       nlp=nlp,  not_leglove=False)

    file_name = 'trainingdata_v{}_leGLOVEemb{}_{}.csv'.format(version, embedding_size, subject)
  

### determine subjects distribution