![alt text](https://s3.amazonaws.com/blaze4/HK+logo.png)

# *LIBOR TRANSITION ASSISTANT (LTA)*

Holland & Knight's LIBOR Transition Assistant (LTA) can significantly reduce the time and cost ordinarily associated with portfolio review projects, such as those being undertaken ahead of the discontinuance of LIBOR. Many of the solutions offered by other professional firms are little more than white-labeled platforms originally designed to tackle different tasks. In contrast, LTA was built from the ground up to assist financial institutions in the transition away from LIBOR. LTA is the product of Holland & Knight's own data scientists, technologists and subject matter experts, and was developed using state-of-the-art open-source machine learning algorithms. As a result, we are constantly improving our LTA solution and, where necessary, can tailor it to an individual lender's specific loan portfolio, workflows and internal systems.

***

In [None]:
#%%capture
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
!pip install bert-tensorflow
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from tensorflow import keras
import os
import re,sys

# When the notebook runs inside Google Colab, authenticate the current user interactively.
if 'google.colab' in sys.modules:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()
# Export the credentials file name (a relative path) through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
# NOTE(review): presumably a service-account key; the file name is hard-coded and embeds a
# project id — confirm the key file itself is kept out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] =  'elite-mix-229322-5ae3dc690187.json'
# The ML-service client is not needed in this cell, so its construction is left disabled.
#service = googleapiclient.discovery.build('ml', 'v1')

# Set the output directory where the estimator saves its model files.
# Optionally, set a GCP bucket location.

OUTPUT_DIR = ''#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET_HOTDOG = 'bert_hk3' #@param {type:"string"}

if USE_BUCKET:
  # With the default empty OUTPUT_DIR this resolves to the bucket root ('gs://<bucket>/').
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET_HOTDOG, OUTPUT_DIR)


if DO_DELETE:
  # NOTE(review): with an empty OUTPUT_DIR and USE_BUCKET=True this deletes the whole
  # bucket root — confirm that is intended before enabling DO_DELETE.
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Nothing to delete: the directory did not exist yet. Any other failure
    # (permissions, network, ...) now surfaces instead of being silently swallowed.
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

# Load the labelled train/test spreadsheets and draw a random subset of each.
train_hotdog = pd.read_excel('libor_train.xlsx')
test_hotdog = pd.read_excel('libor_test.xlsx')
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
train_hotdog = train_hotdog.sample(n=min(500, len(train_hotdog)))
test_hotdog = test_hotdog.sample(n=min(160, len(test_hotdog)))
# Column holding the text to classify and column holding the integer class id.
DATA_COLUMN_HOT = 'Text'
LABEL_COLUMN_HOT = 'category_id'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list_hot = [0, 1]

# Wrap each spreadsheet row in the InputExample class from BERT's run_classifier code.
# guid is a globally unique ID for bookkeeping and is unused in this notebook; text_b is
# None because every example consists of a single text.
train_InputExamples_hot = train_hotdog.apply(
    lambda row: bert.run_classifier.InputExample(
        guid=None,
        text_a=row[DATA_COLUMN_HOT],
        text_b=None,
        label=row[LABEL_COLUMN_HOT]),
    axis=1)

test_InputExamples_hot = test_hotdog.apply(
    lambda row: bert.run_classifier.InputExample(
        guid=None,
        text_a=row[DATA_COLUMN_HOT],
        text_b=None,
        label=row[LABEL_COLUMN_HOT]),
    axis=1)

# TF-Hub path of an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the vocab file and casing flag exposed by the Hub module.

    The module is instantiated in a throwaway graph and its "tokenization_info"
    signature is evaluated in a short-lived session.
    """
    with tf.Graph().as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature="tokenization_info", as_dict=True)
        with tf.Session() as session:
            # Fetch both values in a single run call.
            vocab_path, lower_case = session.run(
                [info["vocab_file"], info["do_lower_case"]])
        return bert.tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=lower_case)

# Build the tokenizer once; the same instance is reused for training, evaluation and
# prediction inputs.
tokenizer = create_tokenizer_from_hub_module()
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test InputExamples to the InputFeatures that BERT understands.
train_features_hot = bert.run_classifier.convert_examples_to_features(train_InputExamples_hot, label_list_hot, MAX_SEQ_LENGTH, tokenizer)
test_features_hot = bert.run_classifier.convert_examples_to_features(test_InputExamples_hot, label_list_hot, MAX_SEQ_LENGTH, tokenizer)

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates the classification graph: the BERT Hub module plus one dense output layer.

    Args:
        is_predicting: True when building the graph for PREDICT mode; the loss is
            then skipped and dropout is disabled.
        input_ids, input_mask, segment_ids: tensors fed to the Hub module's "tokens"
            signature (presumably [batch, seq_length] int32 — confirm against the
            input_fn that produces them).
        labels: integer class ids; one-hot encoded here with depth `num_labels`.
        num_labels: number of output classes.

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
        `(loss, predicted_labels, log_probs)`. `log_probs` are log-softmax values,
        not probabilities.
    """

    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Create our own output layer to fine-tune on the labelled data.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting, but it must not run at prediction time:
        # randomly zeroing activations there makes the predictions noisy and
        # non-reproducible.
        # NOTE(review): EVAL mode still goes through dropout because this function is
        # only told whether it is predicting — confirm whether that is acceptable.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # argmax over the class axis already yields one label per example. The former
        # tf.squeeze turned a batch of one into a scalar, so the label tensor lost the
        # batch dimension that log_probs still has.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # If we're train/eval, compute loss between predicted and actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Builds and returns the `model_fn` closure handed to the estimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Returns the EstimatorSpec for TRAIN, EVAL or PREDICT mode."""
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    # PREDICT: only the predictions are needed, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
      predicted_labels, log_probs = create_model(
          True, input_ids, input_mask, segment_ids, label_ids, num_labels)
      return tf.estimator.EstimatorSpec(
          mode,
          predictions={
              'probabilities': log_probs,
              'labels': predicted_labels
          })

    # TRAIN and EVAL: both build the loss, the train op and the metrics.
    loss, predicted_labels, log_probs = create_model(
        False, input_ids, input_mask, segment_ids, label_ids, num_labels)

    train_op = bert.optimization.create_optimizer(
        loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

    def metric_fn(label_ids, predicted_labels):
      """Builds the evaluation metric ops from true and predicted label ids."""
      pair = (label_ids, predicted_labels)
      accuracy = tf.metrics.accuracy(*pair)
      f1_score = tf.contrib.metrics.f1_score(*pair)
      auc = tf.metrics.auc(*pair)
      recall = tf.metrics.recall(*pair)
      precision = tf.metrics.precision(*pair)
      true_pos = tf.metrics.true_positives(*pair)
      true_neg = tf.metrics.true_negatives(*pair)
      false_pos = tf.metrics.false_positives(*pair)
      false_neg = tf.metrics.false_negatives(*pair)
      return {
          "eval_accuracy": accuracy,
          "f1_score": f1_score,
          "auc": auc,
          "precision": precision,
          "recall": recall,
          "true_positives": true_pos,
          "true_negatives": true_neg,
          "false_positives": false_pos,
          "false_negatives": false_neg
      }

    eval_metrics = metric_fn(label_ids, predicted_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, train_op=train_op)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metrics)

  # Hand back the closure; it captures the hyper-parameters given above.
  return model_fn

# Training hyper-parameters
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100
# Compute the number of train and warmup steps from the batch size
num_train_steps = int(len(train_features_hot) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

# Bind the hyper-parameters into the model function used by the estimator.
model_fn = model_fn_builder(
  num_labels=len(label_list_hot),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator_hot = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features_hot,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

# Train and report the wall-clock duration.
# NOTE(review): max_steps is presumably a global step count, so a model_dir that already
# holds a finished checkpoint may make this call a no-op — confirm when reusing OUTPUT_DIR.
print(f'Beginning Training!')
current_time = datetime.now()
estimator_hot.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

# Evaluate on the held-out sample; steps=None is passed so no explicit step limit is set.
test_input_fn = run_classifier.input_fn_builder(
    features=test_features_hot,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

estimator_hot.evaluate(input_fn=test_input_fn, steps=None)

def getPrediction_hot(in_sentences):
  """Classify sentences as "LIBOR" / "NOT_LIBOR" with the fine-tuned estimator.

  Args:
    in_sentences: iterable of sentence strings.

  Returns:
    A list of `(sentence, log_probs, label_name)` tuples, one per input sentence.
    The middle element comes from the estimator's 'probabilities' key, which the
    model function fills with log-softmax values. An empty input yields `[]`.
  """
  in_sentences = list(in_sentences)
  if not in_sentences:
    # Nothing to classify: skip building an input pipeline and running the estimator.
    return []
  labels = ["NOT_LIBOR", "LIBOR"]
  # guid="" and label=0 are dummy values; the label is not used to produce predictions.
  input_examples_hot = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences]
  input_features_hot = run_classifier.convert_examples_to_features(input_examples_hot, label_list_hot, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features_hot, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions_hot = estimator_hot.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions_hot)]


W0620 23:19:34.841624 4672591296 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14




In [12]:
from __future__ import print_function
import warnings
warnings.filterwarnings("ignore")
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import appmode
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import Output
from ipywidgets import Button, Layout
from IPython.display import Javascript
import pandas as pd
from IPython.display import display as display
from IPython.display import HTML
from ipywidgets import HBox, VBox
from collections import OrderedDict
pd.options.display.html.table_schema = True
pd.options.display.max_rows = None
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 300
pd.options.display.width = 600
pd.options.display.expand_frame_repr = True
import nltk
from nltk import SnowballStemmer
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from io import StringIO
import csv
import os
import os.path
import re
from bs4 import BeautifulSoup as BeautifulSoup
from openpyxl import Workbook as Workbook
import textract
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import svm, metrics
import googleapiclient.discovery
import sys
# When the notebook runs inside Google Colab, authenticate the current user interactively.
if 'google.colab' in sys.modules:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()
# NOTE(review): absolute path into one user's home directory — this cell only runs on that
# machine; consider making the credentials path configurable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] =  '/Users/josiasdewey/Downloads/Google Vision-e79587625a9d.json'
# Client for the 'ml' v1 API; on_button_clicked uses it for online predictions.
service = googleapiclient.discovery.build('ml', 'v1')

# Train the local category classifier (model2): bag-of-words counts -> TF-IDF -> LinearSVC.
# on_button_clicked calls model2.predict locally, so this code must run before Submit is used.

df_f = pd.read_excel('/Users/josiasdewey/Google_Drive/Project_Sunset/merged.xlsx')
# NOTE(review): axis=1 samples (reorders) the columns, not the rows. If the intent was to
# shuffle rows, axis=0 is needed — confirm. train_test_split below shuffles rows anyway.
df_f = df_f.sample(frac=1, axis=1).reset_index(drop=True)
# Drop rows without text, then exact duplicate texts.
df_f = df_f[pd.notnull(df_f['Text'])]
df_f = df_f.drop_duplicates('Text')
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(df_f['Text'], df_f['Label'], random_state = 0)
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
clf = LinearSVC()
pipeline2 = Pipeline([('count_vect', count_vect), ('tfidf_transformer', tfidf_transformer), ('clf', clf)])
model2 = pipeline2.fit(X_train_1, y_train_1)
# One row per distinct label; not used elsewhere in the visible notebook.
labels2 = df_f.drop_duplicates('Label')
joblib.dump(pipeline2, 'model_type_libor.joblib')
# NOTE(review): this copies ./model.joblib, but the line above writes model_type_libor.joblib,
# and BUCKET_NAME is not defined anywhere in the visible notebook — consistent with the
# recorded "CommandException" in this cell's output. Confirm the intended file and bucket.
!gsutil cp ./model.joblib gs://$BUCKET_NAME/model.joblib

'''
df = pd.read_excel('/Users/josiasdewey/Google_Drive/Project_Sunset/hotdog3_g.xlsx')
df = df.sample(frac=1, axis=1).reset_index(drop=True)
df = df[pd.notnull(df['Text'])]
df = df.drop_duplicates('Text')
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state = 0)
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
clf = LinearSVC()
pipeline = Pipeline([('count_vect', count_vect), ('tfidf_transformer', tfidf_transformer), ('clf', clf)])
model = pipeline.fit(X_train, y_train)
labels = df.drop_duplicates('Label')
joblib.dump(pipeline, 'model_libor.joblib')
'''

def html_from_file_no_tags(file_path):
    """Read an HTML file and return its text content with the markup removed.

    The file is read as bytes and parsed by BeautifulSoup with the 'lxml' parser.
    """
    with open(file_path, 'rb') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, 'lxml')
    return soup.text

def remove_roman_numerals(text):
    """Remove stand-alone lowercase roman-numeral tokens (made of i, v, x) from text.

    A numeral is removed, together with its surrounding whitespace, only when it is a
    whole word followed by whitespace; the removed span is replaced by a single space.
    The word boundaries keep ordinary words that merely end in i/v/x ("tax", "fix",
    "six") intact — without them "tax payment" was turned into "ta payment".

    Args:
        text: input string.

    Returns:
        The text with the numeral tokens removed.
    """
    return re.sub(r'\s*\b[ivx]+\b\s+', ' ', text)

def normalize_sent(sent, lower=False, lemm=False):
    """Normalize one sentence for the bag-of-words models.

    Steps: drop every character that is not an ASCII letter or whitespace, tokenize,
    remove English stop words and single-character tokens, optionally lemmatize,
    strip stand-alone roman numerals, optionally lowercase.

    Args:
        sent: the raw sentence.
        lower: lowercase the result when true.
        lemm: lemmatize each token with WordNetLemmatizer when true.

    Returns:
        The normalized sentence as a single space-joined string.
    """
    # flags must be passed by keyword: the fourth positional argument of re.sub is
    # `count`, so the old call silently limited the number of replacements instead.
    sent = re.sub(r'[^a-zA-Z\s]', '', sent, flags=re.I | re.A)
    sent = sent.strip()
    tokens = nltk.WordPunctTokenizer().tokenize(sent)
    # A set gives O(1) membership tests.
    # NOTE(review): the comparison is case-sensitive and runs before any lowercasing,
    # so capitalized stop words ("The") survive — confirm that is intended.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered_tokens = [token for token in tokens
                       if token not in stop_words and len(token) > 1]
    if lemm:
        # Previously referenced an undefined global `lemmatizer` (NameError).
        lemmatizer = WordNetLemmatizer()
        filtered_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    sent = ' '.join(filtered_tokens)
    sent = remove_roman_numerals(sent)
    if lower:
        sent = sent.lower()
    return sent

def load_files_from_directory(path):
    """Load every regular file directly inside `path` and return the extracted texts.

    Args:
        path: directory to scan (not recursive); a trailing separator is optional.

    Returns:
        A list with one string per file, in `os.listdir` order. Previously the
        function built this list but never returned it, and called an undefined
        `load_raw` helper instead of `load`.
    """
    list_of_text = []
    for entry in os.listdir(path):
        filename = os.fsdecode(entry)
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be handed to the file loaders.
            continue
        list_of_text.append(str(load(file_path)))
    return list_of_text

def load(path, tags=False):
    """Load a document and return its content, choosing the reader by file extension.

    Args:
        path: file to read.
        tags: for HTML files, return the raw content including markup when true;
            otherwise return the text with markup removed.

    Returns:
        For paths ending in 'html': the result of html_from_file_tags or
        html_from_file_no_tags. For '.txt': the decoded text. For anything else:
        the text extracted by text_from_binary.
    """
    if path.endswith('html'):
        if tags:
            return html_from_file_tags(path)
        return html_from_file_no_tags(path)
    if path.endswith('.txt'):
        raw = text_from_file(path)
        # text_from_file reads in binary mode; str() on bytes would produce the
        # "b'...'" repr (with escaped newlines) rather than the file's text.
        if isinstance(raw, bytes):
            return raw.decode('utf-8', errors='replace')
        return str(raw)
    # Any other extension goes through the binary extractor.
    return text_from_binary(path)

def text_from_binary(file_path):
    """Extract text from a file with textract (method 'tesseract', language 'eng').

    Returns the extracted bytes decoded as UTF-8.
    """
    extracted = textract.process(file_path, method='tesseract', language='eng')
    return extracted.decode('utf-8')

def html_from_file_tags(file_path):
    """Return the file's content as raw bytes, markup included."""
    with open(file_path, 'rb') as source:
        return source.read()

def text_from_file(file_path):
    """Return the file's content as raw bytes (the file is opened in binary mode)."""
    with open(file_path, 'rb') as source:
        return source.read()

def tokenize_and_stem(text):
    """Tokenize text, keep tokens containing a letter, stem them and join with spaces.

    The joined result is printed as well as returned.
    """
    stemmer = SnowballStemmer("english")
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Skip pure punctuation / numeric tokens.
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    joined_text = " ".join(stems)
    print(joined_text)
    return joined_text

def create_progress_bar():
    """Create the horizontal 'Status:' IntProgress bar that runs from 0 to 10."""
    settings = dict(
        value=0,
        min=0,
        max=10,
        step=1,
        visible=False,
        description='Status:',
        bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal',
    )
    return widgets.IntProgress(**settings)

def change_progress_bar(name, percent_complete):
    """Set the `value` attribute of the progress widget `name` to `percent_complete`."""
    setattr(name, 'value', percent_complete)

def is_progress_bar_visible(name, visible):
    """Set the `visible` attribute of the widget `name`."""
    setattr(name, 'visible', visible)
    
def clear_output(name):
    """Close the widget `name`.

    The original body was the bare attribute reference `name.close`, which never
    invoked the method and therefore did nothing.

    NOTE(review): a zero-argument `clear_output()` defined later in this notebook
    rebinds the same name, so this version is only reachable before that definition runs.
    """
    name.close()
    
def create_submit_button():
    """Create the steel-blue 'Submit' button with a thin grey border."""
    submit = widgets.Button(description='Submit', disabled=False)
    submit.style.button_color = 'steelblue'
    submit.layout = {'border': '1px solid grey'}
    return submit

def create_download_button():
    """Create the honeydew-coloured 'Download Excel' button with a thin grey border."""
    download = widgets.Button(description='Download Excel', disabled=False)
    download.style.button_color = 'Honeydew'
    download.layout = {'border': '1px solid grey'}
    return download

def create_refresh_button():
    """Create the slate-grey 'Clear Output' button with a thin grey border."""
    refresh = widgets.Button(description='Clear Output', disabled=False)
    refresh.style.button_color = 'LightSlateGray'
    refresh.layout = {'border': '1px solid grey'}
    return refresh

# TODO: Create separate lists for individual items for other subject matters
# TODO: Add training back-end

results = ''
# Output areas; `out` is the one the Submit handler clears.
out = widgets.Output(
    layout=Layout(width="400", height="400", border='1px solid grey'))
out_aux = widgets.Output()

# --- Model choice ---------------------------------------------------------
model_selection = widgets.RadioButtons(
    options=[
        'Bidirectional LTSM',
        'Support Vector (SVC)',
        'Gensim',
        'MultinomialNB',
        'Logistic Regression',
    ],
    value='Support Vector (SVC)',
    description='Model:',
    disabled=False,
)

# --- Pre-processing switches ----------------------------------------------
# NOTE(review): button_style and icon do not look like Checkbox options — confirm they
# have any effect here.
Pre_Processing_selection_lower = widgets.Checkbox(
    value=False,
    description='Lowercase text',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='lowercase text',
    icon='check',
)

Pre_Processing_selection_lemm = widgets.Checkbox(
    value=False,
    description='Lemmatize tokens',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='lemmatize',
    icon='check',
)

Pre_Processing_selection_stop = widgets.Checkbox(
    value=False,
    description='Filter stop words',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='filter stop words',
    icon='check',
)

Pre_Processing_selection_small = widgets.Checkbox(
    value=False,
    description='Filter single char tokens',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Remove one chars',
    icon='check',
)

# --- File and path inputs -------------------------------------------------
file_path_text = widgets.Text(
    value='credit.txt',
    placeholder='filename',
    description='Filename:',
    disabled=False,
)

directory_path = widgets.Text(
    value='/Users/josiasdewey/Downloads/credit/',
    placeholder='path',
    description='Source Path:',
)

df_path = widgets.Text(
    value='/Users/josiasdewey/Downloads/libor_2018_19.xlsx',
    placeholder='/Users/josiasdewey/Downloads/libor_2018_19.xlsx',
    description='Source Path:',
)

save_file_path = widgets.Text(
    value='model.pickle',
    placeholder='Filename',
    description='Save:',
    disabled=False,
)

model_training_material = widgets.Text(
    value='training.xlsx',
    placeholder='File Path',
    description='Train Docs:',
    disabled=False,
)

# --- Clause filter --------------------------------------------------------
# NOTE(review): 'Amendments_Consent' appears twice in the options — confirm whether the
# second entry was meant to be a different clause.
clause_selection = widgets.SelectMultiple(
    options=[
        'All',
        'MAC_definition',
        'NOT_LIBOR',
        'LIBOR',
        'commitment_fee',
        'change_control',
        'change_control_prepayment',
        'debt_issuance_prepayment',
        'default_interest',
        'disposition_assets_prepayment',
        'equity_issuance_prepayment',
        'eurodollar_rate',
        'excess_cash_flow',
        'fixed_charge_ratio',
        'lender_inspection_rights',
        'margin_leverage_ratio',
        'margin_rating',
        'monetary_default',
        'net_worth',
        'restricted_payments',
        'Amendments_Consent',
        'secured_facility',
        'Amendments_Consent',
        'unused_fee',
        'voluntary_prepayment',
    ],
    value=['All'],
    rows=10,
    description='Clause selection:',
    disabled=False,
)

# Action buttons shown underneath the tabs.
button = create_submit_button()
download_button = create_download_button()
refresh_button = create_refresh_button()

# Tab contents. model_selection is placed in both the Configure and Training tabs.
tab1 = VBox(children=[
    HBox(children=[file_path_text, directory_path, df_path]),
])
tab2 = VBox(children=[
    HBox(children=[
        model_selection,
        VBox(children=[
            Pre_Processing_selection_lower,
            Pre_Processing_selection_stop,
            Pre_Processing_selection_small,
            Pre_Processing_selection_lemm,
        ]),
        clause_selection,
    ]),
])
tab3 = VBox(children=[
    HBox(children=[model_selection, model_training_material, save_file_path]),
])

# Tab order is Configure (tab2), Predict (tab1), Training (tab3).
tab = widgets.Tab(children=[tab2, tab1, tab3])
tab.set_title(0, 'Configure')
tab.set_title(1, 'Predict')
tab.set_title(2, 'Training')

HBox_classify = HBox(children=[button, refresh_button, download_button])
VBox_classify = VBox(children=[tab, HBox_classify])
VBox_classify.layout = {'border': '.25px solid grey'}

# Render the UI directly in the cell output (rendering into out_aux is disabled).
#with out_aux:
display(VBox_classify)

# Shared state: the Submit handler stores its result frame here for the Download handler.
returned_values = {}

def clear_output():
    """Clear everything rendered in the module-level `out` widget.

    NOTE(review): this rebinds the name `clear_output`, replacing the earlier
    one-argument `clear_output(name)` helper defined above in this notebook.
    """
    out.clear_output()

@download_button.on_click
def download_button_clicked(b):
    """Write the latest classification results to 'classification_results.xlsx'.

    Args:
        b: the clicked button (supplied by ipywidgets, unused).

    The results are read from `returned_values['df_output']`, which the Submit handler
    fills in. If no results exist yet the button label says so instead of the handler
    failing with a KeyError.
    """
    output = returned_values.get('df_output')
    if output is None:
        download_button.description = 'No results yet'
        return
    # The context manager saves and closes the workbook; the explicit
    # ExcelWriter.save() call is no longer available in current pandas releases.
    with pd.ExcelWriter('classification_results.xlsx') as writer:
        output.to_excel(writer, sheet_name='Sheet1')
    download_button.description = 'Download complete'

@refresh_button.on_click
def refresh_button_clicked(b):
    """Clear the `out` widget and reset the download button's label.

    `b` is the clicked button passed in by ipywidgets; it is not used.
    """
    out.clear_output()
    download_button.description = 'Download Excel'

@out.capture()
def on_button_clicked(b):
    """Run the two-stage LIBOR extraction over every .docx file in the input directory.

    For each document: split it into sentences, keep the sentences the hosted model
    labels 'LIBOR', re-check those with the fine-tuned BERT estimator
    (`getPrediction_hot`), concatenate the surviving sentences (longer than 100
    characters) and classify the concatenation with the local `model2` pipeline.

    Side effects: prints progress information, writes a cumulative
    'outputs<filename>_<n>.xlsx' workbook after each document, stores the final
    DataFrame in `returned_values['df_output']` and displays it.

    Args:
        b: the clicked button (supplied by ipywidgets, unused).

    NOTE(review): the input directory is hard-coded below; none of the path or option
    widgets is read here — confirm whether `directory_path.value` should be used.
    """
    clear_output()
    download_button.description = 'Download Excel'
    in_progress = create_progress_bar()
    display(in_progress)
    change_progress_bar(in_progress, 1)
    input_path = '/Users/josiasdewey/Projects/test/'
    # Resource name of the hosted sentence-level model version. It is constant, so it
    # is built once instead of once per sentence. (The former `cache_discovery=False`
    # keyword was passed to str.format, which ignores unused keywords.)
    model_name = 'projects/{}/models/{}/versions/{}'.format('elite-mix-229322', 'LIBOR23', 'v01')
    doc_list = []
    master = []
    master_raw = []
    category = []
    df = None  # stays None when the directory holds no .docx file
    x = 0
    change_progress_bar(in_progress, 2)
    for file in os.listdir(input_path):
        filename = os.fsdecode(file)
        print(filename)
        file_path = os.path.join(input_path, filename)
        print(file_path)
        change_progress_bar(in_progress, 3)
        if not filename.endswith('docx'):
            continue

        text = str(load(file_path))
        segment_sentences = sent_tokenize(text)
        change_progress_bar(in_progress, 4)

        # Stage 1: keep the sentences the hosted model labels as LIBOR.
        raw = []
        for sent in segment_sentences:
            cleaned = normalize_sent(sent)
            response = service.projects().predict(
                name=model_name,
                body={'instances': [cleaned]}
            ).execute()
            change_progress_bar(in_progress, 5)
            print(response['predictions'][0])
            if response['predictions'][0] == 'LIBOR':
                print(sent)
                raw.append(sent)
        change_progress_bar(in_progress, 6)
        print(len(raw))

        # Stage 2: confirm the candidates with the fine-tuned BERT classifier.
        predictions = getPrediction_hot(raw)
        print(len(predictions))
        text_list = []
        cleaned_list = []
        change_progress_bar(in_progress, 7)
        for pred in predictions:
            # pred is (sentence, log_probs, label_name)
            if pred[2] == 'LIBOR' and len(pred[0]) > 100:
                print(pred[0])
                print(pred)
                text_list.append(pred[0])
                cleaned_list.append(normalize_sent(pred[0]))
            change_progress_bar(in_progress, 8)

        # Collapse all whitespace runs in the concatenated raw sentences.
        raw_text = ' '.join(''.join(text_list).split())
        # NOTE(review): the cleaned sentences are joined without a separator, which
        # fuses the last word of one sentence with the first of the next — confirm
        # this matches how model2's training text was prepared.
        hand_off = ''.join(cleaned_list)
        master.append(hand_off)
        master_raw.append(raw_text)
        change_progress_bar(in_progress, 8)

        # Stage 3: categorize the document's LIBOR language with the local pipeline.
        pred2 = model2.predict([hand_off])
        print(pred2[0])
        category.append(pred2[0])
        change_progress_bar(in_progress, 9)
        doc_list.append(filename)

        # Cumulative snapshot of everything processed so far.
        df = pd.DataFrame()
        df['text'] = master
        df['raw'] = master_raw
        df['file'] = doc_list
        df['category'] = category
        df.to_excel('outputs' + filename + '_' + str(x) + '.xlsx')
        change_progress_bar(in_progress, 10)
        x = x + 1

    in_progress.close()
    if df is None:
        # Previously this path failed with an unbound `df`.
        print('No .docx files found in ' + input_path)
        return
    returned_values['df_output'] = df
    # A bare expression inside a function displays nothing; display() is required.
    display(df)


# Register the capturing wrapper. With the former decorator order
# (@out.capture() above @button.on_click) the raw function was registered and the
# capture wrapper was applied to on_click's return value, so output was never captured.
button.on_click(on_button_clicked)
# Trailing expressions of the cell; the recorded cell output shows the Output widget
# with the grey border, i.e. `out`.
out_aux
out



CommandException: "cp" command does not support provider-only URLs.


VBox(children=(Tab(children=(VBox(children=(HBox(children=(RadioButtons(description='Model:', index=1, options…

Output(layout=Layout(border='1px solid grey', height='400', width='400'))

In [7]:


def create_progress_bar():
    """Create the horizontal 'Status:' IntProgress bar that runs from 0 to 10."""
    settings = dict(
        value=0,
        min=0,
        max=10,
        step=1,
        visible=False,
        description='Status:',
        bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
        orientation='horizontal',
    )
    return widgets.IntProgress(**settings)

def change_progress_bar(name, percent_complete):
    """Set the `value` attribute of the progress widget `name` to `percent_complete`."""
    setattr(name, 'value', percent_complete)

def is_progress_bar_visible(name, visible):
    """Set the `visible` attribute of the widget `name`."""
    setattr(name, 'visible', visible)
    
def clear_output(name):
    """Close the widget `name`.

    The original body was the bare attribute reference `name.close`, which never
    invoked the method and therefore did nothing.

    NOTE(review): a zero-argument `clear_output()` defined later in this cell rebinds
    the same name, so this version is only reachable before that definition runs.
    """
    name.close()
    
def create_submit_button():
    """Create the steel-blue 'Submit' button with a thin grey border."""
    submit = widgets.Button(description='Submit', disabled=False)
    submit.style.button_color = 'steelblue'
    submit.layout = {'border': '1px solid grey'}
    return submit

def create_download_button():
    """Create the honeydew-coloured 'Download Excel' button with a thin grey border."""
    download = widgets.Button(description='Download Excel', disabled=False)
    download.style.button_color = 'Honeydew'
    download.layout = {'border': '1px solid grey'}
    return download

def create_refresh_button():
    """Create the slate-grey 'Clear Output' button with a thin grey border."""
    refresh = widgets.Button(description='Clear Output', disabled=False)
    refresh.style.button_color = 'LightSlateGray'
    refresh.layout = {'border': '1px solid grey'}
    return refresh

# TODO: Create separate lists for individual items for other subject matters
# TODO: Add training back-end

results = ''
# Output areas; `out` is the one cleared by the refresh handler.
out = widgets.Output(
    layout=Layout(width="400", height="400", border='1px solid grey'))
out_aux = widgets.Output()

# --- Model choice ---------------------------------------------------------
model_selection = widgets.RadioButtons(
    options=[
        'Bidirectional LTSM',
        'Support Vector (SVC)',
        'Gensim',
        'MultinomialNB',
        'Logistic Regression',
    ],
    value='Support Vector (SVC)',
    description='Model:',
    disabled=False,
)

# --- Pre-processing switches ----------------------------------------------
# NOTE(review): button_style and icon do not look like Checkbox options — confirm they
# have any effect here.
Pre_Processing_selection_lower = widgets.Checkbox(
    value=False,
    description='Lowercase text',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='lowercase text',
    icon='check',
)

Pre_Processing_selection_lemm = widgets.Checkbox(
    value=False,
    description='Lemmatize tokens',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='lemmatize',
    icon='check',
)

Pre_Processing_selection_stop = widgets.Checkbox(
    value=False,
    description='Filter stop words',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='filter stop words',
    icon='check',
)

Pre_Processing_selection_small = widgets.Checkbox(
    value=False,
    description='Filter single char tokens',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Remove one chars',
    icon='check',
)

# --- File and path inputs -------------------------------------------------
file_path_text = widgets.Text(
    value='credit.txt',
    placeholder='filename',
    description='Filename:',
    disabled=False,
)

directory_path = widgets.Text(
    value='/Users/josiasdewey/Downloads/credit/',
    placeholder='path',
    description='Source Path:',
)

df_path = widgets.Text(
    value='/Users/josiasdewey/Downloads/libor_2018_19.xlsx',
    placeholder='/Users/josiasdewey/Downloads/libor_2018_19.xlsx',
    description='Source Path:',
)

save_file_path = widgets.Text(
    value='model.pickle',
    placeholder='Filename',
    description='Save:',
    disabled=False,
)

model_training_material = widgets.Text(
    value='training.xlsx',
    placeholder='File Path',
    description='Train Docs:',
    disabled=False,
)

# --- Clause filter --------------------------------------------------------
# NOTE(review): 'Amendments_Consent' appears twice in the options — confirm whether the
# second entry was meant to be a different clause.
clause_selection = widgets.SelectMultiple(
    options=[
        'All',
        'MAC_definition',
        'NOT_LIBOR',
        'LIBOR',
        'commitment_fee',
        'change_control',
        'change_control_prepayment',
        'debt_issuance_prepayment',
        'default_interest',
        'disposition_assets_prepayment',
        'equity_issuance_prepayment',
        'eurodollar_rate',
        'excess_cash_flow',
        'fixed_charge_ratio',
        'lender_inspection_rights',
        'margin_leverage_ratio',
        'margin_rating',
        'monetary_default',
        'net_worth',
        'restricted_payments',
        'Amendments_Consent',
        'secured_facility',
        'Amendments_Consent',
        'unused_fee',
        'voluntary_prepayment',
    ],
    value=['All'],
    rows=10,
    description='Clause selection:',
    disabled=False,
)

# Action buttons shown underneath the tabs.
button = create_submit_button()
download_button = create_download_button()
refresh_button = create_refresh_button()

# Tab contents. model_selection is placed in both the Configure and Training tabs.
tab1 = VBox(children=[
    HBox(children=[file_path_text, directory_path, df_path]),
])
tab2 = VBox(children=[
    HBox(children=[
        model_selection,
        VBox(children=[
            Pre_Processing_selection_lower,
            Pre_Processing_selection_stop,
            Pre_Processing_selection_small,
            Pre_Processing_selection_lemm,
        ]),
        clause_selection,
    ]),
])
tab3 = VBox(children=[
    HBox(children=[model_selection, model_training_material, save_file_path]),
])

# Tab order is Configure (tab2), Predict (tab1), Training (tab3).
tab = widgets.Tab(children=[tab2, tab1, tab3])
tab.set_title(0, 'Configure')
tab.set_title(1, 'Predict')
tab.set_title(2, 'Training')

HBox_classify = HBox(children=[button, refresh_button, download_button])
VBox_classify = VBox(children=[tab, HBox_classify])
VBox_classify.layout = {'border': '.25px solid grey'}
VBox_classify

# Render the assembled UI into the auxiliary output widget.
with out_aux:
    display(VBox_classify)

# Shared state: the Download handler reads the result frame from here.
returned_values = {}

def clear_output():
    """Clear everything rendered in the module-level `out` widget.

    NOTE(review): this rebinds the name `clear_output`, replacing the earlier
    one-argument `clear_output(name)` helper defined above in this cell.
    """
    out.clear_output()

@download_button.on_click
def download_button_clicked(b):
    """Write the latest classification results to 'classification_results.xlsx'.

    Args:
        b: the clicked button (supplied by ipywidgets, unused).

    The results are read from `returned_values['df_output']`. If no results exist yet
    the button label says so instead of the handler failing with a KeyError.
    """
    output = returned_values.get('df_output')
    if output is None:
        download_button.description = 'No results yet'
        return
    # The context manager saves and closes the workbook; the explicit
    # ExcelWriter.save() call is no longer available in current pandas releases.
    with pd.ExcelWriter('classification_results.xlsx') as writer:
        output.to_excel(writer, sheet_name='Sheet1')
    download_button.description = 'Download complete'

@refresh_button.on_click
def refresh_button_clicked(b):
    """Clear the `out` widget and reset the download button's label.

    `b` is the clicked button passed in by ipywidgets; it is not used.
    """
    out.clear_output()
    download_button.description = 'Download Excel'

In [8]:
clause_selection

SelectMultiple(description='Clause selection:', index=(0,), options=('All', 'MAC_definition', 'NOT_LIBOR', 'LI…

In [6]:
VBox_classify

VBox(children=(Tab(children=(VBox(children=(HBox(children=(RadioButtons(description='Model:', index=1, options…