## Install Packages

In [1]:
# !pip install pandas
# !pip install python-docx
# !pip install spacy
# !python -m spacy download en
# !pip install -q python-terrier
# !pip install import-ipynb

## 
## Import Packages

In [2]:
import pandas as pd

import docx
from docx import Document

import re
import spacy
nlp = spacy.load("en_core_web_sm")

import os

import pyterrier as pt

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import lexnlp.extract.en.definitions as definitions

import pandas.io.formats.style
from pandas import ExcelWriter
import numpy as np

from xlsxwriter.utility import xl_rowcol_to_cell

from datetime import datetime

[nltk_data] Downloading package punkt to /Users/mojojojo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mojojojo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 
## Common functions for input processing

In [3]:
def preprocessing_master(master_df):
    """Clean the master clause table and add a ``tokens`` column.

    Works on a copy of ``master_df``. Newlines are removed from
    ``sub_category`` and turned into spaces in ``clause``; every character
    that is not an ASCII letter, digit or space is then replaced by a space
    in all columns. Rows whose ``clause`` is the empty string are dropped,
    the index is renumbered, and ``tokens`` is filled by applying
    ``tokenize`` to each clause.
    """
    cleaned = master_df.copy()

    cleaned["sub_category"] = cleaned["sub_category"].str.replace('\n', '')
    cleaned["clause"] = cleaned["clause"].str.replace('\n', ' ')
    cleaned = cleaned.replace(r'[^0-9a-zA-Z ]', ' ', regex=True)

    has_text = cleaned['clause'] != ''
    cleaned = cleaned[has_text].reset_index(drop=True)

    cleaned['tokens'] = cleaned['clause'].apply(tokenize)
    return cleaned


def tokenize(text):
    """Return the lower-cased lemmas of ``text`` joined by single spaces.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words, punctuation or whitespace are skipped.
    """
    lemmas = [
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]
    return " ".join(lemmas)


def preprocessing_contract(df):
    """Filter and clean contract paragraphs, adding a ``tokens`` column.

    Works on a copy of ``df``. Steps, in order:
      1. keep clauses with more than 5 whitespace-separated words;
      2. replace newlines in ``clause`` with spaces;
      3. replace every non-letter, non-space character with a space;
      4. drop empty-string clauses and clauses that are entirely upper case;
      5. tokenize each clause with ``tokenize``;
      6. keep rows with more than 5 tokens and renumber the index.
    """
    frame = df.copy()

    frame = (
        frame[frame.clause.str.split().str.len() > 5]
        .assign(clause=lambda d: d["clause"].str.replace('\n', ' '))
        .replace(r'[^a-zA-Z ]', ' ', regex=True)
    )
    frame = frame[frame['clause'] != '']
    frame = frame[~frame['clause'].str.isupper()]

    frame = frame.assign(tokens=lambda d: d['clause'].apply(tokenize))
    frame = frame[frame.tokens.str.split().str.len() > 5]

    return frame.reset_index(drop=True)


def remove_definitions(df):
    """Drop rows whose ``clause`` yields at least one lexnlp definition.

    Returns a plain copy of ``df`` when nothing is flagged; otherwise returns
    ``df`` without the flagged rows and with a renumbered index.
    """
    flagged = [
        label
        for label, row in df.iterrows()
        if len(list(definitions.get_definitions(row['clause']))) > 0
    ]

    if not flagged:
        return df.copy()

    return df.drop(flagged).reset_index(drop=True)


def remove_signature(df):
    """Cut the signature block off the end of a contract.

    Scans ``df['clause']`` top to bottom for the first paragraph that starts
    with ``'in witness '`` (case-insensitive), contains ``'WITNESS '``
    (case-sensitive), or contains ``'signature '`` / ``'signatures '``
    (case-insensitive).

    Returns:
        A new DataFrame holding only the rows before that paragraph, with a
        renumbered index. If no such paragraph exists, a copy of ``df``.

    The previous implementation dropped ``range(first_hit, df.index[-1])``,
    which left the last row in place (and removed nothing when the marker was
    on the last row). Slicing by position removes the marker row and
    everything after it, and does not depend on the index labels.
    """
    for position, text in enumerate(df['clause']):
        lowered = text.lower()
        is_signature_start = (
            lowered.startswith('in witness ')
            or 'WITNESS ' in text
            or 'signature ' in lowered
            or 'signatures ' in lowered
        )
        if is_signature_start:
            return df.iloc[:position].reset_index(drop=True)

    return df.copy()


def read_contract(filename):
    """Load a .docx contract and return its cleaned, tokenized clauses.

    Every paragraph whose text does not start with ``"ex-"``
    (case-insensitive) becomes one row of a single-column ``clause`` frame.
    The frame is then passed through ``remove_definitions``,
    ``remove_signature`` and ``preprocessing_contract``, in that order.
    """
    document = Document(filename)

    rows = [
        [paragraph.text]
        for paragraph in document.paragraphs
        if not paragraph.text.lower().startswith("ex-")
    ]

    skeleton = pd.DataFrame(columns=['clause'])
    loaded = pd.DataFrame(rows, columns=['clause'])
    clauses = pd.concat([skeleton, loaded], ignore_index=True)

    without_definitions = remove_definitions(clauses)
    without_signature = remove_signature(without_definitions)

    return preprocessing_contract(without_signature)

    
def print_df(df):
    """Render ``df`` in the notebook without row, column or cell-width truncation.

    ``display.max_colwidth`` is set to ``None`` (unlimited). The previous
    value ``-1`` has been deprecated since pandas 1.0 and is rejected by
    pandas 2.x.
    """
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.max_colwidth', None):
        # `display` is the IPython builtin available inside a notebook kernel.
        display(df)
        
        
# Category-name translation tables between the CUAD labels and the labels
# used in the standard-clause (Kronicle) master document. The two tables are
# inverses of each other; each keeps its own key order.
CUAD_to_kronicle = dict([
    ('Governing Law', 'Governing Law'),
    ('Non Compete', 'Non Compete Clause'),
    ('No Solicit Of Customers', 'Non Solicitation'),
    ('Termination For Convenience', 'Termination for Convenience'),
    ('Post Termination Services', 'Effects of Termination'),
    ('Anti Assignment', 'Anti Assignment'),
    ('Audit Rights', 'Right to Audit'),
    ('Cap On Liability', 'Limitation to Liability'),
])

kronicle_to_CUAD = dict([
    ('Termination for Convenience', 'Termination For Convenience'),
    ('Effects of Termination', 'Post Termination Services'),
    ('Limitation to Liability', 'Cap On Liability'),
    ('Anti Assignment', 'Anti Assignment'),
    ('Governing Law', 'Governing Law'),
    ('Right to Audit', 'Audit Rights'),
    ('Non Compete Clause', 'Non Compete'),
    ('Non Solicitation', 'No Solicit Of Customers'),
])

## 
## Importing Standard Clauses (Master Document)

In [4]:
# Read every table of the standard-clauses document into one flat frame with
# the columns sno / category / buy_cls / sell_cls.
std_cls_doc = Document("standard_clauses.docx")

std_cls_df = pd.DataFrame(columns=['category','sub_category','clause'])

doc_columns = ['sno', 'category','buy_cls','sell_cls']

# Gather all rows first and build the frame once; concatenating a one-row
# frame per table row re-copies the accumulated data on every iteration.
table_rows = [
    [cell.text for cell in row.cells]
    for table in std_cls_doc.tables
    for row in table.rows
]
doc_df = pd.DataFrame(table_rows, columns=doc_columns)

# The first two rows are discarded (presumably table headers - TODO confirm).
doc_df = doc_df.drop([0, 1]).reset_index(drop=True)

# Normalise curly double quotes to straight ones so the quoted sub-category
# names can be matched with a plain '"' pattern later on.
doc_df = doc_df.replace('“', '"', regex=True)
doc_df = doc_df.replace('”', '"', regex=True)

In [5]:
def _split_sub_clauses(category, text):
    """Split one buy-side or sell-side table cell into clause records.

    Sub-category names are the double-quoted strings found in ``text``;
    blocks inside the cell are separated by blank lines (``"\\n\\n"``).

    Returns a list of ``[category, sub_category, clause]`` lists:
      * no quoted name: one record, with ``category`` reused as the
        sub-category and the whole cell as the clause;
      * exactly one quoted name: one record whose clause is every block
        after the first, joined by spaces;
      * as many blocks as quoted names: names and blocks paired in order;
      * otherwise: a block that is fully quoted is a heading whose clause is
        the following blocks up to the next block that starts or ends with a
        quote; a block that merely starts with a quote is a record of its
        own, named after its first quoted string.
    """
    sub_categories = re.findall('"([^"]*)"', text)

    # handle empty rows / cells without any quoted sub-category
    if not sub_categories:
        return [[category, category, text]]

    if len(sub_categories) == 1:
        return [[category, sub_categories[0], " ".join(text.split('\n\n')[1:])]]

    blocks = text.split("\n\n")
    if len(blocks) == len(sub_categories):
        return [[category, name, block] for name, block in zip(sub_categories, blocks)]

    records = []
    for position, block in enumerate(blocks):
        if block.startswith('"') and block.endswith('"'):
            # Heading block: gather the body blocks that follow it.
            body = []
            for following in blocks[position + 1:]:
                if following.startswith('"') or following.endswith('"'):
                    break
                body.append(following)
            records.append([category, block.replace('"', ''), " ".join(body)])
        elif block.startswith('"'):
            # Heading and body share one block.
            inline_names = re.findall('"([^"]*)"', block)
            records.append([category, inline_names[0], block])
    return records


# Buy-side records come before sell-side records for each table row; the
# frame is built once from the accumulated list.
std_cls_records = []
for _, row in doc_df.iterrows():
    std_cls_records.extend(_split_sub_clauses(row['category'], row['buy_cls']))
    std_cls_records.extend(_split_sub_clauses(row['category'], row['sell_cls']))

std_cls_df = pd.DataFrame(std_cls_records, columns=['category','sub_category','clause'])

std_cls_df = preprocessing_master(std_cls_df)

## 
## CUAD Standard Clauses (Master Document)

In [22]:
# Load the CUAD question sheet, replace non-letter characters in the category
# names with spaces, and keep only the categories that have a mapping in
# CUAD_to_kronicle.
cuad_std_cls_df = (
    pd.read_excel('CUAD_clauses.xlsx', index_col=None)
    .assign(category=lambda frame: frame['category'].replace(r'[^a-zA-Z ]', ' ', regex=True))
    .loc[lambda frame: frame['category'].isin(list(CUAD_to_kronicle.keys()))]
    .reset_index(drop=True)
)

Unnamed: 0,category,question,kronicle
0,Governing Law,"Highlight the parts (if any) of this contract related to ""Governing Law"" that should be reviewed by a lawyer. Details: Which state/country\s law governs the interpretation of the contract?",Governing Law
1,Non Compete,"Highlight the parts (if any) of this contract related to ""Non-Compete"" that should be reviewed by a lawyer. Details: Is there a restriction on the ability of a party to compete with the counterparty or operate in a certain geography or business or technology sector?\xa0",Non Compete Clause
2,No Solicit Of Customers,"Highlight the parts (if any) of this contract related to ""No-Solicit Of Customers"" that should be reviewed by a lawyer. Details: Is a party restricted from contracting or soliciting customers or partners of the counterparty whether during the contract or after the contract ends (or both)?",Non Solicitation
3,Termination For Convenience,"Highlight the parts (if any) of this contract related to ""Termination For Convenience"" that should be reviewed by a lawyer. Details: Can a party terminate this\xa0 contract without cause (solely by giving a notice and allowing a waiting\xa0 period to expire)?",Termination for Convenience
4,Anti Assignment,"Highlight the parts (if any) of this contract related to ""Anti-Assignment"" that should be reviewed by a lawyer. Details: Is consent or notice required of a party if the contract is assigned to a third party?",Anti Assignment
5,Post Termination Services,"Highlight the parts (if any) of this contract related to ""Post-Termination Services"" that should be reviewed by a lawyer. Details: Is a party subject to obligations after the termination or expiration of a contract including any post-termination transition payment transfer of IP wind-down last-buy or similar commitments?",Effects of Termination
6,Audit Rights,"Highlight the parts (if any) of this contract related to ""Audit Rights"" that should be reviewed by a lawyer. Details: Does a party have the right to\xa0 audit the books records or physical locations of the counterparty to ensure compliance with the contract?",Right to Audit
7,Cap On Liability,"Highlight the parts (if any) of this contract related to ""Cap On Liability"" that should be reviewed by a lawyer. Details: Does the contract include a cap on liability upon the breach of a party’s obligation? This includes time limitation for the counterparty to bring claims or maximum amount for recovery.",Limitation to Liability


## 
## Importing Contract Clauses (Test Document)

In [7]:
# Contracts 1-5: read and pre-process, in the order listed.
contract_df1, contract_df2, contract_df3, contract_df4, contract_df5 = map(
    read_contract,
    (
        "contracts/000000.SOG.Goods.MasterSupplyAgmt.docx",
        "contracts/000000.SOG.WaterTreatmentEquipment.SupplyAgreement.docx",
        "contracts/000323.SOG.LowAlkaliProduct.SupplyAgreement.docx",
        "contracts/010400.SOG.CoffeeProducts.SupplyAgmt.docx",
        "contracts/021024.SOG.Tiles.SupplyAgreement.docx",
    ),
)

Unnamed: 0,clause,tokens
0,WHEREAS Buyer and Seller desire to have Seller provide certain goods and services to Buyer pursuant to the terms and conditions of this Agreement,buyer seller desire seller provide certain good service buyer pursuant term condition agreement
1,NOW THEREFORE in consideration of the mutual covenants contained herein and other good and valuable consideration the Parties hereby agree as follows,consideration mutual covenant contain good valuable consideration party agree follow
2,a Buyer means Engility Corporation and its affiliates as applicable,buyer mean engility corporation affiliate applicable
3,b Effective Date means the date this Agreement becomes fully binding and enforceable and shall be the same as the Effective Time,b effective date mean date agreement fully binding enforceable shall effective time
4,e New Work means any contemplated agreement for the sale of goods and services between Buyer and Seller which is not a part of the Existing Work and which is a Teaming Commitment,e new work mean contemplated agreement sale good service buyer seller existing work teaming commitment
5,f Seller means L Communications Corporation and its affiliates as applicable,f seller mean l communications corporation affiliate applicable
6,g Teaming Agreement means the agreement substantially in the form of Appendix D hereto which provides in part a binding commitment between Buyer and Seller to either exclusively or non exclusively pursue the capture of certain programs contemplated to be awarded by Buyer s customers,g teaming agreement mean agreement substantially form appendix d hereto provide binding commitment buyer seller exclusively non exclusively pursue capture certain program contemplate award buyer s customer
7,h Teaming Commitment means each of the business pursuits expressly set forth in Appendix C hereto wherein the parties have agreed to enter or have entered into either an exclusive or non exclusive teaming agreement such exclusivity is as set forth within Appendix C wherein the Parties have agreed to work together to capture a program contract award from Buyer s customer s,h team commitment mean business pursuit expressly set forth appendix c hereto party agree enter enter exclusive non exclusive team agreement exclusivity set forth appendix c party agree work capture program contract award buyer s customer s
8,General Subject to the terms and conditions of this Agreement Buyer and Seller agree to the sale of goods and services for Existing Work and New Work and to enter into certain Teaming Commitments,general subject term condition agreement buyer seller agree sale good service exist work new work enter certain teaming commitments
9,a Within days after the Effective Date or such other period of time as the Parties may otherwise agree in writing Buyer shall issue to Seller a purchase order or an amendment thereto for each of the Existing Work as set forth in Appendix A The terms and conditions of such purchase order shall be based on i the terms and conditions contained in the IDWA or other agreement for Existing Work between the Parties as of the Effective Time ii the terms and conditions required to be flowed down from Buyer s prime contract and iii this Agreement and any other terms and conditions mutually agreed to by the Parties In the event the Parties do not agree on such other terms and conditions referred to in clause iii of the preceding sentence the terms and conditions contained in the General Terms and Conditions of Purchase attached as Appendix B hereto will apply The terms and conditions of Article a i and a ii above shall take precedence over the terms set forth in Appendix B,day effective date period time parties agree write buyer shall issue seller purchase order amendment thereto existing work set forth appendix term condition purchase order shall base term condition contain idwa agreement exist work parties effective time ii term condition require flow buyer s prime contract iii agreement term condition mutually agree party event party agree term condition refer clause iii precede sentence term condition contain general terms condition purchase attach appendix b hereto apply term condition article ii shall precedence term set forth appendix b


In [8]:
# Contracts 6-10: read and pre-process, in the order listed.
contract_df6, contract_df7, contract_df8, contract_df9, contract_df10 = map(
    read_contract,
    (
        "contracts/181213.SOG.SaleOfRehabilitationRobots.SaleOfGoodsAgmt.docx",
        "contracts/190207.SOG.Products.PurchaseAgmt.docx",
        "contracts/191101.SOG.FoodServiceMarkets.SupplyAgmt.docx",
        "contracts/191120.SOG.HempExtracts.SupplyAgmt.docx",
        "contracts/191127.SOG.HempBiomass.SupplyAgmt.docx",
    ),
)

In [9]:
# Contracts 11-15: read and pre-process, in the order listed.
contract_df11, contract_df12, contract_df13, contract_df14, contract_df15 = map(
    read_contract,
    (
        "contracts/200203.SOG.PharmaceuticalProducts.SupplyAgmt.docx",
        "contracts/200817.SOG.COV-19Vaccines.SupplyAgmt.docx",
        "contracts/201500.SOG.SoftwareProducts.SupplyAgmt.docx",
        "contracts/980810.SOG.FlashMemoryDevice.MasterPurchaseAgreement.docx",
        "contracts/SOG.Equipment.AgreementforSaleofEquipment.docx",
    ),
)

In [10]:
# Contracts 16-20.
# NOTE(review): these are the same five files that are loaded into
# contract_df1 .. contract_df5; each one is read and pre-processed again here.
contract_df16, contract_df17, contract_df18, contract_df19, contract_df20 = map(
    read_contract,
    (
        "contracts/000000.SOG.Goods.MasterSupplyAgmt.docx",
        "contracts/000000.SOG.WaterTreatmentEquipment.SupplyAgreement.docx",
        "contracts/000323.SOG.LowAlkaliProduct.SupplyAgreement.docx",
        "contracts/010400.SOG.CoffeeProducts.SupplyAgmt.docx",
        "contracts/021024.SOG.Tiles.SupplyAgreement.docx",
    ),
)

In [11]:
# Contracts 21-25.
# NOTE(review): these are the same five files that are loaded into
# contract_df1 .. contract_df5; each one is read and pre-processed again here.
contract_df21, contract_df22, contract_df23, contract_df24, contract_df25 = map(
    read_contract,
    (
        "contracts/000000.SOG.Goods.MasterSupplyAgmt.docx",
        "contracts/000000.SOG.WaterTreatmentEquipment.SupplyAgreement.docx",
        "contracts/000323.SOG.LowAlkaliProduct.SupplyAgreement.docx",
        "contracts/010400.SOG.CoffeeProducts.SupplyAgmt.docx",
        "contracts/021024.SOG.Tiles.SupplyAgreement.docx",
    ),
)

In [12]:
# Contracts 26-30.
# NOTE(review): these are the same five files that are loaded into
# contract_df1 .. contract_df5; each one is read and pre-processed again here.
contract_df26, contract_df27, contract_df28, contract_df29, contract_df30 = map(
    read_contract,
    (
        "contracts/000000.SOG.Goods.MasterSupplyAgmt.docx",
        "contracts/000000.SOG.WaterTreatmentEquipment.SupplyAgreement.docx",
        "contracts/000323.SOG.LowAlkaliProduct.SupplyAgreement.docx",
        "contracts/010400.SOG.CoffeeProducts.SupplyAgmt.docx",
        "contracts/021024.SOG.Tiles.SupplyAgreement.docx",
    ),
)

## 
## Readying Pyterrier environment

In [13]:
# NOTE(review): machine-specific JDK location; adjust it for the host that
# runs the notebook.
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk-11.0.14.jdk/Contents/Home"

if not pt.started():
    pt.init()

# Compare the version numerically. A plain string comparison orders
# '0.10.0' before '0.8' and would reject newer releases.
_pt_version = tuple(int(part) for part in re.findall(r'\d+', pt.__version__)[:2])
assert _pt_version >= (0, 8), "PyTerrier 0.8.0 or newer is required"

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


## 
## Common functions for model running and results processing

In [14]:
# Top-level categories of interest and the sub-categories that belong to each.
top_categories = {
    'Indemnification': [
        'Indemnity',
        'Intellectual Property Rights Indemnity',
    ],
    'Termination': [
        'Termination for Cause',
        'Termination for Convenience',
        'Termination for Material Breach',
    ],
    'Effects of Termination': ['Effects of Termination'],
    'Limitation of Liability': [
        'Limitation on Amount',
        'Limitation to Liability',
    ],
    'Disclaimer': ['DISCLAIMER'],
    'Confidential Information': ['Confidential Information'],
    'Intellectual Property Rights': ['Intellectual Property Rights'],
    'Operative Clauses': [
        'Late Delivery and Penalty',
        'Performance Guarantee',
        'Insurance',
    ],
}

# Flat list of every sub-category above, in the same order.
top_sub_categories = [
    sub_category
    for members in top_categories.values()
    for sub_category in members
]


def get_missing_categories(df, groups):
    """Return the set of ``df.docno`` values that are not group keys of ``groups``."""
    known = set(df.docno)
    matched = set(groups.groups)
    return known - matched


def get_top_missing_categories(df, groups):
    """List the missing sub-categories that belong to a top category.

    Returns a frame with columns ``category`` (the ``top_categories`` key)
    and ``sub_category``, one row for each missing sub-category that appears
    in one of the ``top_categories`` lists.
    """
    columns = ['category', 'sub_category']

    rows = [
        [top_level, sub_category]
        for sub_category in get_missing_categories(df, groups)
        for top_level, members in top_categories.items()
        if sub_category in members
    ]

    return pd.concat(
        [pd.DataFrame(columns=columns), pd.DataFrame(rows, columns=columns)],
        ignore_index=True,
    )


def missing_category_results(master_df, index, queries):
    """Build a side-by-side table of missing top categories per weighting model.

    For each model returned by ``similarity_models`` only the hits with
    ``rank < 5`` are considered. The per-model tables are placed next to each
    other under a two-level column header whose first level is the model name.
    """
    per_model = {}
    for model_name, retrieved in similarity_models(index, queries).items():
        top_hits = retrieved[retrieved['rank'] < 5]
        table = get_top_missing_categories(master_df, top_hits.groupby(by='docno'))
        table.columns = pd.MultiIndex.from_product([[model_name], table.columns])
        per_model[model_name] = table

    return pd.concat(per_model.values(), axis = 1)


def similarity_model(index, weighting_model, queries):
    """Run ``queries`` against ``index`` with one PyTerrier weighting model and return the results."""
    retriever = pt.BatchRetrieve(index, wmodel=weighting_model)
    return retriever.transform(queries)


def create_queries(contract_df):
    """Turn a pre-processed contract frame into a query frame.

    Returns a new frame with an added ``qid`` column (``'q'`` followed by the
    row's index label plus one) and with ``tokens`` renamed to ``query``.
    The input frame is left untouched.
    """
    qids = ['q' + str(label + 1) for label in contract_df.index.values]
    return contract_df.assign(qid=qids).rename(columns={"tokens": "query"})


def convert(number):
    """Round ``number`` to two decimal places and return it as a string."""
    rounded = round(number, 2)
    return str(rounded)


def join(value):
    """Format a frame of hits as ``'<docno> : <score>'`` lines.

    Scores are rounded with ``convert``; the cells of each row are joined
    with ``' : '`` and the rows with ``' \\n '``. ``value`` is not modified.
    """
    hits = value.copy()
    hits['score'] = hits['score'].apply(convert)

    lines = (
        hits.apply(' : '.join, axis=1)
        .reset_index()
        .rename({0: 'docno'}, axis='columns')
    )
    return ' \n '.join(lines['docno'])


def model_results(groups, result, model):
    """Add one model's top hits per query to ``result`` as a new column.

    For every query group the hits are de-duplicated on ``docno`` and the
    first five are kept and formatted with ``join``. The resulting
    ``clauseid`` / ``clause`` / ``<model>`` table is outer-merged into
    ``result`` on ``clauseid`` and ``clause``.
    """
    rows = []
    for qid in groups.groups:
        top_hits = (
            groups.get_group(qid)
            .drop_duplicates(subset=['docno'])
            .head()
            .drop(columns=['docid', 'rank'])
        )
        summary = (
            top_hits.groupby(by=['qid', 'clause', 'query'])[['docno', 'score']]
            .apply(join)
            .reset_index()
            .rename({0: 'docno'}, axis='columns')
        )
        rows.extend(
            [entry['qid'], entry['clause'], entry['docno']]
            for _, entry in summary.iterrows()
        )

    measure = pd.DataFrame(rows, columns=['clauseid', 'clause', model])
    return result.merge(measure, how='outer', on=["clauseid", 'clause'])


def run_queries(index, queries):
    """Run every default weighting model and collect the formatted hits.

    Returns one row per clause with the columns ``clauseid``, ``clause`` and
    one column per model produced by ``similarity_models``.
    """
    combined = pd.DataFrame(columns=['clauseid', 'clause'])
    for model_name, retrieved in similarity_models(index, queries).items():
        per_query = retrieved.groupby(by='qid')
        combined = model_results(per_query, combined, model_name)
    return combined


def similarity_models(index, queries, measures=['TF_IDF', 'BM25', 'PL2', 'Hiemstra_LM']):
    """Run ``queries`` with each weighting model named in ``measures``.

    Returns a dict that maps each model name to the result of
    ``similarity_model`` for that model, in the order given.
    """
    return {
        name: similarity_model(index, name, queries)
        for name in measures
    }


def categories_to_excel(filename, sheetname, results):
    """Write ``results`` to a formatted Excel sheet using xlsxwriter.

    Args:
        filename: path of the workbook to create.
        sheetname: name of the worksheet that receives the data.
        results: DataFrame to export; its column labels become the header row.

    The data is written from the second row down without pandas' own header;
    the header row is then written by hand. Column A is 10 wide and columns
    B-F are 40 wide, all with text wrapping.

    The writer is used as a context manager so the file is saved and closed
    even if formatting fails. ``ExcelWriter.save()`` and the ``encoding``
    argument of ``to_excel`` no longer exist in pandas 2.x, so neither is
    used here.
    """
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        results.to_excel(writer, sheet_name=sheetname, startrow=1, startcol=0,
                         header=False, index=False)
        workbook = writer.book
        worksheet = writer.sheets[sheetname]

        format_header = workbook.add_format()
        format_header.set_align('center')
        format_header.set_bold()
        format_header.set_text_wrap()
        format_header.set_border()

        format_data = workbook.add_format()
        format_data.set_text_wrap()

        worksheet.set_column('A:A', 10, format_data)
        worksheet.set_column('B:F', 40, format_data)
        worksheet.set_row(1, 150, format_data)
        worksheet.set_row(0, 20, format_header)

        # Write the header manually
        for colx, value in enumerate(results.columns.values):
            worksheet.write(0, colx, value)

## Readying queries

In [15]:
# One query frame per contract, built with create_queries in contract order.
queries1, queries2, queries3, queries4, queries5 = map(
    create_queries,
    (contract_df1, contract_df2, contract_df3, contract_df4, contract_df5),
)

queries6, queries7, queries8, queries9, queries10 = map(
    create_queries,
    (contract_df6, contract_df7, contract_df8, contract_df9, contract_df10),
)

queries11, queries12, queries13, queries14, queries15 = map(
    create_queries,
    (contract_df11, contract_df12, contract_df13, contract_df14, contract_df15),
)

queries16, queries17, queries18, queries19, queries20 = map(
    create_queries,
    (contract_df16, contract_df17, contract_df18, contract_df19, contract_df20),
)

queries21, queries22, queries23, queries24, queries25 = map(
    create_queries,
    (contract_df21, contract_df22, contract_df23, contract_df24, contract_df25),
)

queries26, queries27, queries28, queries29, queries30 = map(
    create_queries,
    (contract_df26, contract_df27, contract_df28, contract_df29, contract_df30),
)

## 
## Methods to run CUAD Model

In [16]:
# !git clone https://github.com/TheAtticusProject/cuad.git
# !mv cuad cuad-training
# !unzip cuad-training/data.zip -d cuad-data/
# !mkdir cuad-models
# !curl https://zenodo.org/record/4599830/files/roberta-base.zip?download=1 --output cuad-models/roberta-base.zip
# !unzip cuad-models/roberta-base.zip -d cuad-models/
# !pip install torch
# !pip install transformers

### Note
To make the cuad-models folder available, uncomment and run the cell above.
If the curl command fails, download the file manually from the URL and then run the rest of the commands.

In [17]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

from scripts.predict import run_prediction

In [18]:
# Load the question-answering model and its tokenizer from the local
# cuad-models checkout.
_cuad_model_dir = './cuad-models/roberta-base/'
model = AutoModelForQuestionAnswering.from_pretrained(_cuad_model_dir)
tokenizer = AutoTokenizer.from_pretrained(_cuad_model_dir, use_fast=False)

In [19]:
def run_queries_cuad(index, queries):
    """Run every default weighting model and collect the matched categories.

    Same shape as ``run_queries``, but each model column holds the
    comma-separated category names produced by ``model_results_cuad``.
    """
    combined = pd.DataFrame(columns=['clauseid', 'clause'])
    for model_name, retrieved in similarity_models(index, queries).items():
        per_query = retrieved.groupby(by='qid')
        combined = model_results_cuad(per_query, combined, model_name)
    return combined


def join_cuad(value):
    """Return the ``docno`` entries of ``value`` joined with ``', '``."""
    return ', '.join(docno for docno in value['docno'])


def model_results_cuad(groups, result, model):
    """Add one model's top category names per query to ``result``.

    For every query group the hits are de-duplicated on ``docno`` and the
    first five are kept; their ``docno`` values are joined with
    ``join_cuad``. The resulting ``clauseid`` / ``clause`` / ``<model>``
    table is outer-merged into ``result`` on ``clauseid`` and ``clause``.
    """
    rows = []
    for qid in groups.groups:
        top_hits = (
            groups.get_group(qid)
            .drop_duplicates(subset=['docno'])
            .head()
            .drop(columns=['docid', 'rank'])
        )
        summary = (
            top_hits.groupby(by=['qid', 'clause', 'query'])[['docno']]
            .apply(join_cuad)
            .reset_index()
            .rename({0: 'docno'}, axis='columns')
        )
        rows.extend(
            [entry['qid'], entry['clause'], entry['docno']]
            for _, entry in summary.iterrows()
        )

    measure = pd.DataFrame(rows, columns=['clauseid', 'clause', model])
    return result.merge(measure, how='outer', on=["clauseid", 'clause'])


def filter_categories(index, queries):
    """Keep the clauses whose PL2 matches include a category known to CUAD.

    Runs ``run_queries_cuad``, discards the TF_IDF, BM25 and Hiemstra_LM
    columns, and keeps the rows whose ``PL2`` text (with non-letter
    characters removed) contains at least one key of ``kronicle_to_CUAD``.

    Returns:
        A frame with the columns ``clauseid``, ``clause`` and ``PL2`` and a
        renumbered index.

    Rows without a PL2 value (NaN left by the outer merges in
    ``model_results_cuad``) are treated as non-matching. Without
    ``na=False`` the mask would contain NaN and the boolean indexing would
    raise.
    """
    filter_df = run_queries_cuad(index, queries).drop(columns=['TF_IDF', 'BM25', 'Hiemstra_LM'])

    # Strip the separators so multi-word category names can be matched as-is.
    cleaned = filter_df['PL2'].replace(r'[^a-zA-Z ]', '', regex=True)

    pattern = '|'.join(list(kronicle_to_CUAD.keys()))
    matches = cleaned.str.contains(pattern, na=False)

    return filter_df[matches].reset_index(drop=True)
    
    
def get_questions(df):
    """Map each clause id to the CUAD questions for its matched categories.

    ``row['PL2']`` is split on commas; each piece is reduced to letters and
    spaces, stripped, and looked up in the ``kronicle`` column of the
    module-level ``cuad_std_cls_df``. The ``question`` of the first matching
    row is collected. Clauses with no matching question are left out.
    """
    questions = {}
    for _, row in df.iterrows():
        matched = []
        for raw_name in row['PL2'].split(','):
            name = re.sub(r'[^A-Za-z ]+', '', raw_name).strip()
            hits = cuad_std_cls_df[cuad_std_cls_df['kronicle'] == name]
            if len(hits) > 0:
                matched.append(hits.iloc[0]['question'])

        if len(matched) > 0:
            questions[row['clauseid']] = matched

    return questions


def get_predictions(df):
    """Run the CUAD model on every clause that has at least one question.

    Args:
        df: frame with ``clauseid``, ``clause`` and ``PL2`` columns, as
            returned by ``filter_categories``.

    Returns:
        dict mapping each clause id to whatever ``run_prediction`` returns
        for that clause's questions and text.

    Only the questions collected for the current clause are passed to
    ``run_prediction``. The previous code passed the whole ``questions``
    dict and never used the per-clause list, so the model received every
    clause id in place of that clause's questions.
    (``run_prediction`` comes from ``scripts.predict``; presumably its first
    argument is an iterable of question texts - TODO confirm.)

    The wall-clock start and end time of each clause is printed.
    """
    questions = get_questions(df)

    predictions = {}
    for clause_id, clause_questions in questions.items():
        print("Query Start Time =", datetime.now().strftime("%H:%M:%S"))

        query = df.loc[df['clauseid'] == clause_id, 'clause'].iloc[0]
        predictions[clause_id] = run_prediction(clause_questions, query, 'cuad-models/roberta-base/')

        print("Query End Time =", datetime.now().strftime("%H:%M:%S"))
        print()

    return predictions