In [1]:
import guidance
import pandas as pd
import torch
import transformers
from langchain import HuggingFacePipeline, LLMChain, PromptTemplate
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Load the filtered complaint/summary dataset and preview the first rows.
filtered_data_path = '../data/filtered_data.csv'
df = pd.read_csv(filtered_data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,pdf_link,summary,text_extracted,text_extracted_len,summary_len,summary_sentences_len,text_extracted_sentences_len
0,0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,1 | MARC J. FAGEL (Cal. Bar No. 154425) CARY S...,2125,392,20,201.0
1,1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,1 of 10 \nCase 9:08-cv-81565-DTKH Document 1 E...,3169,621,37,218.0
2,2,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE TE...,2433,459,19,193.0
3,3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...,7015,494,22,491.0
4,4,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,10f6 \n08-61524-CIV-DIMITROULEAS/ROSENBAUM \nU...,1270,356,16,107.0


In [3]:
# Load the sample (summary, text) pairs that the checker is run on below,
# then preview the first rows.
checker_input_path = '../data/sample_input_for_checker1.csv'
df1 = pd.read_csv(checker_input_path)
df1.head()

Unnamed: 0.1,Unnamed: 0,summary,text
0,0,CORRECTEDThe Securities and Exchange Commissio...,-4. COMPLAINT SEC V. ABED ET AL. \n22. On July...
1,1,"technology company Genesis Microchip, Inc. wit...",STM’s securities are registered with the Commi...
2,2,"The Commission alleges that Elias Antoun, who ...",This case involves unlawful insider trading in...
3,3,The SEC also charged Antoun's childhood friend...,"Defendants Antoun and Abed, directly or | indi..."
4,4,"Both Antoun and Abed, who netted profits of ap...",The letter proposed \n| that STM would acquire...


In [4]:
# Despite its name, `model` holds the checkpoint identifier string, not a
# loaded model object; it is passed to the tokenizer here and to
# transformers.pipeline in the next cell.
model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)

In [5]:
# Options that control text generation, kept together so they are easy to tune.
# NOTE(review): do_sample=True and no seed is set in the visible cells, so
# repeated runs may not reproduce the recorded outputs exactly.
generation_kwargs = dict(
    do_sample=True,
    top_k=10,
    temperature=0.1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# Build the text-generation pipeline for the checkpoint named in `model`,
# reusing the tokenizer loaded above and loading weights in bfloat16.
# NOTE(review): device_map=0 presumably places the model on device 0 — confirm.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map=0,
    trust_remote_code=True,
    **generation_kwargs,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed to an LLMChain.
llm = HuggingFacePipeline(pipeline=pipeline)

In [25]:
# The two answer labels the prompt asks the model to choose between.
# NOTE(review): `options` is not referenced by any later cell visible here —
# confirm whether it is still needed.
options = ['True', 'False']
# Prompt text for the checker. `{source}` and `{summary}` are placeholders;
# a later cell wraps this string in a PromptTemplate with those two input
# variables. Whitespace inside the string is part of the model input.
# NOTE(review): "explaination" below is a misspelling of "explanation".
# It is left untouched here because editing the prompt changes what the
# model sees and may change the recorded outputs.
template = """

You are a compliance officer who works at a financial institution. You will be provided with a summary sentence and a set of source sentences. 
Check if the summary sentence is a good summary of the source sentences from Named Entity and Named Entity Relationship perspectives.
Please answer either "True" or "False" only, explaination is not needed.

Source sentences: ```{source}```
Summary sentence: ```{summary}```

Final Answer (True/False only):
           """

In [33]:
# Pick one row of the checker input to try the prompt on before the batch run.
example_row = 8
source = df1.loc[example_row, 'text']
summary = df1.loc[example_row, 'summary']

In [34]:
# Show the source sentences of the selected example as plain text.
print(source)

Antoun also purchased 2,500 shares of Genesis stock in his friend’s account on November 12, 2007.Abed sold 3,344 shares of Genesis stock on December 21 and 26, 2007 and tendered the rest of his shares in January 2008.Antoun sold the shares he had purchased for profits of nearly $34,000, while Abed | sold all of his Genesis stock and call option contracts for profits of over $50,000.26. On October 29 and November 2, 2007, Abed purchased 600 shares and 400 shares, respectively, of Genesis common stock.On July 21, 2006, Antoun purchased 900 shares of Genesis stock in his friend’s account in 
advance of a Genesis earnings announcement.On December 10, 2007, before the merger was announced publicly, Abed purchased an additional 10,000 shares of Genesis common stock and 70 additional Genesis call option contracts.Antoun purchased 9,750 shares of Genesis stock in his relative’s account in multiple transactions on November 12, 13 and 19, and December 3, 2007.The next day, the stock opened at $1

In [35]:
# Show the summary sentence of the selected example as plain text.
print(summary)

After news of the merger was announced on December 11, 2007, Genesis's stock price skyrocketed 57 percent.


In [36]:
# Bind the prompt text to its two placeholders, then pair the prompt with the
# wrapped LLM so the chain can be called with `source` and `summary`.
prompt = PromptTemplate(
    input_variables=["source", "summary"],
    template=template,
)
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [37]:
# Run the chain once on the selected example and print the raw generation.
chain_inputs = {"source": source, "summary": summary}
output = llm_chain.run(**chain_inputs)
print(output)

 True


In [39]:
# Run the checker chain on every (source, summary) pair in df1 and collect the
# raw generations, in row order.
#
# The columns are iterated directly rather than looked up with df1['text'][i]:
# that form is a label lookup, so a positional counter only works while the
# index is exactly 0..n-1. Loop-local names are used so the `source`,
# `summary` and `output` values from the single-example cells above are not
# overwritten. `tqdm` is imported in the import cell at the top.
outputs = [
    llm_chain.run(source=row_source, summary=row_summary)
    for row_source, row_summary in tqdm(
        zip(df1['text'], df1['summary']),
        total=len(df1),  # zip has no len(), so pass the row count for the bar
    )
]

100%|██████████| 17/17 [01:37<00:00,  5.73s/it]


In [40]:
# Display the raw generations collected by the batch run, one per df1 row.
outputs

[' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True',
 ' False',
 ' True',
 ' True',
 ' True',
 ' True',
 ' True']

In [None]:
# Store the checker verdicts next to the rows they were generated from.
# The raw generations carry surrounding whitespace (e.g. ' True'), so they are
# stripped to let the column be compared directly against 'True' / 'False'.
df1['output'] = [verdict.strip() for verdict in outputs]
# Preview df1, the frame that was just modified; the original cell showed
# `df`, which never receives the new column.
df1.head()