In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
!pip install llama-index
!pip install transformers accelerate bitsandbytes



In [None]:
from google.colab import drive
# Mount Google Drive; every data path used by this notebook lives under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM

# bitsandbytes settings handed to the model loader through `model_kwargs`:
# 4-bit loading, "nf4" quantization type, double quantization, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)


def messages_to_prompt(messages):
  """Render chat messages as one Zephyr-style prompt string.

  Every message whose role equals 'system', 'user' or 'assistant' becomes a
  "<|role|>" header line, the message content, and a closing "</s>" line.
  Messages with any other role are dropped. If the rendered text does not
  start with a system turn, an empty one is prepended. The result always ends
  with an open "<|assistant|>" turn for the model to complete.

  Args:
    messages: iterable of objects exposing `.role` and `.content`.

  Returns:
    The assembled prompt as a single string.
  """
  known_roles = ("system", "user", "assistant")
  turns = []
  for msg in messages:
    # Match with ==, checking the roles in the same order as an if/elif chain.
    tag = next((role for role in known_roles if msg.role == role), None)
    if tag is not None:
      turns.append(f"<|{tag}|>\n{msg.content}</s>\n")

  rendered = "".join(turns)

  # Guarantee a (possibly empty) system turn at the very start.
  if not rendered.startswith("<|system|>\n"):
    rendered = "<|system|>\n</s>\n" + rendered

  # Leave the assistant turn open so generation continues from here.
  return rendered + "<|assistant|>\n"


# The same Hugging Face checkpoint id is used for both the model and its tokenizer.
_zephyr_id = "HuggingFaceH4/zephyr-7b-alpha"

# Wrapper applied to plain (non-chat) queries: empty system turn, user turn, open assistant turn.
_zephyr_query_wrapper = PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n")

# NOTE(review): temperature / top_k / top_p are sampling parameters; presumably they
# only take effect when sampling is enabled (`do_sample=True`) — confirm against the
# transformers generation documentation before relying on them.
llm = HuggingFaceLLM(
    model_name=_zephyr_id,
    tokenizer_name=_zephyr_id,
    query_wrapper_prompt=_zephyr_query_wrapper,
    messages_to_prompt=messages_to_prompt,
    context_window=3900,
    max_new_tokens=800,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    device_map="auto",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
from llama_index import ServiceContext

# Bundle the LLM with a locally loaded BGE-small embedding model.
service_context = ServiceContext.from_defaults(
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

# Drive folder containing the judgement texts.
# NOTE(review): this name shadows the built-in `dir()`; it is kept unchanged in case
# other cells read it.
dir = '/content/drive/MyDrive/test-data/judgement'



Downloading (…)lve/main/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample judgement excerpt (raw text, reproduced as-is) used as the user message
# for the one-off summarisation check in the next cell.
data = '''o. 196 of 1954.
Under article 32 of the Constitution of India for the enforcement of fundamental rights.
P. R. Das and K. P. Khaitan, (B. Sen, A. K. Mukherjea and B. P. Maheswari, with them) for the petitioner.
C. K. Daphtary, Solicitor General for lndia, (Porus A. Mehta and P. G. Gokhale, with ', him) for the respondents.
May 28.
The Judgment of the Court was delivered by I MEHR CHAND MAHAJAN C.J.
The principal question canvassed in this case is whether certain sections of the Taxation on Income (Investigation Commission) Act, 1947, i.e., Act XXX of 1947, have become void from the date of the commencement of the Constitution of India by reason of article 14 of the ' Constitution.
The petitioner, Suraj Mall Mohta & Co. Ltd., is a company registered under the Indian Companies Act.
Suraj Mall Mohta is also the managing director of another company Messrs. Jute and Gunny Brokers Ltd. A reference had been made by the Central Government under the provisions of section 5(1) of the Act before 58 450 1st September, 1948, of the case of Messrs. Jute and Gunny Brokers Ltd. to the Investigation Commission appointed under Act XXX of 1947.
During the investigation of that case which was numbered 831/30 in the records of the Commission, and during the investigation of some other cases similarly referred to the Commission, it was said to have been discovered that the petitioner company had made secret profits which it had not disclosed and had thus evaded taxa tion.
On the 28th August, 1953, a report to this effect was made by the Commission to the Central Government under the provisions of section 5(4) of the Act requesting that the case of the petitioner along with the cases of Suraj Mall Mohta and other members of his family may be referred to the Commission for investigation.
On the 9th September, 1953, the Central Government referred these cases to the Investigation Commission under the provisions of section 5(4) of the Act and these were numbered 831/64 69 on the records of the Commission.
On the 15th of September, 1953, the Commission notified the petitioners that their cases had been referred for investigation and they were called upon to furnish certain material, as detailed in Annexure "B" of the petition, to the Commission.
On the 12th April, 1954, the present petition under article 32 of the Constitution was filed for the issue of appropriate writs restraining the Commission from taking any action against the petitioner under the provisions of Act XXX of 1947, on the ground that the provisions of sections 5(1), 5(4),6,7 and 8 of Act XXX of 1947, had become void, being discriminatory in character after the coming into force of the Constitution of India.
In order to appreciate the respective contentions raised and canvassed before us on behalf of the petitioner company and the State, it is necessary to set out some of the relevant provisions of the Act.
The object of the Act as stated in its Preamble was to ascertain whether the actual incidence of taxation on income in recent years had been in accordance with 451 provisions of law and whether the procedure for assessment and recovery of tax, was adequate to prevent evasion thereof.
Section 3 authorizes the Central Government to constitute a Commission, to be called the Income tax Investigation Commission, it,% duty, being (a) to investigate and report to the Central Government on all matters relating to taxation on income, with particular reference to the extent to which the existing law relating to, and procedure for, the assessment and collection of such taxation is adequate to prevent the evasion thereof; (b) to investigate in accordance with the provisions of this Act any case or points in a case referred to it under section 5.'''

In [None]:
class Message:
    """Minimal chat message: a `role` label plus the message `content`."""

    def __init__(self, role, content):
        # Both values are stored unchanged as plain attributes.
        self.role, self.content = role, content

# One-off check: summarise the sample excerpt using a system turn plus a user turn.
m1 = Message("system", "You are an AI that can summarise text")
m2 = Message("user", data)
M = [m1, m2]

chat = llm.chat(M)
print(chat)



assistant: The case involves a company, Suraj Mall Mohta & Co. Ltd., which was investigated by the Income Tax Investigation Commission for secret profits and evasion of taxation. The Commission discovered this during the investigation of another case and requested the Central Government to refer the case of Suraj Mall Mohta & Co. Ltd. for investigation. The company filed a petition under article 32 of the Constitution, arguing that the provisions of Act XXX of 1947, which authorized the Commission's investigation, had become void from the date of the Constitution's commencement due to their discriminatory nature. The company's contentions were canvassed before the court, and the relevant provisions of the Act were set out. The object of the Act was to ascertain whether the existing law and procedure for taxation on income were adequate to prevent evasion.


In [None]:
# Remove the first "assistant: " label from the response text before printing it.
x = str(chat).replace('assistant: ', '', 1)
print(x)

The case involves a company, Suraj Mall Mohta & Co. Ltd., which was investigated by the Income Tax Investigation Commission for secret profits and evasion of taxation. The Commission discovered this during the investigation of another case and requested the Central Government to refer the case of Suraj Mall Mohta & Co. Ltd. for investigation. The company filed a petition under article 32 of the Constitution, arguing that the provisions of Act XXX of 1947, which authorized the Commission's investigation, had become void from the date of the Constitution's commencement due to their discriminatory nature. The company's contentions were canvassed before the court, and the relevant provisions of the Act were set out. The object of the Act was to ascertain whether the existing law and procedure for taxation on income were adequate to prevent evasion.


In [None]:
import os

import torch

data_path = '/content/drive/MyDrive/test-data/judgement'
res_path = '/content/drive/MyDrive/predictions'
# Create the output folder up front so the first write cannot fail on a missing directory.
os.makedirs(res_path, exist_ok=True)

file_list = os.listdir(data_path)
skipped_files = []  # inputs whose generation ran out of GPU memory

for name in file_list:
  source_file = os.path.join(data_path, name)
  # Ignore sub-directories and other non-file entries.
  if not os.path.isfile(source_file):
    continue

  # Read the judgement and close the file before the (slow) generation step.
  with open(source_file, 'r', encoding='utf-8') as file:
    content = file.read()

  # Reuse the system message `m1` and pair it with this judgement as the user turn.
  m3 = Message(role="user", content=content)
  message = [m1, m3]
  try:
    chat = llm.chat(message)
  except torch.cuda.OutOfMemoryError:
    # A single oversized input must not abort the whole batch.
    chat = None

  if chat is None:
    # Clean up outside the except clause, once the exception is no longer being handled.
    torch.cuda.empty_cache()
    skipped_files.append(name)
    print(f'Skipped {name}: CUDA out of memory while generating the summary')
    continue

  # Drop the first "assistant: " role label so only the summary text is saved.
  res = str(chat).replace('assistant: ', '', 1)
  with open(os.path.join(res_path, name), 'w', encoding='utf-8') as file:
    file.write(res)

if skipped_files:
  print(f'{len(skipped_files)} file(s) skipped after running out of GPU memory: {skipped_files}')


# with open(data_path+'/'+file_list[0], "r") as file:
#   content = file.read()
#   m3 = Message(role="user", content=content)
#   message = [m1,m3]
#   chat = llm.chat(message)
#   print(chat)

OutOfMemoryError: ignored

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
import re
from collections import Counter
from bert_score import score



# Running totals for the evaluation metrics, all starting at zero
total_bert_score = total_precision_score = total_recall_score = total_f1_score = 0

# Lists to store precision and recall for BERTScore
precision_list, recall_list = [], []



In [None]:
!mkdir pred

mkdir: cannot create directory ‘pred’: File exists


In [None]:
import os
directory_path = '/content/drive/MyDrive/predictions'
validation_path = '/content/drive/MyDrive/test-data/summary/'
file_list = os.listdir(directory_path)

# Read each generated summary together with the reference summary of the same name.
# Note: `text` / `validation_text` only hold the last pair once the loop finishes.
for file_name in file_list:
    file_path = os.path.join(directory_path, file_name)
    validation_file = os.path.join(validation_path, file_name)

    # Check if the item is a file (not a subdirectory)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    with open(validation_file, "r", encoding="utf-8") as file:
        validation_text = file.read()

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import os
from rouge import Rouge
from bert_score import score

# Define the directory paths
directory_path = '/content/drive/MyDrive/predictions'
validation_path = '/content/drive/MyDrive/test-data/summary/'
# Sorted so the "File N" numbering in the report is stable between runs.
file_list = sorted(os.listdir(directory_path))

# Running totals and per-file scores
total_precision_score = 0.0
total_recall_score = 0.0
total_f1_score = 0.0
total_bert_score = 0.0
precision_list = []
recall_list = []
bert_score_list = []

# Texts gathered in the loop and scored with BERTScore in one batched call afterwards
candidates = []
references = []

# Initialize the ROUGE scorer
rouge = Rouge()

for file_name in file_list:
    file_path = os.path.join(directory_path, file_name)
    validation_file = os.path.join(validation_path, file_name)

    # Only regular files are scored; sub-directories are skipped.
    if not os.path.isfile(file_path):
        continue
    print(file_path)

    # Read the generated summary and its reference summary
    with open(file_path, "r", encoding="utf-8") as file:
        candidate = file.read()
    with open(validation_file, "r", encoding="utf-8") as file:
        reference = file.read()

    # Calculate ROUGE scores (hypothesis first, reference second)
    rouge_scores = rouge.get_scores(candidate, reference)

    # In the scorer's result, "p" is precision, "r" is recall and "f" is the F1 score.
    precision = rouge_scores[0]["rouge-1"]["p"]
    recall = rouge_scores[0]["rouge-1"]["r"]
    f1_score = rouge_scores[0]["rouge-1"]["f"]

    # Update total ROUGE scores
    total_precision_score += float(precision)
    total_recall_score += float(recall)
    total_f1_score += float(f1_score)

    # Append precision and recall to the lists
    precision_list.append(float(precision))
    recall_list.append(float(recall))

    # Keep the pair for the batched BERTScore computation
    candidates.append(candidate)
    references.append(reference)

# Average over the files that were actually scored, not over every directory entry.
num_files = len(precision_list)

if num_files == 0:
    print(f'No prediction files found in {directory_path}; nothing to score.')
else:
    # One batched call loads the BERTScore model once instead of once per file;
    # a small batch size keeps peak GPU memory modest for long documents.
    P, R, F1 = score(candidates, references, lang="en", batch_size=8)
    bert_score_list = F1.tolist()
    total_bert_score = sum(bert_score_list)

    # Calculate average ROUGE scores
    avg_precision_score = total_precision_score / num_files
    avg_recall_score = total_recall_score / num_files
    avg_f1_score = total_f1_score / num_files

    # Calculate average BERTScore
    avg_bert_score = total_bert_score / num_files

    print(f'Average ROUGE Precision Score: {avg_precision_score:.6f}')
    print(f'Average ROUGE Recall Score: {avg_recall_score:.6f}')
    print(f'Average ROUGE F1 Score: {avg_f1_score:.6f}')
    print(f'Average BERTScore: {avg_bert_score:.6f}')

    # Print precision and recall for ROUGE and BERTScore
    for i, (precision, recall, bert_score) in enumerate(zip(precision_list, recall_list, bert_score_list)):
        print(f'File {i + 1} - ROUGE Precision: {precision:.6f}, ROUGE Recall: {recall:.6f}, BERTScore: {bert_score:.6f}')


/content/drive/MyDrive/predictions/2593.txt


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


/content/drive/MyDrive/predictions/5141.txt


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average ROUGE Precision Score: 0.244219
Average ROUGE Recall Score: 0.680820
Average ROUGE F1 Score: 0.351031
Average BERTScore: 0.861921
File 1 - ROUGE Precision: 0.321300, ROUGE Recall: 0.659259, BERTScore: 0.873549
File 2 - ROUGE Precision: 0.167139, ROUGE Recall: 0.702381, BERTScore: 0.850293
