In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [2]:
pdf_path = r'C:\Users\dell\Desktop\MyDocs\Docs\MK\reliance_disney_agreement.pdf'

In [3]:
!pip install prettytable



In [4]:
from prettytable import PrettyTable 
def print_pretty_table(extracted_values):
    """
    Prints the extracted key-value pairs in a formatted table.

    Args:
        extracted_values (dict): Dictionary of extracted key-value pairs.
    """
    # Create a PrettyTable instance
    table = PrettyTable()

    # Set column names
    table.field_names = ["Key", "Value"]

    # Add rows
    for key, value in extracted_values.items():
        table.add_row([key, value])

    # Print the table
    print(table)

In [5]:
!pip install PyPDF2 PyMuPDF pdfplumber



In [6]:
import fitz  # PyMuPDF
import pdfplumber
from PyPDF2 import PdfReader
import re

def extract_text_from_pdf(pdf_path, method="fitz"):
    """
    Extracts text from a PDF using the specified library (fitz, pdfplumber, or PyPDF2).

    Args:
        pdf_path (str): Path to the PDF file.
        method (str): The library to use for extraction ('fitz', 'pdfplumber', 'pypdf2').
                      Defaults to 'fitz' (PyMuPDF).

    Returns:
        str: Extracted and cleaned text from the PDF.
    """
    try:
        raw_text = ""
        
        # pdfplumber method
        if method == "pdfplumber":
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    raw_text += page.extract_text() or ""

        # PyPDF2 method
        elif method == "pypdf2":
            reader = PdfReader(pdf_path)
            for page in reader.pages:
                raw_text += page.extract_text() or ""

        # PyMuPDF (fitz) method
        else:
            document = fitz.open(pdf_path)
            for page_num in range(len(document)):
                page = document[page_num]
                raw_text += page.get_text()
            document.close()

        if not raw_text.strip():
            raise ValueError("No extractable text found in the PDF.")
        else:
            return raw_text

    except Exception as e:
        return f"Error processing the PDF with {method}: {str(e)}"

In [7]:
file_path = r'C:\Users\dell\Desktop\MyDocs\Docs\MK\reliance_disney_ocr.txt'

In [8]:
# Function to read ground truth from a file
def read_ground_truth(file_path):
    try:
        with open(file_path, 'r') as file:
            ground_truth = file.read().strip() 
        return ground_truth
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return None

In [9]:
import re

def preprocess_text(text):
    """
    Preprocesses the input text by:
    - Removing extra whitespaces
    - Normalizing newlines
    - Converting to lowercase
    - Optional: Removing punctuation or stopwords
    """
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces and trim
    text = text.replace('\n', ' ')  # Replace newlines with a space

    # Convert to lowercase
    text = text.lower()

    # Optional: Remove punctuation (if needed)
    # text = re.sub(r'[^\w\s]', '', text)  # Uncomment if punctuation removal is desired

    return text

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

def word_match_accuracy(extracted_text, ground_truth):
    # Preprocess both texts
    extracted_text = preprocess_text(extracted_text)
    ground_truth = preprocess_text(ground_truth)

    # Convert the texts into sets of words
    extracted_words = set(extracted_text.split())
    ground_truth_words = set(ground_truth.split())

    # Calculate precision, recall, and F1-score
    intersection = len(extracted_words & ground_truth_words)
    precision = intersection / len(extracted_words) if len(extracted_words) > 0 else 0
    recall = intersection / len(ground_truth_words) if len(ground_truth_words) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

In [11]:
!pip install python-Levenshtein



In [12]:
import Levenshtein

def levenshtein_accuracy(extracted_text, ground_truth):
    # Preprocess both texts
    extracted_text = preprocess_text(extracted_text)
    ground_truth = preprocess_text(ground_truth)
    
    # Compute Levenshtein distance and return the similarity ratio
    distance = Levenshtein.distance(extracted_text, ground_truth)
    max_len = max(len(extracted_text), len(ground_truth))
    
    # Similarity ratio
    similarity_ratio = 1 - (distance / max_len)
    return similarity_ratio

In [13]:
import chardet

with open(file_path, 'rb') as file:
    raw_data = file.read()
    result = chardet.detect(raw_data)
    encoding = result['encoding']

with open(file_path, 'r', encoding=encoding) as file:
    ground_truth = file.read()

In [14]:
extracted_text = extract_text_from_pdf(pdf_path, method="fitz")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 98.32%
Precision: 0.953, Recall: 0.953, F1-Score: 0.953


In [15]:
extracted_text = extract_text_from_pdf(pdf_path, method="pdfplumber")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 98.50%
Precision: 0.948, Recall: 0.952, F1-Score: 0.950


In [16]:
extracted_text = extract_text_from_pdf(pdf_path, method="pypdf2")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 97.85%
Precision: 0.880, Recall: 0.903, F1-Score: 0.892


In [17]:
# Clean the text
def clean_txt(raw_text):
    cleaned_text = re.sub(r"(page \d+ of \d+)", "", raw_text, flags=re.IGNORECASE)  # Remove page numbers
    cleaned_text = re.sub(r"(\n\s*\n)|(\r\n|\r|\n)", "\n", cleaned_text)  # Remove extra newlines
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Normalize whitespace
    cleaned_text = re.sub(r"[^\x00-\x7F₹$\s]+", " ", cleaned_text) # Remove non-ASCII characters
    normalized_text = cleaned_text.strip()

    return normalized_text

In [18]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 3.4 MB/s eta 0:00:04
     --- ------------------------------------ 1.0/12.8 MB 3.1 MB/s eta 0:00:04
     ----- ---------------------------------- 1.8/12.8 MB 3.4 MB/s eta 0:00:04
     -------- ------------------------------- 2.6/12.8 MB 3.6 MB/s eta 0:00:03
     ----------- ---------------------------- 3.7/12.8 MB 3.9 MB/s eta 0:00:03
     --------------- ------------------------ 5.0/12.8 MB 4.3 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 4.5 MB/s eta 0:00:02
     ---------------------- ----------------- 7.1/12.8 MB 4.5 MB/s eta 0:00:02
     ------------------------- -------------- 8.1/12.8 MB 4.6 MB/s eta 0:00:02
     ----------------------------- ------

In [19]:
import spacy
import re

# Loading the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

def key_value_extraction(text, keys):
    """
    Extract key values from a cleaned contract text based on the provided keys.

    Parameters:
    - text (str): Cleaned and preprocessed contract text.
    - keys (list, optional): A list of keys (strings) to extract. If None, all keys will be extracted.

    Returns:
    - dict: A dictionary containing the extracted key-value pairs.
    """
    # Process the text with spaCy for Named Entity Recognition (NER)
    doc = nlp(text)

    # Extract company names using NER (ORG - Organizations)
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    # Extract dates using regex pattern for dates
    date_pattern = r"\d{1,2}[a-z]{2}\s[A-Za-z]+\s\d{4}"
    dates = re.findall(date_pattern, text)

    # Extract financial values (e.g., investment amounts, transaction values) using regex
    amount_pattern = r"₹[\d,]+(?:\s*crore|\s*\(.*\))"
    amounts = re.findall(amount_pattern, text)

    # Define helper function to check for exclusive rights
    def check_exclusive_rights(text):
        """
        Function to check if the contract mentions exclusive rights.
        Returns True if exclusive rights are granted, otherwise False.
        """
        # Keywords or phrases that indicate exclusive rights
        exclusive_rights_keywords = ["exclusive rights", "granted exclusive rights", "exclusive distribution rights"]
        
        # Search for any of the keywords in the text
        for keyword in exclusive_rights_keywords:
            if re.search(rf"{keyword}", text, re.IGNORECASE):
                return True
        
        # If no keywords are found, return False
        return False

    # Check if the contract mentions exclusive rights
    exclusive_rights = check_exclusive_rights(text)

    # Prepare the default extracted information, note that the below config depends upon predefined
    # keywords and it can be done better for more domains
    extracted_info = {
        keys[0]: companies[0] if len(companies) > 0 else None,  # First company (Party 1)
        keys[1]: companies[1] if len(companies) > 1 else None,  # Second company (Party 2)
        keys[2]: dates[0] if dates else None,  # First date found
        keys[3]: amounts[0] if amounts else None,  # First investment amount found
        keys[4]: amounts[2] if len(amounts) > 1 else None,  # Second transaction value found
        keys[5]: "Yes" if exclusive_rights else "No"  # Boolean value indicating exclusive rights
    }

    # Return the filtered dictionary of extracted information
    return extracted_info

In [20]:
def save_to_csv(key_values, output_file):
    """
    Saves the extracted key-value pairs to a CSV file.

    Args:
        extracted_values (dict): Dictionary of extracted key-value pairs.
        output_file (str): Path to the output CSV file.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        for key, value in key_values.items():
            writer.writerow([key, value])

In [21]:
def save_to_csv(key_values, output_file):
    """
    Saves the extracted key-value pairs to a CSV file.

    Args:
        extracted_values (dict): Dictionary of extracted key-value pairs.
        output_file (str): Path to the output CSV file.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        for key, value in key_values.items():
            writer.writerow([key, value])

In [22]:
# Domain-specific patterns
domain_rules = {
    "Contracts": {
        "Name of the 1st Party": r"(?:First Party|Party 1|Party One):?\s*([^\n,]+)",
        "Name of the 2nd Party": r"(?:Second Party|Party 2|Party Two):?\s*([^\n,]+)",
        "Contract Start Date": r"(?:Effective Date|Start Date|Commencement):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Contract End Date": r"(?:End Date|Termination Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Scope of Work": r"(?:Scope of Work|Services):?\s*([^\n]+)",
        "Penalty Amount": r"(?:Penalty|Fine):?\s*\$?([\d,]+)"
    },
    "Finance": {
        "Transaction Amount": r"(?:Amount|Transaction):?\s*\$?([\d,]+)",
        "Date of Transaction": r"(?:Transaction Date|Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Account Number": r"(?:Account Number|Acc No):?\s*([^\n,]+)",
        "Bank Name": r"(?:Bank|Financial Institution):?\s*([^\n,]+)"
    },
    "Legal": {
        "Case Number": r"(?:Case Number|Case ID):?\s*([^\n,]+)",
        "Filing Date": r"(?:Filing Date|Date Filed):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Petitioner": r"(?:Petitioner|Claimant):?\s*([^\n,]+)",
        "Respondent": r"(?:Respondent|Defendant):?\s*([^\n,]+)",
        "Court Name": r"(?:Court|Jurisdiction):?\s*([^\n,]+)"
    },
    "HR": {
        "Employee Name": r"(?:Employee Name|Name):?\s*([^\n,]+)",
        "Employee ID": r"(?:Employee ID|ID):?\s*([^\n,]+)",
        "Joining Date": r"(?:Joining Date|Start Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Department": r"(?:Department|Team):?\s*([^\n,]+)",
        "Salary": r"(?:Salary|Compensation):?\s*\$?([\d,]+)"
    },
    "Invoices": {
        "Invoice Number": r"(?:Invoice Number|Invoice ID):?\s*([^\n,]+)",
        "Invoice Date": r"(?:Invoice Date|Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Client Name": r"(?:Client Name|Customer Name):?\s*([^\n,]+)",
        "Total Amount": r"(?:Total Amount|Total):?\s*\$?([\d,]+)",
        "Due Date": r"(?:Due Date|Payment Due):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})"
    }
}

In [23]:
from transformers import pipeline
import re

# Step 1: Load the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Step 2: Custom Summarization Function based on Keys and Values
def custom_summarization(text, keys, values):
    """
    Summarize text by focusing on specific keys and values.
    
    Args:
    - text (str): The full text to summarize.
    - keys (list): List of keys (such as 'Investment Amount', 'Party 1', etc.)
    - values (list): List of corresponding values (such as 'Reliance', '₹11,500 crore', etc.)
    
    Returns:
    - summary (str): A summary that focuses on the provided keys and values.
    """
    
    relevant_text = ""
    
    # Try to match key-value pairs and extract related text
    for key, value in zip(keys, values):
        # Find sentences containing the key and its value (this is a simple heuristic)
        pattern = rf"([^.]*{re.escape(value)}[^.]*\.)"
        matches = re.findall(pattern, text)
        
        # Combine the relevant sentences into the relevant_text
        relevant_text += " ".join(matches)
    
    # If no relevant text is found, use the whole text for summarization
    if not relevant_text:
        relevant_text = text
    
    # Summarize the extracted relevant text
    summary = summarizer(relevant_text, max_length=150, min_length=50, do_sample=False)
    return summary[0]['summary_text']




Device set to use cpu


In [24]:
extracted_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_txt(extracted_text)
text = cleaned_text

keys = ['Party 1', 'Party 2', 'Date of Announcement', 'Investment', 'Transaction Value', 'Exclusive Rights']

key_values = key_value_extraction(text, keys)

values = [i for i in key_values.values()]

In [25]:
print_pretty_table(key_values)

+----------------------+--------------------------+
|         Key          |          Value           |
+----------------------+--------------------------+
|       Party 1        |         Reliance         |
|       Party 2        | the Joint Venture Disney |
| Date of Announcement |    28th February 2024    |
|      Investment      |      ₹11,500 crore       |
|  Transaction Value   |      ₹70,352 crore       |
|   Exclusive Rights   |           Yes            |
+----------------------+--------------------------+


In [26]:
def save_to_csv(key_values, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8', errors='ignore') as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        for key, value in key_values.items():
            writer.writerow([key, value])

In [27]:
media_text = cleaned_text

custom_summary = custom_summarization(media_text, keys, values)

print(custom_summary)

Reliance Industries Limited, Viacom 18 Media Private Limited and The Walt Disney Company (NYSE:DIS) ( Disney ) today announced the signing of binding definitive agreements to form a joint venture ( JV) Reliance is India s largest private sector company, with a consolidated revenue of Rs 9,74,864 crore (US$118.5 billion) Reliances to invest ₹11,500 crore in the Joint Venture Disney to provide Content License to the Joint venture Mumbai / Burbank, Calif.


In [28]:
import csv
from datetime import datetime

# Function to log user corrections
def log_correction(original_summary, user_correction, feedback, keys, values):
    """
    Log corrections made by the user into a CSV file.

    Args:
    - original_summary (str): The summary generated by the model.
    - user_correction (str): The corrected summary provided by the user.
    - feedback (str): Additional user feedback or comments.
    - keys (list): List of keys used in the extraction process.
    - values (list): List of values used in the extraction process.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "original_summary": original_summary,
        "user_correction": user_correction,
        "feedback": feedback,
        "keys": ", ".join(keys),
        "values": ", ".join(map(str, values))
    }
    
    # Append to a CSV file
    with open("corrections_log.csv", mode="a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=log_entry.keys())
        
        # Write the header if the file is new
        if file.tell() == 0:
            writer.writeheader()
        
        writer.writerow(log_entry)

In [29]:
keys

['Party 1',
 'Party 2',
 'Date of Announcement',
 'Investment',
 'Transaction Value',
 'Exclusive Rights']

In [30]:
original_summary = custom_summary
user_correction = input()
feedback = input()

# keys are predefined
keys = keys

values = input()

log_correction(original_summary, user_correction, feedback, keys, values)

 Test1
 Test2
 10/20/2024


In [31]:
import pandas as pd

# Load corrections from the CSV file
def load_corrections(file_path="corrections_log.csv"):
    """
    Load corrections logged by users into a DataFrame.

    Args:
    - file_path (str): Path to the corrections log CSV file.

    Returns:
    - DataFrame: User corrections and feedback.
    """
    return pd.read_csv(file_path)

In [32]:
corrections_df = load_corrections()
print(corrections_df.head())

                    timestamp  \
0  2025-01-08T22:30:11.114439   
1  2025-01-08T22:33:52.313421   

                                    original_summary user_correction feedback  \
0  Reliance Industries Limited, Viacom 18 Media P...            Test    Test2   
1  Reliance Industries Limited, Viacom 18 Media P...           Test1    Test2   

                                                keys  \
0  Party 1, Party 2, Date of Announcement, Invest...   
1  Party 1, Party 2, Date of Announcement, Invest...   

                         values  
0  1, 0, /, 2, 0, /, 2, 0, 2, 4  
1  1, 0, /, 2, 0, /, 2, 0, 2, 4  


In [33]:
!pip install accelerate>=0.26.0

In [36]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# Load pre-trained model and tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Prepare data for fine-tuning
def prepare_data(df):
    """
    Prepare input-output pairs for fine-tuning the summarization model.

    Args:
    - df (DataFrame): DataFrame containing original and corrected summaries.

    Returns:
    - dict: Tokenized input and labels for fine-tuning.
    """
    inputs = tokenizer(list(df["original_summary"]), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    labels = tokenizer(list(df["user_correction"]), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Load corrections and prepare data
corrections_df = load_corrections()
data = prepare_data(corrections_df)

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_bart",
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10,
    save_total_limit=2,
    logging_dir="./logs"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data
)

# Start fine-tuning
trainer.train()

KeyboardInterrupt: 