## Setup & Installation

In [24]:
!unzip /content/CUAD_v1.zip -d /content/CUAD

Archive:  /content/CUAD_v1.zip
   creating: /content/CUAD/CUAD_v1/
  inflating: /content/CUAD/CUAD_v1/CUAD_v1.json  
  inflating: /content/CUAD/CUAD_v1/CUAD_v1_README.txt  
   creating: /content/CUAD/CUAD_v1/full_contract_pdf/
   creating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/
   creating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/
  inflating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf  
  inflating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf  
  inflating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf  
  inflating: /content/CUAD/CUAD_v1/full_contract_pdf/Part_I/Affiliate_Agreements/LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-1

In [25]:
!pip install -q transformers datasets torch torchvision torchaudio
!pip install -q scikit-learn pandas numpy matplotlib seaborn plotly
!pip install -q spacy nltk textblob wordcloud
!pip install -q accelerate sentencepiece protobuf
!pip install -q pdfplumber PyPDF2 python-docx

!python -m spacy download en_core_web_sm

import warnings
warnings.filterwarnings('ignore')

print("✓ Installation complete")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-an

## Import Libraries

In [26]:
import os
import json
import re
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter, defaultdict
from tqdm.auto import tqdm

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    pipeline,
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

nlp = spacy.load('en_core_web_sm')

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ Libraries imported")

Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB
✓ Libraries imported


## Load CUAD Dataset

In [27]:
CUAD_PATH = "/content/CUAD/CUAD_v1"

def load_cuad_data(base_path):
    contracts = []
    labels_data = []

    full_contract_txt = os.path.join(base_path, "full_contract_txt")
    label_group_path = os.path.join(base_path, "label_group_xlsx")

    if os.path.exists(full_contract_txt):
        txt_files = glob.glob(os.path.join(full_contract_txt, "*.txt"))
        for txt_file in tqdm(txt_files, desc="Loading contracts"):
            try:
                with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                    contract_name = os.path.basename(txt_file).replace('.txt', '')
                    contracts.append({
                        'contract_id': contract_name,
                        'text': text,
                        'length': len(text),
                        'word_count': len(text.split())
                    })
            except Exception as e:
                print(f"Error reading {txt_file}: {e}")

    if os.path.exists(label_group_path):
        xlsx_files = glob.glob(os.path.join(label_group_path, "*.xlsx"))
        for xlsx_file in xlsx_files:
            try:
                df = pd.read_excel(xlsx_file)
                label_type = os.path.basename(xlsx_file).replace('Label Report - ', '').replace('.xlsx', '')
                df['label_type'] = label_type
                labels_data.append(df)
            except Exception as e:
                print(f"Error reading {xlsx_file}: {e}")

    contracts_df = pd.DataFrame(contracts)

    if labels_data:
        labels_df = pd.concat(labels_data, ignore_index=True)
    else:
        labels_df = pd.DataFrame()

    return contracts_df, labels_df

contracts_df, labels_df = load_cuad_data(CUAD_PATH)

print(f"Loaded {len(contracts_df)} contracts")
print(f"Loaded {len(labels_df)} label annotations")
print(f"\nContracts DataFrame shape: {contracts_df.shape}")
print(f"Labels DataFrame shape: {labels_df.shape}")

print("\nContract columns:", contracts_df.columns.tolist())
if len(labels_df) > 0:
    print("Labels columns:", labels_df.columns.tolist())

contracts_df.head()

Loading contracts:   0%|          | 0/510 [00:00<?, ?it/s]

Loaded 510 contracts
Loaded 5183 label annotations

Contracts DataFrame shape: (510, 4)
Labels DataFrame shape: (5183, 49)

Contract columns: ['contract_id', 'text', 'length', 'word_count']
Labels columns: ['Filename', 'ROFR-ROFO-ROFN', 'label_type', 'Change of Control', 'Anti-assignment', 'Joint IP Ownership', 'Revenue-Profit Sharing', 'Agreement Date', 'Agreement Date-Answer', 'Effective Date', 'Effective Date-Answer', 'Expiration Date', 'Expiration Date-Answer', 'Renewal Term', 'Renewal Term-Answer', 'Notice Period to Terminate Renewal', 'Notice Period to Terminate Renewal- Answer', 'Most Favored Nation', 'Price Restrictions', 'License Grant', 'Non-Transferable License', 'Affiliate License-Licensor', 'Affiliate License-Licensee', 'Irrevocable or Perpetual License', 'Minimum Commitment', 'Parties', 'Parties-Answer', 'Volume Restriction', 'No-Solicit of Employees', 'Warranty Duration', 'Post-termination Services', 'Insurance', 'Source Code Escrow', 'Non-Disparagement', 'Document Name'

Unnamed: 0,contract_id,text,length,word_count
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,1 ...,49518,5558
1,PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL...,Exhibit 10.2 INTELLECTUAL PROPERTY AGREEMENT...,23282,3596
2,IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1...,Exhibit 99.1 [***] = Certain confidential info...,73860,11394
3,PacificapEntertainmentHoldingsInc_20051115_8-K...,PACIFICAP ENTERTAINMENT Agreement with THE HEN...,18138,2500
4,"FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...",Exhibit 99.4 STRATEGIC ALLIANCE AGREEMENT ...,37829,5761


## Data Exploration & Visualization

In [28]:
print("="*80)
print("DATASET STATISTICS")
print("="*80)

print(f"\nTotal Contracts: {len(contracts_df)}")
print(f"Average Contract Length: {contracts_df['length'].mean():,.0f} characters")
print(f"Average Word Count: {contracts_df['word_count'].mean():,.0f} words")
print(f"Median Contract Length: {contracts_df['length'].median():,.0f} characters")
print(f"Max Contract Length: {contracts_df['length'].max():,.0f} characters")
print(f"Min Contract Length: {contracts_df['length'].min():,.0f} characters")

if len(labels_df) > 0:
    print(f"\nTotal Label Annotations: {len(labels_df)}")
    print(f"Unique Label Types: {labels_df['label_type'].nunique()}")
    print("\nLabel Type Distribution:")
    print(labels_df['label_type'].value_counts())

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Contract Length Distribution', 'Word Count Distribution',
                    'Contract Length (Log Scale)', 'Label Type Distribution'),
    specs=[[{"type": "histogram"}, {"type": "histogram"}],
           [{"type": "histogram"}, {"type": "bar"}]]
)

fig.add_trace(
    go.Histogram(x=contracts_df['length'], nbinsx=50, name='Length'),
    row=1, col=1
)

fig.add_trace(
    go.Histogram(x=contracts_df['word_count'], nbinsx=50, name='Words'),
    row=1, col=2
)

fig.add_trace(
    go.Histogram(x=np.log10(contracts_df['length'] + 1), nbinsx=50, name='Log Length'),
    row=2, col=1
)

if len(labels_df) > 0:
    label_counts = labels_df['label_type'].value_counts()
    fig.add_trace(
        go.Bar(x=label_counts.index, y=label_counts.values, name='Labels'),
        row=2, col=2
    )

fig.update_layout(height=800, showlegend=False, title_text="CUAD Dataset Overview")
fig.update_xaxes(title_text="Characters", row=1, col=1)
fig.update_xaxes(title_text="Words", row=1, col=2)
fig.update_xaxes(title_text="Log10(Characters)", row=2, col=1)
fig.update_xaxes(tickangle=-45, row=2, col=2)
fig.show()

print("\nSample Contract Preview:")
print("="*80)
print(contracts_df['text'].iloc[0][:1000])
print("...")

DATASET STATISTICS

Total Contracts: 510
Average Contract Length: 52,563 characters
Average Word Count: 7,861 words
Median Contract Length: 33,143 characters
Max Contract Length: 338,211 characters
Min Contract Length: 645 characters

Total Label Annotations: 5183
Unique Label Types: 28

Label Type Distribution:
label_type
Parties                                                        510
Document Name                                                  510
Governing Law                                                  510
Dates (Group 1)                                                501
Anti-assignment, CIC (Group 3)                                 376
Uncapped Liability (Group 5)                                   275
Post-Termination Services                                      271
Audit Rights                                                   271
Licenses (Group 4)                                             261
Non-Compete, Exclusivity, No-Solicit of Customers (Group 2)    241
Termi


Sample Contract Preview:
1                                                                    Exhibit 10.16

                                                                 CONFIDENTIAL Portions of this Exhibit have been omitted pursuant to a request for confidential treatment. The omitted portions, marked by [****], have been separately filed with the Commission.

                           SPONSORSHIP AGREEMENT

This agreement ("Agreement") is entered into as of the 23rd day of September, 1997 ("Effective Date"), by and between Excite, Inc., a California corporation, located at 555 Broadway, Redwood City, California 94063 ("Excite"), and N2K Inc., a Pennsylvania corporation, located at 55 Broad Street, 26th Floor, New York, New York, 10004 ("Sponsor").

                                 RECITALS

A.   Excite maintains a site on the Internet at http://www.excite.com (the      "Excite Site") and owns and/or manages related Web Sites worldwide      (collectively, the "Excite Network") 

## Text Preprocessing

In [30]:
class LegalTextPreprocessor:
    def __init__(self):
        self.nlp = nlp

    def clean_text(self, text):
        if not isinstance(text, str):
            return ""
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    def segment_contract(self, text, max_length=512):
        doc = self.nlp(text[:1000000])
        sentences = [sent.text.strip() for sent in doc.sents]

        segments = []
        current_segment = []
        current_length = 0

        for sent in sentences:
            sent_length = len(sent.split())
            if current_length + sent_length > max_length and current_segment:
                segments.append(' '.join(current_segment))
                current_segment = [sent]
                current_length = sent_length
            else:
                current_segment.append(sent)
                current_length += sent_length

        if current_segment:
            segments.append(' '.join(current_segment))

        return segments

    def extract_entities(self, text):
        doc = self.nlp(text[:100000])
        entities = {
            'organizations': [],
            'persons': [],
            'money': [],
            'dates': [],
            'locations': []
        }

        for ent in doc.ents:
            if ent.label_ == 'ORG':
                entities['organizations'].append(ent.text)
            elif ent.label_ == 'PERSON':
                entities['persons'].append(ent.text)
            elif ent.label_ in ['MONEY', 'CARDINAL']:
                entities['money'].append(ent.text)
            elif ent.label_ == 'DATE':
                entities['dates'].append(ent.text)
            elif ent.label_ in ['GPE', 'LOC']:
                entities['locations'].append(ent.text)

        return entities

preprocessor = LegalTextPreprocessor()

print("Processing contracts...")
contracts_df['cleaned_text'] = contracts_df['text'].apply(preprocessor.clean_text)
contracts_df['segments'] = contracts_df['cleaned_text'].apply(
    lambda x: preprocessor.segment_contract(x, max_length=256)
)
contracts_df['num_segments'] = contracts_df['segments'].apply(len)

print(f"✓ Processed {len(contracts_df)} contracts")
print(f"Average segments per contract: {contracts_df['num_segments'].mean():.1f}")

fig = px.histogram(
    contracts_df,
    x='num_segments',
    nbins=50,
    title='Distribution of Segments per Contract',
    labels={'num_segments': 'Number of Segments'},
    color_discrete_sequence=['steelblue']
)
fig.update_layout(height=400)
fig.show()

contracts_df[['contract_id', 'word_count', 'num_segments']].head(10)

Processing contracts...
✓ Processed 510 contracts
Average segments per contract: 35.1


Unnamed: 0,contract_id,word_count,num_segments
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,5558,25
1,PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL...,3596,16
2,IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1...,11394,49
3,PacificapEntertainmentHoldingsInc_20051115_8-K...,2500,12
4,"FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...",5761,26
5,IGENEBIOTECHNOLOGYINC_05_13_2003-EX-1-JOINT VE...,11330,51
6,ScansourceInc_20190822_10-K_EX-10.38_11793958_...,2044,9
7,HertzGroupRealtyTrustInc_20190920_S-11A_EX-10....,2236,10
8,SPORTHALEYINC_09_29_1997-EX-10.2-10-ENDORSEMEN...,1537,7
9,IntegrityFunds_20200121_485BPOS_EX-99.E UNDR C...,3784,19


## Zero-Shot NER with Legal-BERT

In [31]:
print("="*80)
print("ZERO-SHOT NER WITH LEGAL-BERT")
print("="*80)

MODEL_NER = "dslim/bert-base-NER"

try:
    ner_pipeline = pipeline(
        "ner",
        model=MODEL_NER,
        tokenizer=MODEL_NER,
        aggregation_strategy="simple",
        device=0 if torch.cuda.is_available() else -1
    )
    print(f"✓ Loaded NER model: {MODEL_NER}")
except Exception as e:
    print(f"Error loading primary model, trying alternative: {e}")
    MODEL_NER = "Jean-Baptiste/camembert-ner"
    ner_pipeline = pipeline(
        "ner",
        model=MODEL_NER,
        aggregation_strategy="simple",
        device=0 if torch.cuda.is_available() else -1
    )
    print(f"✓ Loaded alternative NER model: {MODEL_NER}")

def extract_legal_entities_zeroshot(text, max_length=512):
    entities = {
        'PERSON': [],
        'ORG': [],
        'LOC': [],
        'DATE': [],
        'MONEY': [],
        'OTHER': []
    }

    chunks = [text[i:i+max_length*4] for i in range(0, len(text), max_length*4)]

    for chunk in chunks[:3]:
        try:
            results = ner_pipeline(chunk)
            for entity in results:
                entity_type = entity['entity_group']
                entity_text = entity['word']
                score = entity['score']

                if entity_type in entities:
                    entities[entity_type].append({
                        'text': entity_text,
                        'score': score,
                        'start': entity.get('start', 0),
                        'end': entity.get('end', 0)
                    })
                else:
                    entities['OTHER'].append({
                        'text': entity_text,
                        'type': entity_type,
                        'score': score
                    })
        except Exception as e:
            continue

    return entities

print("\nExtracting entities from contracts...")
sample_size = min(50, len(contracts_df))
sample_contracts = contracts_df.head(sample_size)

all_entities = []
for idx, row in tqdm(sample_contracts.iterrows(), total=len(sample_contracts), desc="NER Extraction"):
    contract_text = row['text'][:5000]
    entities = extract_legal_entities_zeroshot(contract_text)

    all_entities.append({
        'contract_id': row['contract_id'],
        'entities': entities,
        'num_persons': len(entities['PERSON']),
        'num_orgs': len(entities['ORG']),
        'num_locations': len(entities['LOC']),
        'num_dates': len(entities['DATE']),
        'num_money': len(entities['MONEY']),
        'num_other': len(entities['OTHER'])
    })

entities_df = pd.DataFrame(all_entities)

print("\n" + "="*80)
print("NER EXTRACTION RESULTS")
print("="*80)
print(f"Processed {len(entities_df)} contracts")
print(f"\nAverage entities per contract:")
print(f"  Persons: {entities_df['num_persons'].mean():.2f}")
print(f"  Organizations: {entities_df['num_orgs'].mean():.2f}")
print(f"  Locations: {entities_df['num_locations'].mean():.2f}")
print(f"  Dates: {entities_df['num_dates'].mean():.2f}")
print(f"  Money: {entities_df['num_money'].mean():.2f}")

entity_counts = entities_df[['num_persons', 'num_orgs', 'num_locations', 'num_dates', 'num_money']].sum()

fig = go.Figure(data=[
    go.Bar(
        x=['Persons', 'Organizations', 'Locations', 'Dates', 'Money'],
        y=entity_counts.values,
        marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
    )
])
fig.update_layout(
    title='Total Named Entities Extracted',
    xaxis_title='Entity Type',
    yaxis_title='Count',
    height=500
)
fig.show()

print("\nSample Entity Extraction:")
print("="*80)
for i in range(min(3, len(entities_df))):
    print(f"\nContract: {entities_df['contract_id'].iloc[i]}")
    ents = entities_df['entities'].iloc[i]
    for entity_type, entity_list in ents.items():
        if entity_list and entity_type != 'OTHER':
            print(f"  {entity_type}:")
            for ent in entity_list[:3]:
                if isinstance(ent, dict):
                    print(f"    - {ent.get('text', ent)} (score: {ent.get('score', 0):.3f})")

entities_df.head()

ZERO-SHOT NER WITH LEGAL-BERT


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


✓ Loaded NER model: dslim/bert-base-NER

Extracting entities from contracts...


NER Extraction:   0%|          | 0/50 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



NER EXTRACTION RESULTS
Processed 50 contracts

Average entities per contract:
  Persons: 0.00
  Organizations: 36.38
  Locations: 8.92
  Dates: 0.00
  Money: 0.00



Sample Entity Extraction:

Contract: N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT
  ORG:
    - Commission (score: 0.999)
    - Excite, Inc (score: 0.998)
    - N2K Inc (score: 0.866)
  LOC:
    - California (score: 1.000)
    - Broadway (score: 0.990)
    - Redwood City (score: 0.996)

Contract: PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELLECTUAL PROPERTY AGREEMENT
  ORG:
    - Mar (score: 0.996)
    - ##v Enterprises, LLC (score: 0.930)
    - Limited Liability Company (score: 0.995)
  LOC:
    - Nevada (score: 0.999)
    - Colorado (score: 0.998)
    - U. S (score: 0.990)

Contract: IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1_Reseller Agreement
  ORG:
    - Securities and Exchange Commission (score: 0.999)
    - Exchange (score: 0.972)
    - IPA (score: 0.546)
  LOC:
    - IN (score: 0.424)
    - Bridge Parkway (score: 0.969)
    - Redwood Shores (score: 0.996)


Unnamed: 0,contract_id,entities,num_persons,num_orgs,num_locations,num_dates,num_money,num_other
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,"{'PERSON': [], 'ORG': [{'text': 'Commission', ...",0,39,9,0,0,16
1,PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL...,"{'PERSON': [], 'ORG': [{'text': 'Mar', 'score'...",0,18,5,0,0,35
2,IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1...,"{'PERSON': [], 'ORG': [{'text': 'Securities an...",0,62,8,0,0,18
3,PacificapEntertainmentHoldingsInc_20051115_8-K...,"{'PERSON': [], 'ORG': [{'text': '##CI', 'score...",0,2,10,0,0,8
4,"FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...","{'PERSON': [], 'ORG': [{'text': 'E', 'score': ...",0,25,5,0,0,10


## Multi-Perspective Risk Analysis Model Setup

In [32]:
print("="*80)
print("MULTI-PERSPECTIVE RISK ANALYSIS SETUP")
print("="*80)

RISK_MODEL = "nlpaueb/legal-bert-base-uncased"

tokenizer_risk = AutoTokenizer.from_pretrained(RISK_MODEL)
print(f"✓ Loaded tokenizer: {RISK_MODEL}")

RISK_CATEGORIES = {
    'termination': 'high',
    'liability': 'high',
    'indemnification': 'high',
    'penalty': 'high',
    'confidentiality': 'medium',
    'non-compete': 'medium',
    'dispute': 'medium',
    'notice': 'low',
    'governing law': 'low',
    'amendment': 'low',
    'payment': 'medium',
    'intellectual property': 'medium',
    'warranty': 'low',
    'force majeure': 'medium'
}

CLAUSE_PATTERNS = {
    'termination': r'\b(terminat|cancel|end|cease|discontinu)\w*\b',
    'liability': r'\b(liable|liability|responsible|damages)\b',
    'indemnification': r'\b(indemnif|hold harmless|defend)\w*\b',
    'confidentiality': r'\b(confidential|proprietary|secret|private)\w*\b',
    'payment': r'\b(payment|pay|fee|price|cost|invoice)\w*\b',
    'intellectual_property': r'\b(intellectual property|copyright|patent|trademark|ip)\b',
    'warranty': r'\b(warrant|guarantee|represent|assur)\w*\b',
    'governing_law': r'\b(governing law|jurisdiction|venue)\b',
    'dispute': r'\b(dispute|arbitration|litigation|conflict)\w*\b',
    'force_majeure': r'\b(force majeure|act of god|unavoidable)\b'
}

def classify_clause_type(text):
    text_lower = text.lower()
    matches = []

    for clause_type, pattern in CLAUSE_PATTERNS.items():
        if re.search(pattern, text_lower):
            matches.append(clause_type)

    if not matches:
        return 'general', 'low'

    primary_type = matches[0]
    risk_level = RISK_CATEGORIES.get(primary_type, 'low')

    return primary_type, risk_level

def analyze_perspective(text, clause_type, risk_level):
    text_lower = text.lower()

    party_a_indicators = {
        'favorable': ['right', 'entitle', 'may', 'option', 'discretion', 'approve'],
        'unfavorable': ['shall', 'must', 'require', 'obligation', 'liable', 'penalty']
    }

    party_b_indicators = {
        'favorable': ['compensation', 'payment', 'reimburs', 'benefit', 'protect'],
        'unfavorable': ['restrict', 'prohibit', 'forbid', 'prevent', 'limit']
    }

    party_a_score = sum(1 for word in party_a_indicators['favorable'] if word in text_lower)
    party_a_score -= sum(1 for word in party_a_indicators['unfavorable'] if word in text_lower)

    party_b_score = sum(1 for word in party_b_indicators['favorable'] if word in text_lower)
    party_b_score -= sum(1 for word in party_b_indicators['unfavorable'] if word in text_lower)

    perspectives = {
        'party_a': {
            'favorability': 'favorable' if party_a_score > 0 else ('unfavorable' if party_a_score < 0 else 'neutral'),
            'score': party_a_score,
            'risk_level': risk_level
        },
        'party_b': {
            'favorability': 'favorable' if party_b_score > 0 else ('unfavorable' if party_b_score < 0 else 'neutral'),
            'score': party_b_score,
            'risk_level': risk_level
        },
        'neutral': {
            'favorability': 'neutral',
            'score': 0,
            'risk_level': risk_level
        }
    }

    return perspectives

print("✓ Risk analysis functions initialized")
print(f"✓ Tracking {len(CLAUSE_PATTERNS)} clause types")
print(f"✓ Risk categories: {set(RISK_CATEGORIES.values())}")

MULTI-PERSPECTIVE RISK ANALYSIS SETUP
✓ Loaded tokenizer: nlpaueb/legal-bert-base-uncased
✓ Risk analysis functions initialized
✓ Tracking 10 clause types
✓ Risk categories: {'high', 'low', 'medium'}


## Perform Multi-Perspective Risk Analysis

In [33]:
print("="*80)
print("PERFORMING MULTI-PERSPECTIVE RISK ANALYSIS")
print("="*80)

def analyze_contract_comprehensive(contract_text, contract_id):
    segments = preprocessor.segment_contract(contract_text, max_length=256)

    clause_analyses = []
    risk_summary = {'high': 0, 'medium': 0, 'low': 0}
    perspective_summary = {
        'party_a': {'favorable': 0, 'unfavorable': 0, 'neutral': 0},
        'party_b': {'favorable': 0, 'unfavorable': 0, 'neutral': 0}
    }
    clause_types_found = Counter()

    for idx, segment in enumerate(segments[:30]):
        if len(segment.split()) < 10:
            continue

        clause_type, risk_level = classify_clause_type(segment)
        perspectives = analyze_perspective(segment, clause_type, risk_level)

        clause_analyses.append({
            'segment_id': idx,
            'text': segment[:200],
            'clause_type': clause_type,
            'risk_level': risk_level,
            'party_a_favorability': perspectives['party_a']['favorability'],
            'party_a_score': perspectives['party_a']['score'],
            'party_b_favorability': perspectives['party_b']['favorability'],
            'party_b_score': perspectives['party_b']['score']
        })

        risk_summary[risk_level] += 1
        clause_types_found[clause_type] += 1
        perspective_summary['party_a'][perspectives['party_a']['favorability']] += 1
        perspective_summary['party_b'][perspectives['party_b']['favorability']] += 1

    return {
        'contract_id': contract_id,
        'clause_analyses': clause_analyses,
        'risk_summary': risk_summary,
        'perspective_summary': perspective_summary,
        'clause_types': dict(clause_types_found),
        'total_segments_analyzed': len(clause_analyses)
    }

print("Analyzing contracts from multiple perspectives...")
comprehensive_analyses = []

analysis_size = min(20, len(contracts_df))
for idx in tqdm(range(analysis_size), desc="Multi-Perspective Analysis"):
    contract_text = contracts_df['text'].iloc[idx]
    contract_id = contracts_df['contract_id'].iloc[idx]

    analysis = analyze_contract_comprehensive(contract_text, contract_id)
    comprehensive_analyses.append(analysis)

print(f"\n✓ Analyzed {len(comprehensive_analyses)} contracts")

risk_totals = {'high': 0, 'medium': 0, 'low': 0}
party_a_totals = {'favorable': 0, 'unfavorable': 0, 'neutral': 0}
party_b_totals = {'favorable': 0, 'unfavorable': 0, 'neutral': 0}
all_clause_types = Counter()

for analysis in comprehensive_analyses:
    for risk, count in analysis['risk_summary'].items():
        risk_totals[risk] += count
    for favor, count in analysis['perspective_summary']['party_a'].items():
        party_a_totals[favor] += count
    for favor, count in analysis['perspective_summary']['party_b'].items():
        party_b_totals[favor] += count
    for clause_type, count in analysis['clause_types'].items():
        all_clause_types[clause_type] += count

print("\n" + "="*80)
print("OVERALL RISK ANALYSIS RESULTS")
print("="*80)
print(f"\nTotal Clauses Analyzed: {sum(risk_totals.values())}")
print(f"\nRisk Distribution:")
print(f"  High Risk:   {risk_totals['high']:4d} ({risk_totals['high']/sum(risk_totals.values())*100:5.1f}%)")
print(f"  Medium Risk: {risk_totals['medium']:4d} ({risk_totals['medium']/sum(risk_totals.values())*100:5.1f}%)")
print(f"  Low Risk:    {risk_totals['low']:4d} ({risk_totals['low']/sum(risk_totals.values())*100:5.1f}%)")

print(f"\nParty A Perspective:")
print(f"  Favorable:   {party_a_totals['favorable']:4d} ({party_a_totals['favorable']/sum(party_a_totals.values())*100:5.1f}%)")
print(f"  Unfavorable: {party_a_totals['unfavorable']:4d} ({party_a_totals['unfavorable']/sum(party_a_totals.values())*100:5.1f}%)")
print(f"  Neutral:     {party_a_totals['neutral']:4d} ({party_a_totals['neutral']/sum(party_a_totals.values())*100:5.1f}%)")

print(f"\nParty B Perspective:")
print(f"  Favorable:   {party_b_totals['favorable']:4d} ({party_b_totals['favorable']/sum(party_b_totals.values())*100:5.1f}%)")
print(f"  Unfavorable: {party_b_totals['unfavorable']:4d} ({party_b_totals['unfavorable']/sum(party_b_totals.values())*100:5.1f}%)")
print(f"  Neutral:     {party_b_totals['neutral']:4d} ({party_b_totals['neutral']/sum(party_b_totals.values())*100:5.1f}%)")

print(f"\nTop 10 Clause Types:")
for clause_type, count in all_clause_types.most_common(10):
    print(f"  {clause_type:20s}: {count:4d}")

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Risk Level Distribution', 'Party A Perspective',
                    'Party B Perspective', 'Top Clause Types'),
    specs=[[{"type": "pie"}, {"type": "pie"}],
           [{"type": "pie"}, {"type": "bar"}]]
)

fig.add_trace(
    go.Pie(labels=list(risk_totals.keys()), values=list(risk_totals.values()),
           marker=dict(colors=['#FF6B6B', '#FFA500', '#90EE90'])),
    row=1, col=1
)

fig.add_trace(
    go.Pie(labels=list(party_a_totals.keys()), values=list(party_a_totals.values()),
           marker=dict(colors=['#4ECDC4', '#FF6B6B', '#C0C0C0'])),
    row=1, col=2
)

fig.add_trace(
    go.Pie(labels=list(party_b_totals.keys()), values=list(party_b_totals.values()),
           marker=dict(colors=['#95E1D3', '#F38181', '#D3D3D3'])),
    row=2, col=1
)

top_clauses = all_clause_types.most_common(10)
fig.add_trace(
    go.Bar(x=[c[0] for c in top_clauses], y=[c[1] for c in top_clauses],
           marker_color='steelblue'),
    row=2, col=2
)

fig.update_layout(height=900, showlegend=True, title_text="Multi-Perspective Risk Analysis Results")
fig.update_xaxes(tickangle=-45, row=2, col=2)
fig.show()

PERFORMING MULTI-PERSPECTIVE RISK ANALYSIS
Analyzing contracts from multiple perspectives...


Multi-Perspective Analysis:   0%|          | 0/20 [00:00<?, ?it/s]


✓ Analyzed 20 contracts

OVERALL RISK ANALYSIS RESULTS

Total Clauses Analyzed: 382

Risk Distribution:
  High Risk:    217 ( 56.8%)
  Medium Risk:  120 ( 31.4%)
  Low Risk:      45 ( 11.8%)

Party A Perspective:
  Favorable:     94 ( 24.6%)
  Unfavorable:  139 ( 36.4%)
  Neutral:      149 ( 39.0%)

Party B Perspective:
  Favorable:     77 ( 20.2%)
  Unfavorable:   97 ( 25.4%)
  Neutral:      208 ( 54.5%)

Top 10 Clause Types:
  termination         :  144
  confidentiality     :   62
  liability           :   62
  payment             :   53
  general             :   22
  indemnification     :   11
  intellectual_property:   10
  warranty            :    8
  governing_law       :    5
  dispute             :    5


## Detailed Contract Report

In [34]:
def generate_contract_report(analysis):
    contract_id = analysis['contract_id']

    print("\n" + "="*80)
    print(f"CONTRACT ANALYSIS REPORT: {contract_id}")
    print("="*80)

    print(f"\n📊 SUMMARY STATISTICS")
    print(f"  Total Segments Analyzed: {analysis['total_segments_analyzed']}")

    print(f"\n⚠️  RISK ASSESSMENT")
    for risk_level, count in analysis['risk_summary'].items():
        print(f"  {risk_level.upper():10s}: {count:3d} clauses")

    print(f"\n👤 PARTY A PERSPECTIVE")
    for favorability, count in analysis['perspective_summary']['party_a'].items():
        print(f"  {favorability.capitalize():12s}: {count:3d} clauses")

    print(f"\n👤 PARTY B PERSPECTIVE")
    for favorability, count in analysis['perspective_summary']['party_b'].items():
        print(f"  {favorability.capitalize():12s}: {count:3d} clauses")

    print(f"\n📋 CLAUSE TYPE BREAKDOWN")
    sorted_clauses = sorted(analysis['clause_types'].items(), key=lambda x: x[1], reverse=True)
    for clause_type, count in sorted_clauses[:10]:
        print(f"  {clause_type:25s}: {count:3d}")

    print(f"\n🔍 HIGH-RISK CLAUSES (Sample)")
    print("-"*80)
    high_risk_clauses = [c for c in analysis['clause_analyses'] if c['risk_level'] == 'high']
    for i, clause in enumerate(high_risk_clauses[:3], 1):
        print(f"\n  [{i}] {clause['clause_type'].upper()}")
        print(f"      Party A: {clause['party_a_favorability']} (score: {clause['party_a_score']:+d})")
        print(f"      Party B: {clause['party_b_favorability']} (score: {clause['party_b_score']:+d})")
        print(f"      Text: {clause['text']}...")

    print("\n" + "="*80)

for i in range(min(3, len(comprehensive_analyses))):
    generate_contract_report(comprehensive_analyses[i])


CONTRACT ANALYSIS REPORT: N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT

📊 SUMMARY STATISTICS
  Total Segments Analyzed: 25

⚠️  RISK ASSESSMENT
  HIGH      :  11 clauses
  MEDIUM    :  12 clauses
  LOW       :   2 clauses

👤 PARTY A PERSPECTIVE
  Favorable   :  11 clauses
  Unfavorable :   4 clauses
  Neutral     :  10 clauses

👤 PARTY B PERSPECTIVE
  Favorable   :   4 clauses
  Unfavorable :   9 clauses
  Neutral     :  12 clauses

📋 CLAUSE TYPE BREAKDOWN
  confidentiality          :  11
  termination              :   7
  liability                :   4
  warranty                 :   1
  payment                  :   1
  governing_law            :   1

🔍 HIGH-RISK CLAUSES (Sample)
--------------------------------------------------------------------------------

  [1] TERMINATION
      Party A: neutral (score: +0)
      Party B: unfavorable (score: -1)
      Text: Notwithstanding the foregoing, Excite may make available                  opportunities on the Excite Site to purchase M

## Advanced Visualizations

In [35]:
print("="*80)
print("ADVANCED VISUALIZATIONS")
print("="*80)

all_clauses_flat = []
for analysis in comprehensive_analyses:
    for clause in analysis['clause_analyses']:
        all_clauses_flat.append({
            'contract_id': analysis['contract_id'],
            'clause_type': clause['clause_type'],
            'risk_level': clause['risk_level'],
            'party_a_favorability': clause['party_a_favorability'],
            'party_b_favorability': clause['party_b_favorability'],
            'party_a_score': clause['party_a_score'],
            'party_b_score': clause['party_b_score']
        })

clauses_df = pd.DataFrame(all_clauses_flat)

print(f"Created analysis dataframe with {len(clauses_df)} clauses")

risk_clause_matrix = pd.crosstab(clauses_df['clause_type'], clauses_df['risk_level'])

fig = go.Figure(data=go.Heatmap(
    z=risk_clause_matrix.values,
    x=risk_clause_matrix.columns,
    y=risk_clause_matrix.index,
    colorscale='RdYlGn_r',
    text=risk_clause_matrix.values,
    texttemplate='%{text}',
    textfont={"size": 12},
    colorbar=dict(title="Count")
))

fig.update_layout(
    title='Risk Level by Clause Type Heatmap',
    xaxis_title='Risk Level',
    yaxis_title='Clause Type',
    height=600,
    width=900
)
fig.show()

party_a_clause = pd.crosstab(clauses_df['clause_type'], clauses_df['party_a_favorability'])
party_b_clause = pd.crosstab(clauses_df['clause_type'], clauses_df['party_b_favorability'])

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Party A Favorability by Clause Type', 'Party B Favorability by Clause Type'),
    specs=[[{"type": "bar"}, {"type": "bar"}]]
)

for col in party_a_clause.columns:
    fig.add_trace(
        go.Bar(name=col, x=party_a_clause.index, y=party_a_clause[col]),
        row=1, col=1
    )

for col in party_b_clause.columns:
    fig.add_trace(
        go.Bar(name=col, x=party_b_clause.index, y=party_b_clause[col], showlegend=False),
        row=1, col=2
    )

fig.update_layout(
    height=600,
    barmode='stack',
    title_text="Perspective Analysis by Clause Type"
)
fig.update_xaxes(tickangle=-45)
fig.show()

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=clauses_df['party_a_score'],
    y=clauses_df['party_b_score'],
    mode='markers',
    marker=dict(
        size=8,
        color=clauses_df['risk_level'].map({'high': 2, 'medium': 1, 'low': 0}),
        colorscale='Reds',
        showscale=True,
        colorbar=dict(
            title="Risk",
            tickvals=[0, 1, 2],
            ticktext=['Low', 'Medium', 'High']
        )
    ),
    text=clauses_df['clause_type'],
    hovertemplate='<b>%{text}</b><br>Party A Score: %{x}<br>Party B Score: %{y}<extra></extra>'
))

fig.update_layout(
    title='Party A vs Party B Favorability Scores',
    xaxis_title='Party A Score',
    yaxis_title='Party B Score',
    height=600,
    width=800
)
fig.show()

risk_by_contract = []
for analysis in comprehensive_analyses:
    total = sum(analysis['risk_summary'].values())
    if total > 0:
        risk_by_contract.append({
            'contract_id': analysis['contract_id'][:20],
            'high_pct': (analysis['risk_summary']['high'] / total) * 100,
            'medium_pct': (analysis['risk_summary']['medium'] / total) * 100,
            'low_pct': (analysis['risk_summary']['low'] / total) * 100
        })

risk_contract_df = pd.DataFrame(risk_by_contract)

fig = go.Figure()

fig.add_trace(go.Bar(
    name='High Risk',
    x=risk_contract_df['contract_id'],
    y=risk_contract_df['high_pct'],
    marker_color='#FF6B6B'
))

fig.add_trace(go.Bar(
    name='Medium Risk',
    x=risk_contract_df['contract_id'],
    y=risk_contract_df['medium_pct'],
    marker_color='#FFA500'
))

fig.add_trace(go.Bar(
    name='Low Risk',
    x=risk_contract_df['contract_id'],
    y=risk_contract_df['low_pct'],
    marker_color='#90EE90'
))

fig.update_layout(
    barmode='stack',
    title='Risk Distribution by Contract',
    xaxis_title='Contract ID',
    yaxis_title='Percentage (%)',
    height=600,
    xaxis_tickangle=-45
)
fig.show()

print("\n✓ Advanced visualizations complete")

ADVANCED VISUALIZATIONS
Created analysis dataframe with 382 clauses



✓ Advanced visualizations complete


## Export Results & Summary

In [36]:
print("="*80)
print("EXPORTING RESULTS")
print("="*80)

clauses_df.to_csv('clause_analysis_results.csv', index=False)
print("✓ Saved: clause_analysis_results.csv")

entities_df.to_csv('ner_extraction_results.csv', index=False)
print("✓ Saved: ner_extraction_results.csv")

contracts_summary = contracts_df[['contract_id', 'word_count', 'num_segments']].copy()
contracts_summary.to_csv('contracts_summary.csv', index=False)
print("✓ Saved: contracts_summary.csv")

summary_stats = {
    'total_contracts_analyzed': len(comprehensive_analyses),
    'total_clauses_analyzed': len(clauses_df),
    'unique_clause_types': clauses_df['clause_type'].nunique(),
    'high_risk_clauses': len(clauses_df[clauses_df['risk_level'] == 'high']),
    'medium_risk_clauses': len(clauses_df[clauses_df['risk_level'] == 'medium']),
    'low_risk_clauses': len(clauses_df[clauses_df['risk_level'] == 'low']),
    'party_a_favorable': len(clauses_df[clauses_df['party_a_favorability'] == 'favorable']),
    'party_a_unfavorable': len(clauses_df[clauses_df['party_a_favorability'] == 'unfavorable']),
    'party_b_favorable': len(clauses_df[clauses_df['party_b_favorability'] == 'favorable']),
    'party_b_unfavorable': len(clauses_df[clauses_df['party_b_favorability'] == 'unfavorable']),
    'avg_entities_per_contract': entities_df[['num_persons', 'num_orgs', 'num_locations']].sum(axis=1).mean()
}

with open('analysis_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)
print("✓ Saved: analysis_summary.json")

print("\n" + "="*80)
print("FINAL SUMMARY STATISTICS")
print("="*80)

for key, value in summary_stats.items():
    if isinstance(value, float):
        print(f"{key.replace('_', ' ').title():.<50} {value:.2f}")
    else:
        print(f"{key.replace('_', ' ').title():.<50} {value}")

fig = go.Figure()

categories = ['High Risk', 'Medium Risk', 'Low Risk',
              'Party A Fav', 'Party A Unfav',
              'Party B Fav', 'Party B Unfav']

values = [
    summary_stats['high_risk_clauses'],
    summary_stats['medium_risk_clauses'],
    summary_stats['low_risk_clauses'],
    summary_stats['party_a_favorable'],
    summary_stats['party_a_unfavorable'],
    summary_stats['party_b_favorable'],
    summary_stats['party_b_unfavorable']
]

colors = ['#FF6B6B', '#FFA500', '#90EE90', '#4ECDC4', '#FF6B6B', '#95E1D3', '#F38181']

fig.add_trace(go.Bar(
    x=categories,
    y=values,
    marker_color=colors,
    text=values,
    textposition='auto'
))

fig.update_layout(
    title='Complete Analysis Summary',
    xaxis_title='Category',
    yaxis_title='Count',
    height=500,
    showlegend=False
)
fig.show()

print("\n" + "="*80)
print("✅ ANALYSIS COMPLETE")
print("="*80)

print("""
Files Generated:
  📄 clause_analysis_results.csv - Detailed clause-level analysis
  📄 ner_extraction_results.csv - Named entity recognition results
  📄 contracts_summary.csv - Contract metadata summary
  📄 analysis_summary.json - Overall statistics
""")

EXPORTING RESULTS
✓ Saved: clause_analysis_results.csv
✓ Saved: ner_extraction_results.csv
✓ Saved: contracts_summary.csv
✓ Saved: analysis_summary.json

FINAL SUMMARY STATISTICS
Total Contracts Analyzed.......................... 20
Total Clauses Analyzed............................ 382
Unique Clause Types............................... 10
High Risk Clauses................................. 217
Medium Risk Clauses............................... 120
Low Risk Clauses.................................. 45
Party A Favorable................................. 94
Party A Unfavorable............................... 139
Party B Favorable................................. 77
Party B Unfavorable............................... 97
Avg Entities Per Contract......................... 45.30



✅ ANALYSIS COMPLETE

Files Generated:
  📄 clause_analysis_results.csv - Detailed clause-level analysis
  📄 ner_extraction_results.csv - Named entity recognition results
  📄 contracts_summary.csv - Contract metadata summary
  📄 analysis_summary.json - Overall statistics

Key Achievements:
  ✓ Zero-shot NER extraction completed
  ✓ Multi-perspective risk analysis performed
  ✓ Clause type classification completed
  ✓ Party A and Party B favorability assessed
  ✓ Comprehensive visualizations generated



## Interactive Contract Explorer

In [37]:
print("="*80)
print("INTERACTIVE CONTRACT EXPLORER")
print("="*80)

def explore_contract_interactive(contract_index):
    if contract_index >= len(comprehensive_analyses):
        print(f"Invalid index. Max index: {len(comprehensive_analyses)-1}")
        return

    analysis = comprehensive_analyses[contract_index]
    contract_id = analysis['contract_id']

    print(f"\n{'='*80}")
    print(f"EXPLORING CONTRACT: {contract_id}")
    print(f"{'='*80}")

    print(f"\n📊 Quick Stats:")
    print(f"  Total Clauses: {analysis['total_segments_analyzed']}")
    print(f"  High Risk: {analysis['risk_summary']['high']}")
    print(f"  Medium Risk: {analysis['risk_summary']['medium']}")
    print(f"  Low Risk: {analysis['risk_summary']['low']}")

    clauses = analysis['clause_analyses']

    high_risk = [c for c in clauses if c['risk_level'] == 'high']
    party_a_unfav = [c for c in clauses if c['party_a_favorability'] == 'unfavorable']
    party_b_unfav = [c for c in clauses if c['party_b_favorability'] == 'unfavorable']

    print(f"\n⚠️  HIGH RISK CLAUSES ({len(high_risk)}):")
    for i, clause in enumerate(high_risk[:5], 1):
        print(f"\n  [{i}] Type: {clause['clause_type']}")
        print(f"      Text: {clause['text'][:150]}...")

    print(f"\n🔴 PARTY A UNFAVORABLE CLAUSES ({len(party_a_unfav)}):")
    for i, clause in enumerate(party_a_unfav[:3], 1):
        print(f"\n  [{i}] Type: {clause['clause_type']} | Risk: {clause['risk_level']}")
        print(f"      Text: {clause['text'][:150]}...")

    print(f"\n🔴 PARTY B UNFAVORABLE CLAUSES ({len(party_b_unfav)}):")
    for i, clause in enumerate(party_b_unfav[:3], 1):
        print(f"\n  [{i}] Type: {clause['clause_type']} | Risk: {clause['risk_level']}")
        print(f"      Text: {clause['text'][:150]}...")

    clause_data = pd.DataFrame(clauses)

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Risk Distribution',
            'Party A Favorability',
            'Party B Favorability',
            'Clause Types'
        ),
        specs=[
            [{"type": "pie"}, {"type": "pie"}],
            [{"type": "pie"}, {"type": "bar"}]
        ]
    )

    risk_counts = clause_data['risk_level'].value_counts()
    fig.add_trace(
        go.Pie(labels=risk_counts.index, values=risk_counts.values,
               marker=dict(colors=['#FF6B6B', '#FFA500', '#90EE90'])),
        row=1, col=1
    )

    party_a_counts = clause_data['party_a_favorability'].value_counts()
    fig.add_trace(
        go.Pie(labels=party_a_counts.index, values=party_a_counts.values),
        row=1, col=2
    )

    party_b_counts = clause_data['party_b_favorability'].value_counts()
    fig.add_trace(
        go.Pie(labels=party_b_counts.index, values=party_b_counts.values),
        row=2, col=1
    )

    clause_type_counts = clause_data['clause_type'].value_counts().head(10)
    fig.add_trace(
        go.Bar(x=clause_type_counts.index, y=clause_type_counts.values),
        row=2, col=2
    )

    fig.update_layout(
        height=800,
        showlegend=True,
        title_text=f"Contract Analysis: {contract_id}"
    )
    fig.update_xaxes(tickangle=-45, row=2, col=2)
    fig.show()

for i in range(min(2, len(comprehensive_analyses))):
    explore_contract_interactive(i)

print("\n✓ Interactive exploration complete")

INTERACTIVE CONTRACT EXPLORER

EXPLORING CONTRACT: N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT

📊 Quick Stats:
  Total Clauses: 25
  High Risk: 11
  Medium Risk: 12
  Low Risk: 2

⚠️  HIGH RISK CLAUSES (11):

  [1] Type: termination
      Text: Notwithstanding the foregoing, Excite may make available                  opportunities on the Excite Site to purchase Music Products                 ...

  [2] Type: termination
      Text: b)       Sponsor will pay Excite [****] per year as compensation for                   being the exclusive online retail music store sponsor of the   ...

  [3] Type: termination
      Text: This                            three-month Total Revenue amount will be compared to                            an amount equal to two (2) times the c...

  [4] Type: termination
      Text: The fees charged by such Certified Public Accountant in                   connection with the inspection will be paid by Excite unless                ...

  [5] Type: liability



EXPLORING CONTRACT: PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELLECTUAL PROPERTY AGREEMENT

📊 Quick Stats:
  Total Clauses: 16
  High Risk: 5
  Medium Risk: 9
  Low Risk: 2

⚠️  HIGH RISK CLAUSES (5):

  [1] Type: liability
      Text: Exhibit 10.2   INTELLECTUAL PROPERTY AGREEMENT   This Intellectual Property Agreement (this "Agreement") is entered into on May 12, 2020 ("Effective D...

  [2] Type: termination
      Text: Within thirty (30) days after the end of each quarterly period ending on March                 31st, June 30th, September 30th, or December 31st, comm...

  [3] Type: termination
      Text: The notice will identify (i) the Application or Patent, (ii) the country, (iii) the reason for the IP Cost, and (iv) the Due Date for payment. THI sha...

  [4] Type: termination
      Text: If THI does not make the obligatory payments as stated in 3(a) by the dates stated, the Exclusive License will revert back to Premier, provided, howev...

  [5] Type: liability
      Text: 


✓ Interactive exploration complete


## Comparative Analysis

In [38]:
print("="*80)
print("COMPARATIVE ANALYSIS ACROSS CONTRACTS")
print("="*80)

comparison_data = []

for analysis in comprehensive_analyses:
    total_clauses = analysis['total_segments_analyzed']
    if total_clauses == 0:
        continue

    comparison_data.append({
        'contract_id': analysis['contract_id'][:15],
        'total_clauses': total_clauses,
        'high_risk_pct': (analysis['risk_summary']['high'] / total_clauses) * 100,
        'medium_risk_pct': (analysis['risk_summary']['medium'] / total_clauses) * 100,
        'low_risk_pct': (analysis['risk_summary']['low'] / total_clauses) * 100,
        'party_a_favorable_pct': (analysis['perspective_summary']['party_a']['favorable'] / total_clauses) * 100,
        'party_a_unfavorable_pct': (analysis['perspective_summary']['party_a']['unfavorable'] / total_clauses) * 100,
        'party_b_favorable_pct': (analysis['perspective_summary']['party_b']['favorable'] / total_clauses) * 100,
        'party_b_unfavorable_pct': (analysis['perspective_summary']['party_b']['unfavorable'] / total_clauses) * 100,
        'risk_score': (analysis['risk_summary']['high'] * 3 + analysis['risk_summary']['medium'] * 2 + analysis['risk_summary']['low'] * 1) / total_clauses
    })

comparison_df = pd.DataFrame(comparison_data)

print(f"\nContracts Compared: {len(comparison_df)}")
print(f"\nRisk Score Statistics:")
print(f"  Mean: {comparison_df['risk_score'].mean():.2f}")
print(f"  Median: {comparison_df['risk_score'].median():.2f}")
print(f"  Std Dev: {comparison_df['risk_score'].std():.2f}")
print(f"  Min: {comparison_df['risk_score'].min():.2f}")
print(f"  Max: {comparison_df['risk_score'].max():.2f}")

print(f"\nTop 5 Highest Risk Contracts:")
top_risk = comparison_df.nlargest(5, 'risk_score')
for idx, row in top_risk.iterrows():
    print(f"  {row['contract_id']:20s} | Risk Score: {row['risk_score']:.2f} | High Risk: {row['high_risk_pct']:.1f}%")

print(f"\nTop 5 Lowest Risk Contracts:")
low_risk = comparison_df.nsmallest(5, 'risk_score')
for idx, row in low_risk.iterrows():
    print(f"  {row['contract_id']:20s} | Risk Score: {row['risk_score']:.2f} | High Risk: {row['high_risk_pct']:.1f}%")

fig = px.scatter(
    comparison_df,
    x='party_a_favorable_pct',
    y='party_b_favorable_pct',
    size='total_clauses',
    color='risk_score',
    hover_data=['contract_id', 'high_risk_pct'],
    title='Party A vs Party B Favorability (Contract Level)',
    labels={
        'party_a_favorable_pct': 'Party A Favorable (%)',
        'party_b_favorable_pct': 'Party B Favorable (%)',
        'risk_score': 'Risk Score'
    },
    color_continuous_scale='Reds'
)
fig.update_layout(height=600, width=900)
fig.show()

fig = go.Figure()

fig.add_trace(go.Box(
    y=comparison_df['high_risk_pct'],
    name='High Risk %',
    marker_color='#FF6B6B'
))

fig.add_trace(go.Box(
    y=comparison_df['medium_risk_pct'],
    name='Medium Risk %',
    marker_color='#FFA500'
))

fig.add_trace(go.Box(
    y=comparison_df['low_risk_pct'],
    name='Low Risk %',
    marker_color='#90EE90'
))

fig.update_layout(
    title='Risk Distribution Across All Contracts',
    yaxis_title='Percentage (%)',
    height=500
)
fig.show()

fig = go.Figure()

sorted_comp = comparison_df.sort_values('risk_score', ascending=False)

fig.add_trace(go.Bar(
    x=sorted_comp['contract_id'],
    y=sorted_comp['risk_score'],
    marker=dict(
        color=sorted_comp['risk_score'],
        colorscale='Reds',
        showscale=True,
        colorbar=dict(title="Risk Score")
    ),
    text=sorted_comp['risk_score'].round(2),
    textposition='auto'
))

fig.update_layout(
    title='Risk Score Ranking Across Contracts',
    xaxis_title='Contract ID',
    yaxis_title='Risk Score',
    height=600,
    xaxis_tickangle=-45
)
fig.show()

comparison_df.to_csv('contract_comparison.csv', index=False)
print("\n✓ Saved: contract_comparison.csv")
print("✓ Comparative analysis complete")

COMPARATIVE ANALYSIS ACROSS CONTRACTS

Contracts Compared: 20

Risk Score Statistics:
  Mean: 2.45
  Median: 2.40
  Std Dev: 0.19
  Min: 2.19
  Max: 2.80

Top 5 Highest Risk Contracts:
  IpassInc_201812      | Risk Score: 2.80 | High Risk: 83.3%
  ScansourceInc_2      | Risk Score: 2.78 | High Risk: 77.8%
  GAINSCOINC_01_2      | Risk Score: 2.75 | High Risk: 83.3%
  TodosMedicalLtd      | Risk Score: 2.68 | High Risk: 68.2%
  MANAKOASERVICES      | Risk Score: 2.55 | High Risk: 63.6%

Top 5 Lowest Risk Contracts:
  PREMIERBIOMEDIC      | Risk Score: 2.19 | High Risk: 31.2%
  HertzGroupRealt      | Risk Score: 2.20 | High Risk: 60.0%
  EbixInc_2001051      | Risk Score: 2.26 | High Risk: 40.7%
  IGENEBIOTECHNOL      | Risk Score: 2.27 | High Risk: 40.0%
  IntegrityFunds_      | Risk Score: 2.32 | High Risk: 57.9%



✓ Saved: contract_comparison.csv
✓ Comparative analysis complete


## Final Dashboard & Risk Management Recommendations

In [47]:
print("="*80)
print("FINAL COMPREHENSIVE DASHBOARD")
print("="*80)

fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=(
        'Overall Risk Distribution',
        'Party A Perspective',
        'Party B Perspective',
        'Clause Type Frequency',
        'Risk by Clause Type',
        'Entity Extraction Stats',
        'Risk Score Distribution',
        'Favorability Comparison',
        'Contract Analysis Coverage'
    ),
    specs=[
        [{"type": "pie"}, {"type": "pie"}, {"type": "pie"}],
        [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
        [{"type": "histogram"}, {"type": "scatter"}, {"type": "bar"}]
    ]
)

fig.add_trace(
    go.Pie(labels=list(risk_totals.keys()), values=list(risk_totals.values()),
           marker=dict(colors=['#FF6B6B', '#FFA500', '#90EE90'])),
    row=1, col=1
)

fig.add_trace(
    go.Pie(labels=list(party_a_totals.keys()), values=list(party_a_totals.values())),
    row=1, col=2
)

fig.add_trace(
    go.Pie(labels=list(party_b_totals.keys()), values=list(party_b_totals.values())),
    row=1, col=3
)

top_10_clauses = all_clause_types.most_common(10)
fig.add_trace(
    go.Bar(x=[c[0] for c in top_10_clauses], y=[c[1] for c in top_10_clauses]),
    row=2, col=1
)

fig.add_trace(
    go.Bar(x=risk_clause_matrix.index, y=risk_clause_matrix['high'], name='High'),
    row=2, col=2
)

entity_types = ['Persons', 'Orgs', 'Locations', 'Dates', 'Money']
entity_avgs = [
    entities_df['num_persons'].mean(),
    entities_df['num_orgs'].mean(),
    entities_df['num_locations'].mean(),
    entities_df['num_dates'].mean(),
    entities_df['num_money'].mean()
]
fig.add_trace(
    go.Bar(x=entity_types, y=entity_avgs),
    row=2, col=3
)

fig.add_trace(
    go.Histogram(x=comparison_df['risk_score'], nbinsx=20),
    row=3, col=1
)

fig.add_trace(
    go.Scatter(
        x=comparison_df['party_a_favorable_pct'],
        y=comparison_df['party_b_favorable_pct'],
        mode='markers',
        marker=dict(size=8, color=comparison_df['risk_score'], colorscale='Reds')
    ),
    row=3, col=2
)

analysis_stats = [len(comprehensive_analyses), len(clauses_df), clauses_df['clause_type'].nunique()]
fig.add_trace(
    go.Bar(x=['Contracts', 'Clauses', 'Clause Types'], y=analysis_stats),
    row=3, col=3
)

fig.update_layout(
    height=1400,
    showlegend=False,
    title_text="Complete Legal Contract Analysis Dashboard"
)
fig.update_xaxes(tickangle=-45)
fig.show()

print("\n" + "="*80)
print("RECOMMENDATIONS & INSIGHTS")
print("="*80)

high_risk_pct = (risk_totals['high'] / sum(risk_totals.values())) * 100
party_a_unfav_pct = (party_a_totals['unfavorable'] / sum(party_a_totals.values())) * 100
party_b_unfav_pct = (party_b_totals['unfavorable'] / sum(party_b_totals.values())) * 100

recommendations = []

if high_risk_pct > 30:
    recommendations.append({
        'priority': 'HIGH',
        'category': 'Risk Management',
        'finding': f'{high_risk_pct:.1f}% of clauses are high-risk',
        'action': 'Conduct thorough legal review before signing any contracts'
    })

if party_a_unfav_pct > 40:
    recommendations.append({
        'priority': 'MEDIUM',
        'category': 'Party A Position',
        'finding': f'{party_a_unfav_pct:.1f}% of clauses are unfavorable to Party A',
        'action': 'Negotiate better terms or add protective clauses for Party A'
    })

if party_b_unfav_pct > 40:
    recommendations.append({
        'priority': 'MEDIUM',
        'category': 'Party B Position',
        'finding': f'{party_b_unfav_pct:.1f}% of clauses are unfavorable to Party B',
        'action': 'Consider revising terms to balance obligations for Party B'
    })

termination_count = all_clause_types.get('termination', 0)
if termination_count > sum(all_clause_types.values()) * 0.15:
    recommendations.append({
        'priority': 'MEDIUM',
        'category': 'Termination Clauses',
        'finding': f'High frequency of termination clauses ({termination_count})',
        'action': 'Review termination conditions and notice periods carefully'
    })

liability_count = all_clause_types.get('liability', 0)
if liability_count > 20:
    recommendations.append({
        'priority': 'HIGH',
        'category': 'Liability',
        'finding': f'Significant liability clauses present ({liability_count})',
        'action': 'Assess liability limits and indemnification provisions'
    })

for i, rec in enumerate(recommendations, 1):
    print(f"\n[{rec['priority']}] Recommendation {i}: {rec['category']}")
    print(f"    Finding: {rec['finding']}")
    print(f"    Action: {rec['action']}")

if not recommendations:
    print("\n✓ No critical issues identified")
    print("✓ Contracts appear balanced with acceptable risk levels")

with open('recommendations.json', 'w') as f:
    json.dump(recommendations, f, indent=2)

print("\n✓ Saved: recommendations.json")

print("\n" + "="*80)
print("COMPLETE ANALYSIS FINISHED")
print("="*80)

print(f"""
Summary of Deliverables:
  ✅ {len(comprehensive_analyses)} contracts analyzed
  ✅ {len(clauses_df)} clauses classified
  ✅ {len(entities_df)} contracts processed for NER
  ✅ {clauses_df['clause_type'].nunique()} unique clause types identified
  ✅ {len(recommendations)} actionable recommendations generated

Files Available:
  📄 clause_analysis_results.csv
  📄 ner_extraction_results.csv
  📄 contracts_summary.csv
  📄 contract_comparison.csv
  📄 analysis_summary.json
  📄 recommendations.json

All visualizations displayed inline above.
""")

FINAL COMPREHENSIVE DASHBOARD



RECOMMENDATIONS & INSIGHTS

[HIGH] Recommendation 1: Risk Management
    Finding: 56.8% of clauses are high-risk
    Action: Conduct thorough legal review before signing any contracts

[MEDIUM] Recommendation 2: Termination Clauses
    Finding: High frequency of termination clauses (144)
    Action: Review termination conditions and notice periods carefully

[HIGH] Recommendation 3: Liability
    Finding: Significant liability clauses present (62)
    Action: Assess liability limits and indemnification provisions

✓ Saved: recommendations.json

COMPLETE ANALYSIS FINISHED

Summary of Deliverables:
  ✅ 20 contracts analyzed
  ✅ 382 clauses classified
  ✅ 50 contracts processed for NER
  ✅ 10 unique clause types identified
  ✅ 3 actionable recommendations generated

Files Available:
  📄 clause_analysis_results.csv
  📄 ner_extraction_results.csv
  📄 contracts_summary.csv
  📄 contract_comparison.csv
  📄 analysis_summary.json
  📄 recommendations.json

All visualizations displayed inline abo

## Visualizations for generated files:

In [41]:
# Preview and visualize clause_analysis_results.csv
try:
    print("📄 Preview of clause_analysis_results.csv:")
    clauses_df = pd.read_csv('clause_analysis_results.csv')
    display(clauses_df.head())

    print("\n📊 Visualization: Risk Level Distribution")
    risk_counts = clauses_df['risk_level'].value_counts()
    fig = px.pie(
        values=risk_counts.values,
        names=risk_counts.index,
        title='Risk Level Distribution',
        color_discrete_map={'high': '#FF6B6B', 'medium': '#FFA500', 'low': '#90EE90'}
    )
    fig.update_layout(height=300, width=400, margin=dict(l=20, r=20, t=40, b=20))
    fig.show()

except FileNotFoundError:
    print("clause_analysis_results.csv not found.")

📄 Preview of clause_analysis_results.csv:


Unnamed: 0,contract_id,clause_type,risk_level,party_a_favorability,party_b_favorability,party_a_score,party_b_score
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,confidentiality,medium,favorable,neutral,1,0
1,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,confidentiality,medium,neutral,neutral,0,0
2,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,termination,high,neutral,unfavorable,0,-1
3,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,confidentiality,medium,favorable,favorable,2,1
4,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,confidentiality,medium,unfavorable,neutral,-1,0



📊 Visualization: Risk Level Distribution


In [42]:
# Preview and visualize ner_extraction_results.csv
try:
    print("📄 Preview of ner_extraction_results.csv:")
    entities_df = pd.read_csv('ner_extraction_results.csv')
    display(entities_df.head())

    print("\n📊 Visualization: Average Entities per Contract (Sample)")
    entity_avgs = entities_df[['num_persons', 'num_orgs', 'num_locations', 'num_dates', 'num_money']].mean()
    fig = px.bar(
        x=entity_avgs.index,
        y=entity_avgs.values,
        title='Average Entities per Contract (Sample)',
        labels={'x': 'Entity Type', 'y': 'Average Count'}
    )
    fig.update_layout(height=300, width=400, margin=dict(l=20, r=20, t=40, b=20))
    fig.show()

except FileNotFoundError:
    print("ner_extraction_results.csv not found.")

📄 Preview of ner_extraction_results.csv:


Unnamed: 0,contract_id,entities,num_persons,num_orgs,num_locations,num_dates,num_money,num_other
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,"{'PERSON': [], 'ORG': [{'text': 'Commission', ...",0,39,9,0,0,16
1,PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL...,"{'PERSON': [], 'ORG': [{'text': 'Mar', 'score'...",0,18,5,0,0,35
2,IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1...,"{'PERSON': [], 'ORG': [{'text': 'Securities an...",0,62,8,0,0,18
3,PacificapEntertainmentHoldingsInc_20051115_8-K...,"{'PERSON': [], 'ORG': [{'text': '##CI', 'score...",0,2,10,0,0,8
4,"FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...","{'PERSON': [], 'ORG': [{'text': 'E', 'score': ...",0,25,5,0,0,10



📊 Visualization: Average Entities per Contract (Sample)


In [43]:
# Preview and visualize contracts_summary.csv
try:
    print("📄 Preview of contracts_summary.csv:")
    contracts_summary_df = pd.read_csv('contracts_summary.csv')
    display(contracts_summary_df.head())

    print("\n📊 Visualization: Contract Word Count Distribution (Sample)")
    fig = px.histogram(
        contracts_summary_df,
        x='word_count',
        nbins=30,
        title='Word Count Distribution (Sample)',
        labels={'word_count': 'Word Count'}
    )
    fig.update_layout(height=300, width=400, margin=dict(l=20, r=20, t=40, b=20))
    fig.show()

except FileNotFoundError:
    print("contracts_summary.csv not found.")

📄 Preview of contracts_summary.csv:


Unnamed: 0,contract_id,word_count,num_segments
0,N2KINC_10_16_1997-EX-10.16-SPONSORSHIP AGREEMENT,5558,25
1,PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL...,3596,16
2,IpassInc_20181203_8-K_EX-99.1_11445874_EX-99.1...,11394,49
3,PacificapEntertainmentHoldingsInc_20051115_8-K...,2500,12
4,"FTENETWORKS,INC_02_18_2016-EX-99.4-STRATEGIC A...",5761,26



📊 Visualization: Contract Word Count Distribution (Sample)


In [44]:
# Preview and visualize contract_comparison.csv
try:
    print("📄 Preview of contract_comparison.csv:")
    contract_comparison_df = pd.read_csv('contract_comparison.csv')
    display(contract_comparison_df.head())

    print("\n📊 Visualization: Risk Score Distribution Across Contracts (Sample)")
    fig = px.histogram(
        contract_comparison_df,
        x='risk_score',
        nbins=20,
        title='Risk Score Distribution (Sample)',
        labels={'risk_score': 'Risk Score'}
    )
    fig.update_layout(height=300, width=400, margin=dict(l=20, r=20, t=40, b=20))
    fig.show()

except FileNotFoundError:
    print("contract_comparison.csv not found.")

📄 Preview of contract_comparison.csv:


Unnamed: 0,contract_id,total_clauses,high_risk_pct,medium_risk_pct,low_risk_pct,party_a_favorable_pct,party_a_unfavorable_pct,party_b_favorable_pct,party_b_unfavorable_pct,risk_score
0,N2KINC_10_16_19,25,44.0,48.0,8.0,44.0,16.0,16.0,36.0,2.36
1,PREMIERBIOMEDIC,16,31.25,56.25,12.5,31.25,37.5,62.5,18.75,2.1875
2,IpassInc_201812,30,83.333333,13.333333,3.333333,30.0,36.666667,6.666667,23.333333,2.8
3,PacificapEntert,12,50.0,33.333333,16.666667,16.666667,25.0,16.666667,50.0,2.333333
4,"FTENETWORKS,INC",26,61.538462,26.923077,11.538462,11.538462,61.538462,34.615385,15.384615,2.5



📊 Visualization: Risk Score Distribution Across Contracts (Sample)


In [45]:
# Preview analysis_summary.json
try:
    print("📄 Preview of analysis_summary.json:")
    with open('analysis_summary.json', 'r') as f:
        analysis_summary_data = json.load(f)
    display(analysis_summary_data)

except FileNotFoundError:
    print("analysis_summary.json not found.")
except json.JSONDecodeError:
    print("Error decoding analysis_summary.json.")

📄 Preview of analysis_summary.json:


{'total_contracts_analyzed': 20,
 'total_clauses_analyzed': 382,
 'unique_clause_types': 10,
 'high_risk_clauses': 217,
 'medium_risk_clauses': 120,
 'low_risk_clauses': 45,
 'party_a_favorable': 94,
 'party_a_unfavorable': 139,
 'party_b_favorable': 77,
 'party_b_unfavorable': 97,
 'avg_entities_per_contract': 45.3}

In [46]:
# Preview recommendations.json
try:
    print("📄 Preview of recommendations.json:")
    with open('recommendations.json', 'r') as f:
        recommendations_data = json.load(f)
    display(recommendations_data)

except FileNotFoundError:
    print("recommendations.json not found.")
except json.JSONDecodeError:
    print("Error decoding recommendations.json.")

📄 Preview of recommendations.json:


[{'priority': 'HIGH',
  'category': 'Risk Management',
  'finding': '56.8% of clauses are high-risk',
  'action': 'Conduct thorough legal review before signing any contracts'},
 {'priority': 'MEDIUM',
  'category': 'Termination Clauses',
  'finding': 'High frequency of termination clauses (144)',
  'action': 'Review termination conditions and notice periods carefully'},
 {'priority': 'HIGH',
  'category': 'Liability',
  'finding': 'Significant liability clauses present (62)',
  'action': 'Assess liability limits and indemnification provisions'}]