In [None]:
import pandas as pd
import numpy as np
import random
import json
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm

# Make sure both NLTK tokenizer resources are available locally
try:
    # nltk.data.find raises LookupError for the first resource it cannot locate
    for resource_path in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
        nltk.data.find(resource_path)
except LookupError:
    # At least one resource is missing, so fetch both packages
    for package_name in ('punkt', 'punkt_tab'):
        nltk.download(package_name)


# Load the dataset
dataset_path = "/content/LegalRiskDataset.xlsx"
try:
    # Try reading as Excel file first
    df = pd.read_excel(dataset_path)
except Exception as excel_error:
    # Catch Exception instead of using a bare except so that KeyboardInterrupt
    # and SystemExit still propagate. Report the fallback because the CSV
    # reader below silently drops malformed lines.
    print(f'Could not read dataset as Excel ({excel_error}); retrying as CSV')
    df = pd.read_csv(dataset_path, on_bad_lines='skip')

print(f'Original dataset size: {df.shape[0]} rows')

# Convert string representations of dictionaries to actual dictionaries if needed
def convert_risk_level(risk_str):
    """Convert a stringified risk mapping into a Python object.

    Non-string inputs are returned unchanged. Strings are parsed as JSON,
    first verbatim and then with single quotes swapped for double quotes
    (the form produced by ``str(dict)``). If both attempts fail, the text is
    split by hand into ``key: value`` pairs.

    Args:
        risk_str: A value from the ``RiskLevel`` column; either a string
            representation of a dictionary or an already-parsed object.

    Returns:
        The parsed JSON value (normally a dict), a dict assembled by the
        manual fallback (possibly empty), or ``risk_str`` itself when it is
        not a string.
    """
    if not isinstance(risk_str, str):
        return risk_str  # Already a dictionary or other format

    # Parse the raw text before the quote-swapped variant so that apostrophes
    # inside otherwise valid JSON strings are not turned into stray quotes.
    for candidate in (risk_str, risk_str.replace("'", "\"")):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    # Manual fallback for text that looks like a dict but is not valid JSON
    risk_dict = {}
    for pair in risk_str.strip().strip('{}').split(', '):
        # partition splits on the first ': ' only, so a value that itself
        # contains ': ' no longer breaks the unpacking.
        key, separator, value = pair.partition(': ')
        if not separator:
            continue
        risk_dict[key.strip().strip("'\"")] = value.strip().strip("'\"")
    return risk_dict

df['RiskLevel'] = df['RiskLevel'].apply(convert_risk_level)

# Extract and analyze the original data
paragraphs = df['Paragraph'].tolist()
risk_levels = df['RiskLevel'].tolist()

# Create word bank for each risk level
low_risk_words = set()
medium_risk_words = set()
high_risk_words = set()

# Lookup from lower-cased level label to the bank that collects its words
risk_word_banks = {
    'low': low_risk_words,
    'medium': medium_risk_words,
    'high': high_risk_words,
}

for risk_dict in risk_levels:
    # Rows whose annotation did not parse into a dict carry no usable words
    if not isinstance(risk_dict, dict):
        continue
    for word, level in risk_dict.items():
        # Parsed annotations may hold non-string entries (e.g. JSON numbers
        # or null); calling .lower() on those would raise AttributeError.
        if not isinstance(word, str) or not isinstance(level, str):
            continue
        bank = risk_word_banks.get(level.lower())
        if bank is not None:  # labels other than low/medium/high are ignored
            bank.add(word.lower())

print(f"Extracted {len(low_risk_words)} low risk words")
print(f"Extracted {len(medium_risk_words)} medium risk words")
print(f"Extracted {len(high_risk_words)} high risk words")

# Collect every run of three consecutive tokens in each paragraph as a phrase
legal_phrases = []
for para in paragraphs:
    words = word_tokenize(para)
    # zip over the token list and its two shifted copies yields each
    # three-token window; paragraphs with fewer than three tokens add nothing
    legal_phrases.extend(
        ' '.join(trigram) for trigram in zip(words, words[1:], words[2:])
    )

# Function to generate a synthetic paragraph with known risk words
def generate_synthetic_paragraph(original_paragraphs, phrases, low_words, medium_words, high_words, min_length=50, max_length=200):
    """Build a synthetic paragraph from an original one plus random filler.

    A randomly chosen original paragraph is tokenized and used as the base.
    While it is shorter than a randomly drawn target length, it is padded
    with either a known risk word (when a random draw falls below 0.3) or a
    randomly chosen phrase. The token list is then cut to the target length
    and joined with single spaces.

    Args:
        original_paragraphs: Non-empty sequence of paragraph strings.
        phrases: Sequence of whitespace-separated filler phrases. May be
            empty, in which case only risk words are used for padding.
        low_words: Collection of low risk words.
        medium_words: Collection of medium risk words.
        high_words: Collection of high risk words.
        min_length: Inclusive lower bound for the target token count.
        max_length: Inclusive upper bound for the target token count.

    Returns:
        The synthetic paragraph as a single space-joined string. It can be
        shorter than the target length when there is nothing to pad with.

    Raises:
        IndexError: If ``original_paragraphs`` is empty.
    """
    # Start with a randomly selected paragraph as base
    words = word_tokenize(random.choice(original_paragraphs))

    # Determine target length
    target_length = random.randint(min_length, max_length)

    # Create a pool of known risk words
    all_risk_words = list(low_words) + list(medium_words) + list(high_words)

    # If base is too short, pad it until the target length is reached
    while len(words) < target_length:
        if random.random() < 0.3 and all_risk_words:  # 30% chance to insert a risk word
            words.append(random.choice(all_risk_words))
        elif len(phrases) > 0:
            words.extend(random.choice(phrases).split())
        elif all_risk_words:
            # No phrases available: pad with risk words only instead of
            # letting random.choice fail on an empty sequence.
            words.append(random.choice(all_risk_words))
        else:
            # Nothing to pad with; return the base as it is
            break

    # A single trim handles both an over-long base and padding overshoot
    return ' '.join(words[:target_length])

# Function to generate synthetic risk annotations - MODIFIED
def generate_synthetic_risk(paragraph, low_words, medium_words, high_words, min_words=5, max_words=8):
    """Annotate the known risk words that occur in a paragraph.

    The paragraph is lower-cased and tokenized, and every distinct token that
    belongs to one of the word collections is labelled. A token present in
    several collections takes the first match in the order low, medium, high.
    When more than ``max_words`` tokens are labelled, a random subset of
    ``max_words`` of them is kept.

    Args:
        paragraph: Text to annotate.
        low_words: Collection of low risk words.
        medium_words: Collection of medium risk words.
        high_words: Collection of high risk words.
        min_words: Not used by this function; callers enforce any minimum.
        max_words: Largest number of annotations to return.

    Returns:
        A dict mapping each selected token to 'low', 'medium' or 'high'.
    """
    unique_tokens = set(word_tokenize(paragraph.lower()))

    # Order matters: the first bank containing a token decides its label
    labelled_banks = (
        ('low', low_words),
        ('medium', medium_words),
        ('high', high_words),
    )

    annotations = {}
    for token in unique_tokens:
        for label, bank in labelled_banks:
            if token in bank:
                annotations[token] = label
                break

    # Too many annotations: keep a random selection of max_words of them
    if len(annotations) > max_words:
        selected = random.sample(list(annotations), max_words)
        annotations = {token: annotations[token] for token in selected}

    # Too few annotations is not handled here; the caller decides what to do
    return annotations

# Generate synthetic data
target_size = 20000
# Clamp at zero so a dataset that already meets the target generates nothing
num_synthetic = max(target_size - len(df), 0)

print(f"Generating {num_synthetic} synthetic records...")

preferred_min_risk_words = 5  # annotation count the retries aim for
fallback_min_risk_words = 3   # fewest annotations still accepted after the retries
max_retries = 10              # regenerations allowed per candidate record
# Upper bound on candidate records so that sparse word banks cannot make the
# top-up loop below run forever
max_candidates = 2 * num_synthetic

synthetic_paragraphs = []
synthetic_risks = []
candidates_tried = 0

# Keep producing candidates until the requested number of valid records has
# been collected (or the candidate cap is hit); rejected candidates are
# replaced instead of leaving the final dataset short of target_size.
with tqdm(total=num_synthetic) as progress:
    while len(synthetic_paragraphs) < num_synthetic and candidates_tried < max_candidates:
        candidates_tried += 1

        # Generate paragraph with known risk words
        paragraph = generate_synthetic_paragraph(paragraphs, legal_phrases, low_risk_words, medium_risk_words, high_risk_words)

        # Generate risk annotation (only for known risk words)
        risk = generate_synthetic_risk(paragraph, low_risk_words, medium_risk_words, high_risk_words)

        # Regenerate up to max_retries times while the paragraph has fewer
        # than the preferred number of risk annotations
        attempts = 0
        while len(risk) < preferred_min_risk_words and attempts < max_retries:
            paragraph = generate_synthetic_paragraph(paragraphs, legal_phrases, low_risk_words, medium_risk_words, high_risk_words)
            risk = generate_synthetic_risk(paragraph, low_risk_words, medium_risk_words, high_risk_words)
            attempts += 1

        # Accept the record if it reached the preferred count, or, once the
        # retries are exhausted, if it still has the fallback minimum
        if len(risk) >= fallback_min_risk_words:
            synthetic_paragraphs.append(paragraph)
            synthetic_risks.append(risk)
            progress.update(1)

# Create DataFrame with synthetic data
synthetic_df = pd.DataFrame({
    'Paragraph': synthetic_paragraphs,
    'RiskLevel': synthetic_risks
})

# Report a shortfall; this is only reachable when the candidate cap was hit
if len(synthetic_paragraphs) < num_synthetic:
    print(f"Generated only {len(synthetic_paragraphs)} valid samples. Continuing...")

# Combine original and synthetic data
augmented_df = pd.concat([df, synthetic_df], ignore_index=True)

print(f"Final dataset size: {augmented_df.shape[0]} rows")

# Save the augmented dataset
augmented_df.to_excel("LegalRiskDataset_Augmented.xlsx", index=False)
print("Augmented dataset saved as 'LegalRiskDataset_Augmented.xlsx'")

# Display a sample of synthetic data
print("\nSample of synthetic data:")
print(synthetic_df.head(2))

Original dataset size: 105 rows
Extracted 6 low risk words
Extracted 24 medium risk words
Extracted 20 high risk words
Generating 19895 synthetic records...


100%|██████████| 19895/19895 [00:41<00:00, 480.83it/s]


Final dataset size: 20000 rows
Augmented dataset saved as 'LegalRiskDataset_Augmented.xlsx'

Sample of synthetic data:
                                           Paragraph  \
0  This offer letter outlines the conditions unde...   
1  All parties involved , including the employer ...   

                                           RiskLevel  
0  {'letter': 'low', 'user': 'low', 'employer': '...  
1  {'letter': 'low', 'fine': 'high', 'conditions'...  
