In [None]:
!pip install transformers accelerate

import pandas as pd
from transformers import pipeline
import time

# Text-to-text generation pipeline backed by the public FLAN-T5-large checkpoint
# (no Hugging Face token is needed for public models).
generator = pipeline("text2text-generation", model="google/flan-t5-large")

# One instruction prompt per contract category; the dict keys are the class labels.
prompts = {
    "NDA": "Write a detailed Non-Disclosure Agreement contract with sections on confidentiality, obligations, exclusions, and governing law.",
    "SLA": "Write a long Service-Level Agreement with sections on uptime, response time, service credits, and dispute resolution.",
    "Employment": "Write a formal Employment Contract with sections on position, salary, duties, termination, and benefits.",
    "Vendor": "Write a Vendor Agreement with sections on deliverables, payment terms, warranties, and liability.",
    "Partnership": "Write a Partnership Agreement with sections on contributions, profit sharing, governance, and termination.",
}

# Contract categories, in the same order as the prompts above
# (dicts preserve insertion order).
labels = list(prompts)

# Rows per label for test run
rows_per_label = 20

# Collected [contract_text, label] rows, one per successful generation.
data = []

for label in labels:
    print(f"Generating contracts for: {label}")
    for i in range(rows_per_label):
        try:
            # do_sample=True is required for temperature/top_p to take effect.
            # Without it, generation is greedy, both settings are ignored, and
            # every call for the same prompt returns the same text, so all rows
            # of a label would be exact duplicates.
            result = generator(
                prompts[label],
                max_new_tokens=300,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
            contract_text = result[0]["generated_text"].strip()
            data.append([contract_text, label])
            # The pipeline runs the model locally in this process, so no
            # rate-limiting pause is needed between calls.

        except Exception as e:
            # Best effort: report the failed row and continue with the rest.
            print(f"Error at {label}-{i}: {e}")

# Save dataset
df = pd.DataFrame(data, columns=["contract_text", "label"])
df.to_csv("synthetic_contracts.csv", index=False)


In [None]:
# Preview the first two rows of the smaller legal-agreements CSV.
grk = pd.read_csv(
    '/content/legal_agreements_dataset.csv',
    encoding='utf-8',
)
grk.head(2)

In [None]:
import pandas as pd

# Load the full legal-agreements dataset into `df` and show its first five rows.
df = pd.read_csv("/content/legal_agreements_dataset_full.csv")
df.head(5)

Unnamed: 0,Document_ID,Text,Category
0,DOC001,"On March 14, 2025, AlphaGrowth, a Colorado cor...",NDA
1,DOC002,"On July 31, 2025, DeltaTech, a Illinois corpor...",NDA
2,DOC003,"On April 14, 2025, IotaLLC, a Texas corporatio...",NDA
3,DOC004,"On April 11, 2025, KappaTech, a Florida corpor...",NDA
4,DOC005,"On November 04, 2025, IotaLLC, a Florida corpo...",NDA


In [None]:
# (rows, columns) of the loaded dataset.
df.shape

(500, 3)

In [None]:
# Positional lookup: second row, second column. With the columns shown by
# df.head() above this is the full contract text of the second document.
df.iloc[1,1]

"On July 31, 2025, DeltaTech, a Illinois corporation at 152 Innovation Drive, Chicago, Illinois, and IotaLabs, a Nevada limited liability company at 128 Commerce Way, Las Vegas, Nevada, enter a covenant to protect sensitive information during software development collaboration. The terms include: 1. Protected Data Definition. 'Confidential Information' includes financial models, client contracts, software designs. 2. Recipient Duties. The receiving Party restricts access to authorized personnel and uses data only for software development collaboration. 3. Exclusions. Obligations exclude data publicly known, previously possessed, or independently developed. 4. Duration. The covenant lasts 2 years, with data return or destruction upon termination. 5. Jurisdiction. Illinois law governs, with disputes in Chicago courts. The Parties execute this covenant to safeguard discussions. Additionally, access logs for Confidential Information are maintained and provided upon request. Reverse enginee

In [None]:
# Call the raw-text column 'Content' for the rest of the notebook.
df = df.rename({'Text': 'Content'}, axis='columns')

In [None]:
# Shuffle all rows with a fixed seed, then rebuild a clean 0..n-1 index.
df = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [None]:
# Keep only the text and its target label.
df = df.loc[:, ['Content', 'Category']]

In [None]:
# Inspect the first six rows after shuffling and column selection.
df.head(6)

Unnamed: 0,Content,Category
0,"Effective June 26, 2025, EpsilonInc, a Califor...",Vendor
1,"On May 30, 2025, EtaLabs, a Colorado corporati...",NDA
2,"Effective June 22, 2025, DeltaInc, a Colorado ...",Vendor
3,"Effective January 15, 2025, EpsilonTech, a New...",SLA
4,"Effective June 24, 2025, IotaLabs, a Texas cor...",SLA
5,"Effective November 06, 2025, DeltaPartners, a ...",Vendor


# **Data Prep**

Here, the goal is to prepare the text so it's more suitable for training the model: case standardization, removal of punctuation and digits, and whitespace normalization. After that, I'll move on to vectorization of the text before training and testing the model.

In [None]:
import re

In [None]:
def clean_text(text):
    """Normalise a contract string for bag-of-words vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. turn every run of punctuation, underscores, or whitespace into a single
         space, so words joined by punctuation stay separate tokens
         ("non-disclosure" -> "non disclosure", "a/b" -> "a b");
      3. delete digits;
      4. collapse repeated whitespace and trim both ends.

    Args:
        text: The raw document text (a ``str``).

    Returns:
        The cleaned, lower-case string containing only letters and single spaces.
    """
    text = text.lower()
    # Replace with a space rather than deleting, so adjacent words are not fused.
    text = re.sub(r'[\W_]+', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Removing digits can leave double spaces behind; normalise them.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Cast to str first so every row can be cleaned, then normalise each document.
df['Clean_content'] = df['Content'].astype(str).map(clean_text)

In [None]:
# Put the raw text, cleaned text, and label side by side for inspection.
df = df.loc[:, ['Content', 'Clean_content', 'Category']]
df.head(5)

Unnamed: 0,Content,Clean_content,Category
0,"Effective June 26, 2025, EpsilonInc, a Califor...",effective june epsiloninc a california corpora...,Vendor
1,"On May 30, 2025, EtaLabs, a Colorado corporati...",on may etalabs a colorado corporation at marke...,NDA
2,"Effective June 22, 2025, DeltaInc, a Colorado ...",effective june deltainc a colorado corporation...,Vendor
3,"Effective January 15, 2025, EpsilonTech, a New...",effective january epsilontech a new york corpo...,SLA
4,"Effective June 24, 2025, IotaLabs, a Texas cor...",effective june iotalabs a texas corporation at...,SLA


# **Train-test Split and Model Training**

First, I begin with the vectorization of the document content. Ideally, I would like to explore several text-vectorization and word-embedding options: I started with Count Vectorizer, will also include Word2vec, and will then consider Term Frequency–Inverse Document Frequency (TF-IDF) as well.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Bag-of-words features: unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 entries.
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
)

# Target labels: NDA, SLA, Employment, Vendor, Partnership
y = df['Category']
# Sparse document-term count matrix built from the cleaned text.
X = vectorizer.fit_transform(df['Clean_content'])

In [None]:
# Split the cleaned text first and fit the vectorizer on the training rows only.
# Fitting on the full corpus before splitting lets test documents influence the
# vocabulary (and the max_features cut-off), which is a form of data leakage.
# The split uses the same seed and stratification as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['Clean_content'], y, test_size=0.3, random_state=0, stratify=y
)
X_train = vectorizer.fit_transform(text_train)
# Test rows are only transformed with the vocabulary learned from training data.
X_test = vectorizer.transform(text_test)

In [None]:
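# Sanity check (disabled): build randomly permuted training labels to fit the
# model on instead of the real ones. Presumably, if accuracy stays high with
# shuffled labels, the evaluation is leaking information.
#import numpy as np
#y_shuffled = np.random.permutation(y_train)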

In [None]:
# Logistic-regression classifier on the count features; max_iter is raised to
# 2000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X=X_train, y=y_train)
#lr_model.fit(X_train, y_shuffled)

In [None]:
# Predict the held-out documents and print per-class precision/recall/F1.
y_pred = lr_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

  Employment       1.00      1.00      1.00        30
         NDA       1.00      1.00      1.00        30
 Partnership       1.00      1.00      1.00        30
         SLA       1.00      1.00      1.00        30
      Vendor       1.00      1.00      1.00        30

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



In [None]:
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[30  0  0  0  0]
 [ 0 30  0  0  0]
 [ 0  0 30  0  0]
 [ 0  0  0 30  0]
 [ 0  0  0  0 30]]


# **Shuffling + TF-IDF**

**Testing Word2vec for Vectorization**

In [None]:
!pip install gensim
#import gensim.downloader as api

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
# `api` was never imported (the import in the install cell is commented out),
# so this cell raised NameError on a fresh kernel. Import it here, where it is used.
import gensim.downloader as api

# Pretrained 300-dimensional word vectors, fetched by name through gensim's downloader.
w2v_model = api.load("word2vec-google-news-300")
glove_model = api.load("glove-wiki-gigaword-300")

In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data for word_tokenize. Recent NLTK releases load the "punkt_tab"
# resource instead of the older pickled "punkt" models, so fetch both to work
# on either version.
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
def document_vector(text, model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    The text is converted to ``str``, lower-cased, and tokenised with
    ``word_tokenize``; tokens that are not in ``model`` are dropped.

    Args:
        text: The document to embed.
        model: Word-vector model supporting ``token in model``, indexing by a
            list of tokens, and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector of
        length ``model.vector_size`` when no token is in the vocabulary.
    """
    known_tokens = []
    for token in word_tokenize(str(text).lower()):
        if token in model:
            known_tokens.append(token)

    # Nothing to average: fall back to an all-zero embedding.
    if not known_tokens:
        return np.zeros(model.vector_size)

    word_vectors = model[known_tokens]
    return np.mean(word_vectors, axis=0)

In [None]:
# One averaged word2vec embedding per document, stacked into a 2-D array.
X_pretrained = np.array([document_vector(text, w2v_model) for text in df['Content']])
# The label column is 'Category'; there is no 'Label' column in df, so the
# previous lookup raised KeyError.
y = df['Category']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Stratified 80/20 split of the averaged word2vec features.
# Note: this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_pretrained,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit logistic regression on the embedding features (fit() returns the estimator).
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Evaluate on the held-out 20%.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

## **Exploring Support Vector Machine for classification**