### Installing Libraries

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
import ssl
import os
from collections import Counter
from ollama import chat
from scipy.stats import pearsonr
import re
import logging
from sentence_transformers import SentenceTransformer, util
from scipy.stats import pearsonr
from transformers import RobertaModel, RobertaTokenizer
import torch
import logging
from ollama import Client
import time

nltk.download('stopwords')
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

## Part 1

### Opening the Legal Files

In [None]:
folder_path = "CUAD_v1/full_contract_txt"
corpus = ""

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            corpus += file.read().lower() + " "


### Tokenizing the corpus and saving into output.txt

In [None]:
treebank_tokenizer = TreebankWordTokenizer()

tokens = treebank_tokenizer.tokenize(corpus, convert_parentheses=True)

with open('output.txt', 'w', encoding='utf-8') as file:
    for token in tokens:
        file.write(f"{token}\n")

### Counting the total and unique number of tokens in corpus

In [None]:
token_counts = Counter(tokens)

sorted_tokens = token_counts.most_common()

In [None]:
print("Number of tokens in corpus:", len(tokens))
print("Number of unique tokens in corpus:", len(sorted_tokens))

### Getting the initial type token ratio

In [None]:
type_token_ratio = len(sorted_tokens)/len(tokens)

print("Type Token Ratio:", type_token_ratio)

### Saving the unique tokens and it's freqeuncy in tokens.txt

In [None]:
single_tokens = 0

with open('tokens.txt', 'w') as file:
    for token, freq in sorted_tokens:
        if freq == 1:
            single_tokens+=1
        file.write(f"{token}: {freq}\n")

### Outputting the number of tokens that appear only once

In [None]:
print("Number of tokens that appear only once:", single_tokens)

### Removing all parenthesis tokens

In [None]:
function_tokens = {'-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'}
filtered_tokens = [token for token in tokens if token not in function_tokens]

### Removing all punctuation tokens

In [None]:
regexp_tokenizer = RegexpTokenizer(r'\w+')
punctuation_tokens = regexp_tokenizer.tokenize(" ".join(filtered_tokens))

# Print results
# print(punctuation_tokens)
print("Original Tokens (Treebank):", len(filtered_tokens))
print("Final len(tokens) (no punctuation):", len(punctuation_tokens))

In [None]:
with open('output_e.txt', 'w') as file:
    for token in punctuation_tokens:
        file.write(f"{token}\n")

In [None]:
punctuation_token_counts = Counter(punctuation_tokens)

punctuation_sorted_tokens = punctuation_token_counts.most_common()

with open('tokens_e.txt', 'w') as file:
    for token, freq in punctuation_sorted_tokens:
        file.write(f"{token}: {freq}\n")

### Finding the total and unique number of tokens without punctuation

In [None]:
print("Number of tokens in corpus without punctuation:", len(punctuation_tokens))
print("Number of unique tokens in corpus without punctuation:", len(punctuation_sorted_tokens))

punctuation_type_token_ratio = len(punctuation_sorted_tokens)/len(punctuation_tokens)

print("Type Token Ratio without punctuation:", punctuation_type_token_ratio)

### Removing all Stop words using the nltk stop words library

In [None]:
stop_words = set(stopwords.words('english'))
no_stop_tokens = [word for word in punctuation_tokens if word.lower() not in stop_words]
# print(no_stop_tokens)

In [None]:
with open('output_f.txt', 'w') as file:
    for token in no_stop_tokens:
        file.write(f"{token}\n")

In [None]:
no_stop_token_counts = Counter(no_stop_tokens)

no_stop_sorted_tokens = no_stop_token_counts.most_common()

with open('tokens_f.txt', 'w') as file:
    for token, freq in no_stop_sorted_tokens:
        file.write(f"{token}: {freq}\n")

### Finding the total and unique number of tokens without punctuation and stopwords

In [None]:
print("Number of tokens in corpus without punctuation and stop words:", len(no_stop_tokens))
print("Number of unique tokens in corpus without punctuation and stop words:", len(no_stop_sorted_tokens))

no_stop_type_token_ratio = len(no_stop_sorted_tokens)/len(no_stop_tokens)

print("Type Token Ratio without punctuation and stop words:", no_stop_type_token_ratio)

### Generating Bigrams

In [None]:
# Generate bigrams
bigrams = list(nltk.bigrams(punctuation_tokens))

pure_bigrams = []

for words in bigrams:
    if words[0] not in stop_words and words[1] not in stop_words:
        pure_bigrams.append(words)

pure_bigram_counts = Counter(pure_bigrams)

pure_bigram_counts_sorted= pure_bigram_counts.most_common()

with open('tokens_g.txt','w') as file:
    for token, freq in pure_bigram_counts_sorted:
        file.write(f"{token}: {freq}\n")

# print("Filtered tokens:", no_stop_tokens)
# print("Bigrams:", bigrams)

### Finding the total and unique number of bigrams

## 

In [None]:
print(len(pure_bigrams))
print(len(pure_bigram_counts))
print("Type to Token ratio:", len(pure_bigram_counts)/len(pure_bigrams))

## Part 2

### Opening the input and gold files

In [51]:
# Opening the question-question corpus
with open("./sts2016/STS2016.input.question-question.txt", "r") as infile:
    question_question_input_lines = infile.readlines()

with open("./sts2016/STS2016.gs.question-question.txt", "r") as gsfile:
    question_question_gs_lines = gsfile.readlines()


# Opening the post-editing corpus
with open("./sts2016/STS2016.input.postediting.txt", "r") as infile:
    postediting_input_lines = infile.readlines()

with open("./sts2016/STS2016.gs.postediting.txt", "r") as gsfile:
    postediting_gs_lines = gsfile.readlines()


# Opening the plagiarism corpus
with open("./sts2016/STS2016.input.plagiarism.txt", "r") as infile:
    plagiarism_input_lines = infile.readlines()

with open("./sts2016/STS2016.gs.plagiarism.txt", "r") as gsfile:
    plagiarism_gs_lines = gsfile.readlines()


# Opening the headlines corpus
with open("./sts2016/STS2016.input.headlines.txt", "r") as infile:
    headlines_input_lines = infile.readlines()

with open("./sts2016/STS2016.gs.headlines.txt", "r") as gsfile:
    headlines_gs_lines = gsfile.readlines()


# Opening the answer-answer corpus
with open("./sts2016/STS2016.input.answer-answer.txt", "r") as infile:
    answer_answer_input_lines = infile.readlines()

with open("./sts2016/STS2016.gs.answer-answer.txt", "r") as gsfile:
    answer_answer_gs_lines = gsfile.readlines()

### Saving the valid pairs from the corpus

In [52]:
valid_pairs = []
gold_scores = []

# question-question
for line, gold in zip(question_question_input_lines, question_question_gs_lines):
    gold = gold.strip()
    if gold:  
        valid_pairs.append(line.strip())  
        gold_scores.append(float(gold))  

# post-editing
for line, gold in zip(postediting_input_lines, postediting_gs_lines):
    gold = gold.strip()
    if gold:  
        valid_pairs.append(line.strip())  
        gold_scores.append(float(gold))  

# plagiarism
for line, gold in zip(plagiarism_input_lines, plagiarism_gs_lines):
    gold = gold.strip()
    if gold:  
        valid_pairs.append(line.strip())  
        gold_scores.append(float(gold))  

# headlines
for line, gold in zip(headlines_input_lines, headlines_gs_lines):
    gold = gold.strip()
    if gold:  
        valid_pairs.append(line.strip())  
        gold_scores.append(float(gold))  

# answer-answer
for line, gold in zip(answer_answer_input_lines, answer_answer_gs_lines):
    gold = gold.strip()
    if gold:  
        valid_pairs.append(line.strip())  
        gold_scores.append(float(gold))  

### Running RoBERTa on the Corpus

In [None]:
# Load RoBERTa model and tokenizer (from Hugging Face)
roberta_model_name = "roberta-base"
roberta_model = RobertaModel.from_pretrained(roberta_model_name)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)

# Disable all logging
logging.basicConfig(level=logging.CRITICAL)


# Function to calculate similarity using RoBERTa
def get_similarity_roberta(s1, s2):
    """Calculates semantic similarity between two sentences using RoBERTa embeddings."""
    # Tokenize sentences
    tokens1 = roberta_tokenizer(s1, return_tensors='pt', truncation=True, padding=True)
    tokens2 = roberta_tokenizer(s2, return_tensors='pt', truncation=True, padding=True)
    
    # Get RoBERTa embeddings
    with torch.no_grad():
        embeddings1 = roberta_model(**tokens1).last_hidden_state.mean(dim=1)
        embeddings2 = roberta_model(**tokens2).last_hidden_state.mean(dim=1)
    
    # Compute cosine similarity between the sentence embeddings
    cosine_similarity = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)
    return cosine_similarity.item()

# Process each sentence pair individually
predicted_scores_roberta = []

for i, pair in enumerate(valid_pairs, 1):
    try:
        s1, s2 = pair.split("\t")[:2]  # Split pair into sentences

        # Get similarity from RoBERTa
        score_roberta = get_similarity_roberta(s1, s2)
        predicted_scores_roberta.append(score_roberta)

        # Print progress
        print(f"{i}/{len(valid_pairs)} - {s1} : {s2} --> RoBERTa Score: {score_roberta}")

    except Exception as e:
        logging.error(f"Error processing pair {i}: {pair} - {e}")
        predicted_scores_roberta.append(None)

# Print results summary
print("RoBERTa Scores:", predicted_scores_roberta[:10])  # Show first 10 for verification


### Calculating the Pearson Score

In [None]:
# Now you can compare the predicted scores to gold scores

correlation_roberta, _ = pearsonr(predicted_scores_roberta, gold_scores)

print(f"Pearson correlation (RoBERTa): {correlation_roberta:.4f}")

### Running SBERT on the Corpus

In [None]:
# Load SBERT model (using a pre-trained model)
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Disable all logging
logging.basicConfig(level=logging.CRITICAL)

# Function to calculate similarity using SBERT
def get_similarity_sbert(s1, s2):
    """Calculates semantic similarity between two sentences using SBERT embeddings."""
    embeddings1 = sbert_model.encode(s1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(s2, convert_to_tensor=True)

    # Compute cosine similarity between the sentence embeddings
    similarity = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity.item()

# Process each sentence pair individually
predicted_scores_sbert = []

for i, pair in enumerate(valid_pairs, 1):
    try:
        s1, s2 = pair.split("\t")[:2]  # Split pair into sentences

        # Get similarity from SBERT
        score_sbert = get_similarity_sbert(s1, s2)
        predicted_scores_sbert.append(score_sbert)

        # Print progress
        print(f"{i}/{len(valid_pairs)} - {s1} : {s2} --> SBERT Score: {score_sbert}")

    except Exception as e:
        logging.error(f"Error processing pair {i}: {pair} - {e}")
        predicted_scores_sbert.append(None)

# Print results summary
print(f"\nProcessed {len(predicted_scores_sbert)} pairs.")
print("SBERT Scores:", predicted_scores_sbert[:10])  # Show first 10 for verification

### Calculating the Pearson Score

In [None]:
# Now you can compare the predicted scores to gold scores

correlation_sbert, _ = pearsonr(predicted_scores_sbert, gold_scores)

print(f"Pearson correlation (SBERT): {correlation_sbert:.4f}")

### Running LLama 3.2 on the corpus

In [55]:
import logging
import time
import numpy as np
import ollama
from scipy.spatial.distance import cosine

# Disable all logging
logging.getLogger().addHandler(logging.NullHandler())

# Function to get embeddings from Ollama for a given sentence
def text_to_embedding_ollama(text, model_name="llama3.2"):
    """Converts a sentence to an embedding using Ollama."""
    response = ollama.embed(model=model_name, input=text)
    embedding = np.array(response["embeddings"], dtype=np.float32).flatten()
    return embedding

# Function to normalize embeddings to unit length
def normalize_embedding(embedding):
    """Normalizes the embedding to unit length."""
    norm = np.linalg.norm(embedding)
    if norm == 0:
        return embedding
    return embedding / norm

# Function to get similarity score between two sentences using cosine similarity
def get_similarity(s1, s2, model_name="llama3.2", max_retries=5, score_type="float"):
    """Gets similarity score between two sentences by converting them to embeddings and using cosine similarity."""
    retries = 0
    while retries < max_retries:
        try:
            # Convert sentences to embeddings
            embedding1 = text_to_embedding_ollama(s1, model_name)
            embedding2 = text_to_embedding_ollama(s2, model_name)

            # Normalize embeddings
            embedding1 = normalize_embedding(embedding1)
            embedding2 = normalize_embedding(embedding2)

            # Calculate cosine similarity (1 - cosine distance)
            cosine_sim = 1 - cosine(embedding1, embedding2)

            # Scale similarity to range [0, 5]
            similarity_score = cosine_sim * 5

            # Ensure score is within valid bounds [0, 5]
            similarity_score = max(0, min(5, similarity_score))

            # Round to nearest integer if score_type is "int"
            if score_type == "int":
                similarity_score = round(similarity_score)

            return similarity_score

        except Exception as e:
            logging.error(f"Unexpected error: {e}. Retrying...")
            retries += 1
            time.sleep(2)

    # If maximum retries are reached, return 0
    logging.error(f"Max retries reached for pair: '{s1}' : '{s2}'")
    return 0

# Process pairs in increments of 100
predicted_scores = []
increment = 100
total_pairs = len(valid_pairs)

for start in range(0, total_pairs, increment):
    end = min(start + increment, total_pairs)
    
    print(f"Processing pairs {start+1} to {end}...")

    # Process the next batch of sentence pairs
    for i in range(start, end):
        try:
            s1, s2 = valid_pairs[i].split("\t")[:2]  # Split pair into sentences
            score = get_similarity(s1, s2, score_type="int")  # Use integer scores
            predicted_scores.append(score)

            # Print progress
            print(f"{i+1}/{total_pairs} - {s1} : {s2} --> Score: {score}")

        except Exception as e:
            logging.error(f"Error processing pair {i+1}: {valid_pairs[i]} - {e}")
            predicted_scores.append(None)

    # Optional: Pause for a while between batches to avoid rate limits
    time.sleep(10)  # Adjust the pause duration as needed

# Print results summary
print(f"\nProcessed {len(predicted_scores)} pairs.")
print("Predicted Scores:", predicted_scores[:10])  # Show first 10 for verification


Processing pairs 1 to 100...
1/1186 - How do I make a height adjustable desk? : How can I build a wall mounted adjustable height desk? --> Score: 5
2/1186 - How can I find out why my washing machine trips the outlet? : How can I figure out why my washing machine is tripping the GFCI receptacle? --> Score: 4
3/1186 - Should I use IRA money to pay down my student loans? : Should I cash out my IRA to pay my student loans? --> Score: 4
4/1186 - What is the best way to repair a cracked bathtub? : What is the best way to fix this garage floor? --> Score: 3
5/1186 - What's the best way to store asparagus? : What's the best way to store unused sushi rice? --> Score: 3
6/1186 - What are the bus (coach) connections from Thessaloniki, Greece to Tbilisi, Georgia? : Is there a bus from Tbilisi, Georgia to Thessaloniki, Greece? --> Score: 5
7/1186 - How can I replace a ceiling fan light pull chain? : How can I replace my light fixture with a ceiling fan? --> Score: 3
8/1186 - How can I connect addit

### Calculating the Pearson score

In [56]:
# Print the indexes where either predicted_scores or gold_scores is None
for i, (predicted, gold) in enumerate(zip(predicted_scores, gold_scores)):
    if predicted is None or gold is None:
        predicted_scores[i] = 0
        print(f"Index {i} - Predicted: {predicted}, Gold: {gold}")

In [57]:
print(len(predicted_scores), type(predicted_scores))
print(len(gold_scores), type(gold_scores))
correlation, _ = pearsonr(predicted_scores, gold_scores)
print(f"Pearson correlation: {correlation:.4f}")

1186 <class 'list'>
1186 <class 'list'>
Pearson correlation: 0.5154


### Running DEEPSEEK 1.5B Parameters

In [59]:
import logging
import time
import numpy as np
import ollama
from scipy.spatial.distance import cosine

# Disable all logging
logging.getLogger().addHandler(logging.NullHandler())

# Function to get embeddings from Ollama for a given sentence
def text_to_embedding_ollama(text, model_name="deepseek-r1:1.5b"):
    """Converts a sentence to an embedding using Ollama."""
    response = ollama.embed(model=model_name, input=text)
    embedding = np.array(response["embeddings"], dtype=np.float32).flatten()
    return embedding

# Function to normalize embeddings to unit length
def normalize_embedding(embedding):
    """Normalizes the embedding to unit length."""
    norm = np.linalg.norm(embedding)
    if norm == 0:
        return embedding
    return embedding / norm

# Function to get similarity score between two sentences using cosine similarity
def get_similarity(s1, s2, model_name="deepseek-r1:1.5b", max_retries=5, score_type="float"):
    """Gets similarity score between two sentences by converting them to embeddings and using cosine similarity."""
    retries = 0
    while retries < max_retries:
        try:
            # Convert sentences to embeddings
            embedding1 = text_to_embedding_ollama(s1, model_name)
            embedding2 = text_to_embedding_ollama(s2, model_name)

            # Normalize embeddings
            embedding1 = normalize_embedding(embedding1)
            embedding2 = normalize_embedding(embedding2)

            # Calculate cosine similarity (1 - cosine distance)
            cosine_sim = 1 - cosine(embedding1, embedding2)

            # Scale similarity to range [0, 5]
            similarity_score = cosine_sim * 5

            # Ensure score is within valid bounds [0, 5]
            similarity_score = max(0, min(5, similarity_score))

            # Round to nearest integer if score_type is "int"
            if score_type == "int":
                similarity_score = round(similarity_score)

            return similarity_score

        except Exception as e:
            logging.error(f"Unexpected error: {e}. Retrying...")
            retries += 1
            time.sleep(2)

    # If maximum retries are reached, return 0
    logging.error(f"Max retries reached for pair: '{s1}' : '{s2}'")
    return 0

# Process pairs in increments of 100
predicted_scores = []
increment = 100
total_pairs = len(valid_pairs)

for start in range(0, total_pairs, increment):
    end = min(start + increment, total_pairs)
    
    print(f"Processing pairs {start+1} to {end}...")

    # Process the next batch of sentence pairs
    for i in range(start, end):
        try:
            s1, s2 = valid_pairs[i].split("\t")[:2]  # Split pair into sentences
            score = get_similarity(s1, s2, score_type="int")  # Use integer scores
            predicted_scores.append(score)

            # Print progress
            print(f"{i+1}/{total_pairs} - {s1} : {s2} --> Score: {score}")

        except Exception as e:
            logging.error(f"Error processing pair {i+1}: {valid_pairs[i]} - {e}")
            predicted_scores.append(None)

    # Optional: Pause for a while between batches to avoid rate limits
    time.sleep(10)  # Adjust the pause duration as needed

# Print results summary
print(f"\nProcessed {len(predicted_scores)} pairs.")
print("Predicted Scores:", predicted_scores[:10])  # Show first 10 for verification


Processing pairs 1 to 100...
1/1186 - How do I make a height adjustable desk? : How can I build a wall mounted adjustable height desk? --> Score: 5
2/1186 - How can I find out why my washing machine trips the outlet? : How can I figure out why my washing machine is tripping the GFCI receptacle? --> Score: 5
3/1186 - Should I use IRA money to pay down my student loans? : Should I cash out my IRA to pay my student loans? --> Score: 5
4/1186 - What is the best way to repair a cracked bathtub? : What is the best way to fix this garage floor? --> Score: 5
5/1186 - What's the best way to store asparagus? : What's the best way to store unused sushi rice? --> Score: 5
6/1186 - What are the bus (coach) connections from Thessaloniki, Greece to Tbilisi, Georgia? : Is there a bus from Tbilisi, Georgia to Thessaloniki, Greece? --> Score: 5
7/1186 - How can I replace a ceiling fan light pull chain? : How can I replace my light fixture with a ceiling fan? --> Score: 5
8/1186 - How can I connect addit

### Calculating the Pearson score

In [60]:
print(len(predicted_scores))
print(len(gold_scores))
correlation, _ = pearsonr(predicted_scores, gold_scores)
print(f"Pearson correlation: {correlation:.4f}")

1186
1186
Pearson correlation: 0.2058


### Running Mistrallite 7B Parameters

In [None]:
import logging
import tensorflow as tf
import tensorflow_hub as hub
from scipy.spatial.distance import cosine

# Load Universal Sentence Encoder model
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Function to calculate similarity using Universal Sentence Encoder
def get_similarity_use(s1, s2):
    """Calculates semantic similarity between two sentences using the Universal Sentence Encoder."""
    # Encode sentences into embeddings
    embeddings1 = use_model([s1]).numpy().flatten()
    embeddings2 = use_model([s2]).numpy().flatten()

    
    # Compute cosine similarity between the sentence embeddings
    cosine_sim = 1 - cosine(embeddings1, embeddings2)  # Cosine similarity is 1 - cosine distance
    
    similarity_score = cosine_sim*5
    
    similarity_score = max(1, min(5, similarity_score))
    
    
    similarity_score = round(similarity_score)  # Round to integer
    
    return similarity_score

# Process each sentence pair individually
predicted_scores_use = []

for i, pair in enumerate(valid_pairs, 1):
    try:
        s1, s2 = pair.split("\t")[:2]  # Split pair into sentences

        # Get similarity from Universal Sentence Encoder
        score_use = get_similarity_use(s1, s2)
        predicted_scores_use.append(score_use)

        # Print progress
        print(f"{i}/{len(valid_pairs)} - {s1} : {s2} --> USE Score: {score_use}")

    except Exception as e:
        print(f"Error processing pair {i}: {pair} - {e}")
        predicted_scores_use.append(None)

# Print results summary
print(f"\nProcessed {len(predicted_scores_use)} pairs.")
print("USE Scores:", predicted_scores_use[:10])  # Show first 10 for verification


1/1186 - How do I make a height adjustable desk? : How can I build a wall mounted adjustable height desk? --> USE Score: 2
2/1186 - How can I find out why my washing machine trips the outlet? : How can I figure out why my washing machine is tripping the GFCI receptacle? --> USE Score: 2
3/1186 - Should I use IRA money to pay down my student loans? : Should I cash out my IRA to pay my student loans? --> USE Score: 1
4/1186 - What is the best way to repair a cracked bathtub? : What is the best way to fix this garage floor? --> USE Score: 3
5/1186 - What's the best way to store asparagus? : What's the best way to store unused sushi rice? --> USE Score: 3
6/1186 - What are the bus (coach) connections from Thessaloniki, Greece to Tbilisi, Georgia? : Is there a bus from Tbilisi, Georgia to Thessaloniki, Greece? --> USE Score: 2
7/1186 - How can I replace a ceiling fan light pull chain? : How can I replace my light fixture with a ceiling fan? --> USE Score: 2
8/1186 - How can I connect additi

### Calculating the Pearson score

In [69]:
# Now you can compare the predicted scores to gold scores (if applicable)
from scipy.stats import pearsonr

correlation_use, _ = pearsonr(predicted_scores_use, gold_scores)
print(f"Pearson correlation (USE): {correlation_use:.4f}")

Pearson correlation (USE): -0.7213
