### Importing Libraries

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
import ssl
import os
from collections import Counter
from ollama import chat
from scipy.stats import pearsonr
import re
# Work around SSL certificate verification failures when fetching NLTK data:
# if this interpreter exposes an unverified-context factory, make it the
# default HTTPS context.
# NOTE(review): this disables certificate verification for every HTTPS
# request made by this process, not only the NLTK download.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No such factory on this interpreter; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download only after the HTTPS context has been patched; calling it before
# the patch means the workaround cannot help this download.
nltk.download('stopwords')

## Part 1

### Opening the Legal Files

In [4]:
# Folder with the contract text files that make up the corpus.
folder_path = "CUAD_v1/full_contract_txt"

# Collect one lower-cased chunk per file. The files are visited in sorted
# name order because os.listdir() returns entries in arbitrary order, which
# would make the token sequence (and any bigram spanning a file boundary)
# differ between runs or machines.
corpus_parts = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            # The trailing space keeps the last word of one file from fusing
            # with the first word of the next.
            corpus_parts.append(file.read().lower() + " ")

# Join once at the end instead of growing a large string with += per file.
corpus = "".join(corpus_parts)


### Tokenizing the corpus and saving into output.txt

In [6]:
# Tokenise the whole corpus with the Treebank tokenizer.
# convert_parentheses=True makes it emit bracket placeholders (e.g. -LRB-)
# instead of the raw bracket characters.
treebank_tokenizer = TreebankWordTokenizer()
tokens = treebank_tokenizer.tokenize(corpus, convert_parentheses=True)

# One token per line. The corpus was read as UTF-8, so write it as UTF-8 too;
# the platform default encoding may be unable to represent some characters.
with open('output.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}\n" for token in tokens)

### Counting the total and unique number of tokens in corpus

In [7]:
# Frequency table for the raw Treebank tokens.
token_counts = Counter()
token_counts.update(tokens)

# (token, frequency) pairs ordered from most to least frequent.
sorted_tokens = token_counts.most_common()

In [None]:
# Corpus size: every token occurrence vs. distinct token types.
total_token_count = len(tokens)
unique_token_count = len(sorted_tokens)
print("Number of tokens in corpus:", total_token_count)
print("Number of unique tokens in corpus:", unique_token_count)

### Getting the initial type token ratio

In [None]:
# Type/token ratio = distinct tokens divided by total token occurrences.
distinct_count, occurrence_count = len(sorted_tokens), len(tokens)
type_token_ratio = distinct_count / occurrence_count

print("Type Token Ratio:", type_token_ratio)

### Saving the unique tokens and their frequencies in tokens.txt

In [10]:
# Write "token: frequency" lines, most frequent first. The encoding is
# explicit because the tokens come from UTF-8 text and the platform default
# encoding may not be able to represent them.
with open('tokens.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}: {freq}\n" for token, freq in sorted_tokens)

# Number of tokens that occur exactly once in the corpus.
single_tokens = sum(1 for _, freq in sorted_tokens if freq == 1)

### Outputting the number of tokens that appear only once

In [None]:
# Report how many tokens have a frequency of exactly one.
once_only_label = "Number of tokens that appear only once:"
print(once_only_label, single_tokens)

### Removing all parenthesis tokens

In [12]:
# Placeholder tokens that stand in for brackets in the Treebank output.
function_tokens = {'-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'}

# Keep every token that is not one of those placeholders, in original order.
filtered_tokens = []
for token in tokens:
    if token in function_tokens:
        continue
    filtered_tokens.append(token)

### Removing all punctuation tokens

In [None]:
# Re-tokenise the bracket-free tokens keeping only runs of word characters.
# Despite its name, punctuation_tokens holds the tokens that remain once
# punctuation is dropped; a token containing punctuation is split into its
# word-character runs.
regexp_tokenizer = RegexpTokenizer(r'\w+')
rejoined_text = " ".join(filtered_tokens)
punctuation_tokens = regexp_tokenizer.tokenize(rejoined_text)

# Token counts before and after punctuation removal.
count_before = len(filtered_tokens)
count_after = len(punctuation_tokens)
print("Original Tokens (Treebank):", count_before)
print("Final len(tokens) (no punctuation):", count_after)

In [14]:
# Save the punctuation-free token stream, one token per line. Written as
# UTF-8 explicitly so the result does not depend on the platform default
# encoding.
with open('output_e.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}\n" for token in punctuation_tokens)

In [15]:
# Frequency table for the punctuation-free tokens, most frequent first.
punctuation_token_counts = Counter(punctuation_tokens)
punctuation_sorted_tokens = punctuation_token_counts.most_common()

# "token: frequency" per line; explicit UTF-8 so writing does not depend on
# the platform default encoding.
with open('tokens_e.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}: {freq}\n" for token, freq in punctuation_sorted_tokens)

### Finding the total and unique number of tokens without punctuation

In [None]:
# Totals for the punctuation-free token stream.
punctuation_total = len(punctuation_tokens)
punctuation_unique = len(punctuation_sorted_tokens)
print("Number of tokens in corpus without punctuation:", punctuation_total)
print("Number of unique tokens in corpus without punctuation:", punctuation_unique)

# Distinct tokens divided by total tokens.
punctuation_type_token_ratio = punctuation_unique / punctuation_total

print("Type Token Ratio without punctuation:", punctuation_type_token_ratio)

### Removing all Stop words using the nltk stop words library

In [17]:
# NLTK's English stop-word list as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))

# Keep the punctuation-free tokens whose lower-cased form is not a stop word.
no_stop_tokens = []
for word in punctuation_tokens:
    if word.lower() in stop_words:
        continue
    no_stop_tokens.append(word)

In [18]:
# Save the tokens left after stop-word removal, one per line. Written as
# UTF-8 explicitly so the result does not depend on the platform default
# encoding.
with open('output_f.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}\n" for token in no_stop_tokens)

In [19]:
# Frequency table for the tokens without punctuation and stop words,
# most frequent first.
no_stop_token_counts = Counter(no_stop_tokens)
no_stop_sorted_tokens = no_stop_token_counts.most_common()

# "token: frequency" per line; explicit UTF-8 so writing does not depend on
# the platform default encoding.
with open('tokens_f.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}: {freq}\n" for token, freq in no_stop_sorted_tokens)

### Finding the total and unique number of tokens without punctuation and stopwords

In [None]:
# Totals once both punctuation and stop words have been removed.
no_stop_total = len(no_stop_tokens)
no_stop_unique = len(no_stop_sorted_tokens)
print("Number of tokens in corpus without punctuation and stop words:", no_stop_total)
print("Number of unique tokens in corpus without punctuation and stop words:", no_stop_unique)

# Distinct tokens divided by total tokens.
no_stop_type_token_ratio = no_stop_unique / no_stop_total

print("Type Token Ratio without punctuation and stop words:", no_stop_type_token_ratio)

### Generating Bigrams

In [None]:
# Generate bigrams from the punctuation-free tokens. Stop words are still
# present at this point, so each pair was adjacent in that token stream.
bigrams = list(nltk.bigrams(punctuation_tokens))

# Keep a bigram only when neither of its two words is a stop word.
pure_bigrams = [
    words
    for words in bigrams
    if words[0] not in stop_words and words[1] not in stop_words
]

# Frequency of each remaining bigram, most frequent first.
pure_bigram_counts = Counter(pure_bigrams)
pure_bigram_counts_sorted = pure_bigram_counts.most_common()

# Each line is the bigram tuple followed by its frequency. Written as UTF-8
# explicitly so the result does not depend on the platform default encoding.
with open('tokens_g.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{token}: {freq}\n" for token, freq in pure_bigram_counts_sorted)

### Finding the total and unique number of bigrams

## 

In [None]:
# Total bigram occurrences, distinct bigrams, and their ratio.
bigram_total = len(pure_bigrams)
bigram_types = len(pure_bigram_counts)
print(bigram_total)
print(bigram_types)
print("Type to Token ratio:", bigram_types / bigram_total)

## Part 2

### Opening the input and gold files

In [37]:
# Folder that holds the STS 2016 input (sentence pairs) and gs (gold score) files.
STS_DIR = "./sts2016-english-with-gs-v1.0 2"


def _read_sts_lines(kind, dataset):
    """Return all lines of one STS 2016 file.

    kind is "input" or "gs"; dataset is the corpus name used in the file name
    (e.g. "question-question"). The file is decoded as UTF-8 explicitly so
    that reading does not depend on the platform default encoding.
    """
    with open(f"{STS_DIR}/STS2016.{kind}.{dataset}.txt", "r", encoding="utf-8") as handle:
        return handle.readlines()


# Opening the question-question corpus
question_question_input_lines = _read_sts_lines("input", "question-question")
question_question_gs_lines = _read_sts_lines("gs", "question-question")

# Opening the post-editing corpus
postediting_input_lines = _read_sts_lines("input", "postediting")
postediting_gs_lines = _read_sts_lines("gs", "postediting")

# Opening the plagiarism corpus
plagiarism_input_lines = _read_sts_lines("input", "plagiarism")
plagiarism_gs_lines = _read_sts_lines("gs", "plagiarism")

# Opening the headlines corpus
headlines_input_lines = _read_sts_lines("input", "headlines")
headlines_gs_lines = _read_sts_lines("gs", "headlines")

# Opening the answer-answer corpus
answer_answer_input_lines = _read_sts_lines("input", "answer-answer")
answer_answer_gs_lines = _read_sts_lines("gs", "answer-answer")

### Saving the valid pairs from the corpus

In [None]:
valid_pairs = []
gold_scores = []

# (input lines, gold-score lines) for each corpus, in the order they are
# appended: question-question, post-editing, plagiarism, headlines,
# answer-answer.
corpus_line_sets = (
    (question_question_input_lines, question_question_gs_lines),
    (postediting_input_lines, postediting_gs_lines),
    (plagiarism_input_lines, plagiarism_gs_lines),
    (headlines_input_lines, headlines_gs_lines),
    (answer_answer_input_lines, answer_answer_gs_lines),
)

for corpus_inputs, corpus_golds in corpus_line_sets:
    for line, gold in zip(corpus_inputs, corpus_golds):
        gold = gold.strip()
        if not gold:
            # A blank gold line means the pair has no gold score; skip it.
            continue
        valid_pairs.append(line.strip())
        gold_scores.append(float(gold))

In [None]:

# Align input with gold scores
# Uses the question-question lines loaded in the "Opening the input and gold
# files" cell. The previous names (input_lines / gs_lines) are only assigned
# in a later cell, so a fresh top-to-bottom run failed here with NameError.
# NOTE(review): this replaces the all-corpora valid_pairs/gold_scores built in
# the previous cell; confirm that scoring only question-question is intended.
valid_pairs = []
gold_scores = []
for line, gold in zip(question_question_input_lines, question_question_gs_lines):
    gold = gold.strip()
    if gold:  # Skip pairs without gold scores
        valid_pairs.append(line.strip())  # Add sentence pair
        gold_scores.append(float(gold))  # Add gold score

# Function to query Ollama for similarity score
def get_similarity(s1, s2):
    """Ask the llama3.2 model to rate how similar two sentences are.

    Sends one user message asking for a 0-5 rating and parses the whole
    (whitespace-stripped) reply as a float.

    Returns the parsed score, or None (after printing the raw reply) when the
    reply is not a bare number.
    """
    query = f"""Rate the semantic similarity between the following two sentences on a scale from 0 (completely unrelated) to 5 (semantically equivalent). Just give the number. 
    Sentence 1: {s1}
    Sentence 2: {s2}"""

    user_message = {"role": "user", "content": query}
    response = chat(model="llama3.2", messages=[user_message])
    raw_answer = response.message.content

    try:
        return float(raw_answer.strip())
    except ValueError:
        # The model answered with something other than a plain number.
        print(f"Error processing response: {raw_answer}")
        return None

# Score every aligned pair, one model call per pair.
predicted_scores = []

for pair in valid_pairs:
    # The first two tab-separated fields of a line are the two sentences.
    fields = pair.split("\t")
    s1, s2 = fields[:2]
    score = get_similarity(s1, s2)
    print(s1, ":", s2)
    print(score)
    predicted_scores.append(score)  # None when the reply could not be parsed

# Summary plus the first ten gold/predicted scores for a quick check.
processed_count = len(predicted_scores)
print(f"Processed {processed_count} pairs.")
print("Gold Scores:", gold_scores[:10])
print("Predicted Scores:", predicted_scores[:10])


In [None]:
# Load input and gold files
# Decoded as UTF-8 explicitly so reading does not depend on the platform
# default encoding.
with open("./sts2016-english-with-gs-v1.0 2/STS2016.input.question-question.txt", "r", encoding="utf-8") as infile:
    input_lines = infile.readlines()

with open("./sts2016-english-with-gs-v1.0 2/STS2016.gs.question-question.txt", "r", encoding="utf-8") as gsfile:
    gs_lines = gsfile.readlines()

# Pair every input line with the gold score on the same line number and keep
# only the pairs that actually have a gold score.
valid_pairs = []
gold_scores = []
for line, gold in zip(input_lines, gs_lines):
    gold = gold.strip()
    if not gold:
        # Blank gold line: this pair has no gold score.
        continue
    valid_pairs.append(line.strip())
    gold_scores.append(float(gold))

# Function to query Ollama for similarity score
def get_similarity(s1, s2):
    """Ask the deepseek-r1:1.5b model to rate how similar two sentences are.

    Sends one user message asking for a 0-5 rating, removes any
    <think>...</think> section from the reply, and returns the first number
    found in what remains (decimals such as 3.5 are kept).

    Returns the score as a float, or None (after printing the raw reply) when
    the reply contains no number outside the <think> section.
    """
    # Construct query for the sentence pair
    query = f"""Rate the semantic similarity between the following two sentences on a scale from 0 (completely unrelated) to 5 (semantically equivalent). Just give the number. 
    Sentence 1: {s1}
    Sentence 2: {s2}"""

    response = chat(model="deepseek-r1:1.5b", messages=[{"role": "user", "content": query}])
    raw_answer = response.message.content

    # Drop the reasoning section so digits mentioned there are not mistaken
    # for the answer.
    cleaned_response = re.sub(r'<think>.*?</think>', '', raw_answer, flags=re.DOTALL)

    # Match an optional fractional part too; a bare \d+ would turn 3.5 into 3.
    number = re.search(r'\d+(?:\.\d+)?', cleaned_response)
    if number is None:
        # No number in the reply: report it instead of failing on None.group().
        print(f"Error processing response: {raw_answer}")
        return None
    return float(number.group())

# Score every aligned pair, one model call per pair.
predicted_scores = []

for pair in valid_pairs:
    # The first two tab-separated fields of a line are the two sentences.
    fields = pair.split("\t")
    s1, s2 = fields[:2]
    score = get_similarity(s1, s2)
    print(s1, ":", s2)
    print(score)
    predicted_scores.append(score)  # None when the reply could not be parsed

# Summary plus the first ten gold/predicted scores for a quick check.
processed_count = len(predicted_scores)
print(f"Processed {processed_count} pairs.")
print("Gold Scores:", gold_scores[:10])
print("Predicted Scores:", predicted_scores[:10])


How do I make a height adjustable desk? : How can I build a wall mounted adjustable height desk?
4.0
How can I find out why my washing machine trips the outlet? : How can I figure out why my washing machine is tripping the GFCI receptacle?
2.0
Should I use IRA money to pay down my student loans? : Should I cash out my IRA to pay my student loans?
4.0
What is the best way to repair a cracked bathtub? : What is the best way to fix this garage floor?
2.0
What's the best way to store asparagus? : What's the best way to store unused sushi rice?
3.0
What are the bus (coach) connections from Thessaloniki, Greece to Tbilisi, Georgia? : Is there a bus from Tbilisi, Georgia to Thessaloniki, Greece?
4.0
How can I replace a ceiling fan light pull chain? : How can I replace my light fixture with a ceiling fan?
4.0
How can I connect additional wires to a receptacle? : How do I connect the wires to this USB receptacle?
4.0
What kind of socket is this? : What type of faucet is this?
5.0
What is the di

In [None]:
# pearsonr() needs two equally long numeric sequences.
if len(predicted_scores) != len(gold_scores):
    raise ValueError("predicted_scores and gold_scores must have the same length")

# get_similarity() returns None when a model reply cannot be parsed, and
# pearsonr() cannot work with None, so keep only the pairs that have a
# numeric prediction.
scored_pairs = [
    (predicted, gold_score)
    for predicted, gold_score in zip(predicted_scores, gold_scores)
    if predicted is not None
]
skipped_pairs = len(predicted_scores) - len(scored_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pairs without a parsed prediction.")

correlation, _ = pearsonr(
    [predicted for predicted, _gold in scored_pairs],
    [gold_score for _predicted, gold_score in scored_pairs],
)
print(f"Pearson correlation: {correlation:.4f}")

deepseek

In [34]:
# Load input and gold files
# Decoded as UTF-8 explicitly so reading does not depend on the platform
# default encoding.
with open("./sts2016-english-with-gs-v1.0 2/STS2016.input.question-question.txt", "r", encoding="utf-8") as infile:
    input_lines = infile.readlines()

with open("./sts2016-english-with-gs-v1.0 2/STS2016.gs.question-question.txt", "r", encoding="utf-8") as gsfile:
    gs_lines = gsfile.readlines()

# Match each input line to the gold score at the same position; pairs whose
# gold line is blank are left out.
valid_pairs = []
gold_scores = []
for line, gold in zip(input_lines, gs_lines):
    gold = gold.strip()
    if not gold:
        # No gold score for this pair.
        continue
    valid_pairs.append(line.strip())
    gold_scores.append(float(gold))

# Function to query Ollama for similarity score
def get_similarity(s1, s2):
    """Ask the deepseek-r1:1.5b model to rate how similar two sentences are.

    Sends one user message asking for a 0-5 rating, removes any
    <think>...</think> section from the reply, and returns the first number
    found in what remains (decimals such as 3.5 are kept).

    Returns the score as a float, or None (after printing the raw reply) when
    the reply contains no number outside the <think> section.
    """
    # Construct query for the sentence pair
    query = f"""Rate the semantic similarity between the following two sentences on a scale from 0 (completely unrelated) to 5 (semantically equivalent). Just give the number. 
    Sentence 1: {s1}
    Sentence 2: {s2}"""

    response = chat(model="deepseek-r1:1.5b", messages=[{"role": "user", "content": query}])
    raw_answer = response.message.content

    # Drop the reasoning section so digits mentioned there are not mistaken
    # for the answer.
    cleaned_response = re.sub(r'<think>.*?</think>', '', raw_answer, flags=re.DOTALL)

    # Match an optional fractional part too; a bare \d+ would turn 3.5 into 3.
    number = re.search(r'\d+(?:\.\d+)?', cleaned_response)
    if number is None:
        # No number in the reply: report it instead of failing on None.group().
        print(f"Error processing response: {raw_answer}")
        return None
    return float(number.group())

# Run the model once per aligned pair and collect its scores in order.
predicted_scores = []

for pair in valid_pairs:
    # Only the first two tab-separated fields (the sentences) are used.
    fields = pair.split("\t")
    s1, s2 = fields[:2]
    score = get_similarity(s1, s2)
    print(s1, ":", s2)
    print(score)
    predicted_scores.append(score)  # None when the reply could not be parsed

# Summary plus the first ten gold/predicted scores for a quick check.
processed_count = len(predicted_scores)
print(f"Processed {processed_count} pairs.")
print("Gold Scores:", gold_scores[:10])
print("Predicted Scores:", predicted_scores[:10])


How do I make a height adjustable desk? : How can I build a wall mounted adjustable height desk?
4.0
How can I find out why my washing machine trips the outlet? : How can I figure out why my washing machine is tripping the GFCI receptacle?
2.0
Should I use IRA money to pay down my student loans? : Should I cash out my IRA to pay my student loans?
4.0
What is the best way to repair a cracked bathtub? : What is the best way to fix this garage floor?
2.0
What's the best way to store asparagus? : What's the best way to store unused sushi rice?
3.0
What are the bus (coach) connections from Thessaloniki, Greece to Tbilisi, Georgia? : Is there a bus from Tbilisi, Georgia to Thessaloniki, Greece?
4.0
How can I replace a ceiling fan light pull chain? : How can I replace my light fixture with a ceiling fan?
4.0
How can I connect additional wires to a receptacle? : How do I connect the wires to this USB receptacle?
4.0
What kind of socket is this? : What type of faucet is this?
5.0
What is the di

In [36]:
print(len(predicted_scores))
print(len(gold_scores))

# pearsonr() needs two equally long numeric sequences.
if len(predicted_scores) != len(gold_scores):
    raise ValueError("predicted_scores and gold_scores must have the same length")

# get_similarity() returns None when a model reply cannot be parsed, and
# pearsonr() cannot work with None, so keep only the pairs that have a
# numeric prediction.
scored_pairs = [
    (predicted, gold_score)
    for predicted, gold_score in zip(predicted_scores, gold_scores)
    if predicted is not None
]
skipped_pairs = len(predicted_scores) - len(scored_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pairs without a parsed prediction.")

correlation, _ = pearsonr(
    [predicted for predicted, _gold in scored_pairs],
    [gold_score for _predicted, gold_score in scored_pairs],
)
print(f"Pearson correlation: {correlation:.4f}")

209
209
Pearson correlation: 0.3403
