In [2]:
import json
import os
import re
from itertools import combinations

import openai
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

True

In [3]:
common_words_path=r"C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\common_words.txt"

In [4]:
def extract_common_words(common_words_path: str) -> list:
    """Load a word list from a text file that holds one word per line.

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are skipped.

    Args:
        common_words_path: Path of the word-list file.

    Returns:
        The words in file order (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(common_words_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [word.lower() for word in stripped_lines if word]

In [5]:
def flatten_description(desc) -> str:
    """Collapse a nested description into a single space-separated string.

    Strings are returned unchanged. Lists are flattened item by item. Dicts
    contribute each key followed by its flattened value. Any other value is
    converted with ``str()``.
    """
    if isinstance(desc, dict):
        pieces = []
        for key, value in desc.items():
            pieces.append(f"{key} {flatten_description(value)}")
        return ' '.join(pieces)
    if isinstance(desc, list):
        return ' '.join(map(flatten_description, desc))
    if isinstance(desc, str):
        return desc
    return str(desc)


In [6]:
def clean_text(text) -> str:
    """Normalise text for comparison.

    The input is flattened with ``flatten_description`` and lower-cased.
    Every character that is not ``a-z``, ``0-9`` or whitespace is replaced by
    a space, then whitespace runs are collapsed and the ends are trimmed.
    """
    lowered = flatten_description(text).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

In [7]:
def extract_common_words(common_words_path: str) -> list:
    """Load a word list from a text file that holds one word per line.

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are skipped.

    NOTE(review): a function with this same name is also defined in an
    earlier cell of this notebook; whichever definition ran last is the one
    in effect, so one of the two cells should be deleted.

    Args:
        common_words_path: Path of the word-list file.

    Returns:
        The words in file order (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(common_words_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [word.lower() for word in stripped_lines if word]

In [8]:
def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The text to embed.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    api_response = openai.embeddings.create(input=text, model=model)
    first_item = api_response.data[0]
    return first_item.embedding

In [9]:
def compute_cosine_from_embeddings(e1, e2) -> float:
    """Return the cosine similarity between two embedding vectors.

    Each vector is wrapped as a single-row matrix for scikit-learn's
    ``cosine_similarity``; the only entry of the resulting 1x1 matrix is
    returned.
    """
    similarity_matrix = cosine_similarity([e1], [e2])
    return similarity_matrix[0][0]

In [10]:
def flatten(content):
    """Join all string leaves of a nested structure with single spaces.

    Strings are returned unchanged. Lists are flattened item by item. Dicts
    are flattened over their values only (keys are dropped). Any other value
    contributes an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # A dict is handled exactly like the list of its values.
        content = list(content.values())
    if isinstance(content, list):
        return " ".join(map(flatten, content))
    return ""

## Section extraction and BLEU / cosine scoring


In [11]:
def extract_sections_with_weekly_schedule(data):
    """Map section titles to their flattened description text.

    Every entry of ``data`` is stored under its ``"title"`` with its
    ``"description"`` flattened. Two titles are expanded instead of stored
    whole, provided their description is a list:

    * ``"Your Study Plan: Based on You"`` - each child entry is stored under
      its own title.
    * ``"Study Plan Phases"`` (as a child of the study plan) - each phase is
      stored under its own title.

    A title that occurs more than once keeps the value written last.

    Args:
        data: Iterable of entries that support ``.get("title")`` and
            ``.get("description")``.

    Returns:
        dict mapping title -> flattened text.
    """
    study_plan_title = "Your Study Plan: Based on You"
    phases_title = "Study Plan Phases"
    collected = {}

    for entry in data:
        entry_title = entry.get("title")
        entry_body = entry.get("description")

        # Ordinary section: store it whole and move on.
        if not (entry_title == study_plan_title and isinstance(entry_body, list)):
            collected[entry_title] = flatten(entry_body)
            continue

        for child in entry_body:
            child_title = child.get("title")
            child_body = child.get("description")

            if not (child_title == phases_title and isinstance(child_body, list)):
                collected[child_title] = flatten(child_body)
                continue

            # Each study-plan phase becomes its own top-level key.
            for phase in child_body:
                phase_title = phase.get("title")
                phase_body = phase.get("description")
                collected[phase_title] = flatten(phase_body)

    return collected

In [12]:
def _strip_common_words(text: str, common_words: set) -> str:
    """Drop every whitespace-separated token of ``text`` that is in ``common_words``."""
    return " ".join(word for word in text.split() if word not in common_words)


def compute_bleu_openai_cosine_scores(folder_path: str, verbose: bool = True):
    """Score every pair of report files in a folder with BLEU and embedding cosine.

    Files whose names match ``.*_\\d+\\.json$`` are loaded, split into
    sections with ``extract_sections_with_weekly_schedule``, cleaned with
    ``clean_text`` and stripped of the words listed in the file at the
    module-level ``common_words_path``. For every pair of files and every
    section title found in any file, the function records a smoothed
    sentence-level BLEU score (first file as reference, second as hypothesis)
    and the cosine similarity of the two texts' OpenAI embeddings. The same
    two scores are also computed on the concatenation of all sections.

    Args:
        folder_path: Directory containing the ``*_<number>.json`` files.
        verbose: When True (the default), print each section's two texts as
            they are compared.

    Returns:
        pandas.DataFrame with one row per file pair and the columns ``doc1``,
        ``doc2``, ``<section>_BLEU`` / ``<section>_Cosine`` per section, plus
        ``FULL_TEXT_BLEU`` and ``FULL_TEXT_Cosine``. Scores are rounded to
        four decimals. A cosine value is NaN when either text is blank.

    Raises:
        ValueError: If fewer than two matching files are found.
    """
    # os.listdir order is arbitrary; sort so the pair order (and therefore
    # which file is the BLEU reference) is the same on every run.
    files = sorted(f for f in os.listdir(folder_path) if re.match(r'.*_\d+\.json$', f))
    if len(files) < 2:
        raise ValueError("Need at least two `.json` files to compare.")

    # The word list is the same for every pair and section, so read it once.
    common_words = set(extract_common_words(common_words_path=common_words_path))

    all_data = {}
    for file in files:
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        extracted = extract_sections_with_weekly_schedule(json_data)
        all_data[file] = {
            title: _strip_common_words(clean_text(body), common_words)
            for title, body in extracted.items()
        }

    all_sections = sorted({title for d in all_data.values() for title in d.keys()})
    smoothing = SmoothingFunction().method1

    # Each text takes part in several pairs; embed it only once.
    embedding_cache = {}

    def _embed(text):
        if text not in embedding_cache:
            embedding_cache[text] = get_embedding(text)
        return embedding_cache[text]

    def _cosine(text_a, text_b):
        # A section missing from one file (or emptied by word filtering) is a
        # blank string, which the embeddings endpoint rejects; report the
        # similarity as undefined instead of failing the whole run.
        if not text_a.strip() or not text_b.strip():
            return float("nan")
        return round(compute_cosine_from_embeddings(_embed(text_a), _embed(text_b)), 4)

    rows = []
    for f1, f2 in combinations(files, 2):
        row = {'doc1': f1, 'doc2': f2}
        full1, full2 = [], []

        for section in all_sections:
            t1 = all_data[f1].get(section, "")
            t2 = all_data[f2].get(section, "")

            if verbose:
                print("\n\nSection:", section)
                print("Text1:", t1)
                print("Text2:", t2)

            full1.append(t1)
            full2.append(t2)

            bleu = sentence_bleu([t1.split()], t2.split(), smoothing_function=smoothing)
            row[f"{section}_BLEU"] = round(bleu, 4)
            row[f"{section}_Cosine"] = _cosine(t1, t2)

        joined1 = " ".join(full1)
        joined2 = " ".join(full2)

        bleu_full = sentence_bleu([joined1.split()], joined2.split(), smoothing_function=smoothing)
        row["FULL_TEXT_BLEU"] = round(bleu_full, 4)
        row["FULL_TEXT_Cosine"] = _cosine(joined1, joined2)

        rows.append(row)

    return pd.DataFrame(rows)


In [13]:
#### Single-user example that runs the scoring end to end.

user = "Aarthi"
path = r"C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\Users_data\{user}".format(user=user)
path_user = path

# Collect result frames in a list and concatenate once at the end; this
# avoids concatenating onto an empty DataFrame.
user_frames = []
if os.path.isdir(path_user):
    print(f"Processing user folder: {path_user}")
    try:
        results = compute_bleu_openai_cosine_scores(path_user)
        results['User'] = user  # Record which user's folder the rows came from
        user_frames.append(results)
    except Exception as e:
        # Best effort: report the failure and leave `df` empty.
        print(f"Error processing {user}: {e}")
else:
    # Say so explicitly; otherwise a wrong path just yields an empty `df`.
    print(f"User folder not found, nothing processed: {path_user}")

df = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()

print(df)

Processing user folder: C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\Users_data\Aarthi


Section: Elevation
Text1: date july 1 2025 july 31 2025 reading and writing build fluency in information and ideas through timed practice and reflective summaries to enhance confidence and analytical skills math strengthen algebra skills with peer discussions or stepwise problem solving to refine accuracy and deepen conceptual understanding
Text2: date july 1 2025 july 31 2025 reading and writing strengthen information and ideas through reflective practice and timed drills that build confidence and fluency math enhance algebra skills with targeted refinement and spaced repetition to solidify accuracy and speed


Section: Footer
Text1: safe harbor the scores assessments and recommendations provided in this report are for informational purposes only and do not constitute professional educational psychological or legal advice current performance is not necessarily indicative of actual or futu

In [14]:
# Define output file path
output_path = os.path.join(path, 'bleu_cosine_results.csv')

if df.empty:
    # Nothing was scored (for example, the run above failed): keep any earlier
    # results file instead of replacing it with an empty one.
    print(f"No results to save; left {output_path} untouched")
else:
    # Delete the file if it already exists
    if os.path.exists(output_path):
        os.remove(output_path)
        print(f"Deleted existing file: {output_path}")

    # Save the new results to CSV
    df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")

Deleted existing file: C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\Users_data\Aarthi\bleu_cosine_results.csv
Saved results to C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\Users_data\Aarthi\bleu_cosine_results.csv


In [15]:
# Mean of every numeric score column across all compared pairs, kept as a
# single-row frame so it renders as a table.
df_avg = df.select_dtypes(include='number').mean().to_frame().T

# Scale by 100 so the scores read as percentages.
df_avg_percent = df_avg * 100

# Show two decimals with thousands separators in every displayed frame.
pd.set_option('display.float_format', '{:,.2f}'.format)
display(df_avg_percent)

Unnamed: 0,Elevation_BLEU,Elevation_Cosine,Footer_BLEU,Footer_Cosine,Foundation_BLEU,Foundation_Cosine,Math Strategies_BLEU,Math Strategies_Cosine,Peak_BLEU,Peak_Cosine,...,Want to Go Further?_BLEU,Want to Go Further?_Cosine,What This Means For You_BLEU,What This Means For You_Cosine,Your Learning Persona_BLEU,Your Learning Persona_Cosine,Your SAT Practice Snapshot_BLEU,Your SAT Practice Snapshot_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
0,42.62,95.47,100.0,100.0,47.76,95.71,24.91,88.0,51.04,94.65,...,45.63,93.94,36.08,93.73,57.72,98.63,71.39,96.47,56.47,98.54


In [16]:
# Per-user mean of every numeric score column.
df_user_avg = df.groupby('User').mean(numeric_only=True)

# Same values expressed as percentages.
df_user_avg_percent = df_user_avg * 100

# Mean of the per-user percentages, as a one-row frame labelled
# 'Overall Average'.
df_overall_avg_percent = df_user_avg_percent.mean().to_frame(name='Overall Average').T

# Render the per-user table first, then the overall row.
pd.set_option('display.float_format', '{:,.2f}'.format)
display(df_user_avg_percent, df_overall_avg_percent)

Unnamed: 0_level_0,Elevation_BLEU,Elevation_Cosine,Footer_BLEU,Footer_Cosine,Foundation_BLEU,Foundation_Cosine,Math Strategies_BLEU,Math Strategies_Cosine,Peak_BLEU,Peak_Cosine,...,Want to Go Further?_BLEU,Want to Go Further?_Cosine,What This Means For You_BLEU,What This Means For You_Cosine,Your Learning Persona_BLEU,Your Learning Persona_Cosine,Your SAT Practice Snapshot_BLEU,Your SAT Practice Snapshot_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aarthi,42.62,95.47,100.0,100.0,47.76,95.71,24.91,88.0,51.04,94.65,...,45.62,93.94,36.08,93.73,57.72,98.63,71.39,96.46,56.47,98.54


Unnamed: 0,Elevation_BLEU,Elevation_Cosine,Footer_BLEU,Footer_Cosine,Foundation_BLEU,Foundation_Cosine,Math Strategies_BLEU,Math Strategies_Cosine,Peak_BLEU,Peak_Cosine,...,Want to Go Further?_BLEU,Want to Go Further?_Cosine,What This Means For You_BLEU,What This Means For You_Cosine,Your Learning Persona_BLEU,Your Learning Persona_Cosine,Your SAT Practice Snapshot_BLEU,Your SAT Practice Snapshot_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
Overall Average,42.62,95.47,100.0,100.0,47.76,95.71,24.91,88.0,51.04,94.65,...,45.62,93.94,36.08,93.73,57.72,98.63,71.39,96.46,56.47,98.54


# Cosine Similarity checker


In [17]:
# Two sample paragraphs used as inputs for the similarity check in the next cell.
text1 = "Aarthi, as a Persistent Hiker, you bring a calm, balanced approach to your SAT preparation that values steady progress and emotional well-being. Your thoughtful and deliberate mindset helps you maintain harmony in your studies, allowing you to stay resilient even when challenges arise. You naturally prioritize mental health and sustainable growth, which is a powerful strength in managing long-term goals. At times, you might find yourself avoiding intense academic pressure or difficult topics, preferring comfort over stretch, but this awareness is the first step toward embracing growth opportunities. With gentle encouragement to take on micro-challenges, you can continue to build momentum while preserving your inner balance."
text2 = "Aarthi, as a Persistent Hiker, you embody a balanced and thoughtful approach to learning that values steady progress and emotional well-being. Your calm and deliberate mindset helps you maintain harmony in your studies, allowing you to navigate academic challenges with maturity and self-awareness. While you naturally avoid stress and prefer comfort, this same tendency can sometimes hold you back from pushing beyond your limits. Embracing small, manageable challenges will help you grow without disrupting your inner balance, turning moments of discomfort into meaningful steps forward."

In [18]:
# Embed both sample paragraphs with the OpenAI model.
e1 = get_embedding(text1)
e2 = get_embedding(text2)

# Lexical baseline: TF-IDF vectors fitted on just these two texts.
cos2 = TfidfVectorizer().fit_transform([text1, text2])
tfidf_similarity = cosine_similarity(cos2[0:1], cos2[1:2])
print("\n\ncosine:", tfidf_similarity[0][0])

# Embedding-based similarity of the same two texts.
cos = compute_cosine_from_embeddings(e1, e2)
print("\ncosine:", cos)



cosine: 0.5585502046006118

cosine: 0.9443697862511105
