In [1]:
import os, json, re
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import openai
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer




load_dotenv()

True

In [2]:
# common_words_path=r"C:\Users\Manideep S\Downloads\L@\SAT Readiness Report\common_words.txt"

In [3]:
# def extract_common_words(common_words_path: str) -> list:
#     with open(common_words_path, 'r') as f:
#         return [line.strip().lower() for line in f if line.strip()]

In [4]:
def flatten_description(desc) -> str:
    if isinstance(desc, str):
        return desc
    elif isinstance(desc, list):
        return ' '.join(flatten_description(item) for item in desc)
    elif isinstance(desc, dict):
        return ' '.join(f"{k} {flatten_description(v)}" for k, v in desc.items())
    else:
        return str(desc)


In [5]:
def clean_text(text) -> str:
    text = flatten_description(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    response = openai.embeddings.create(model=model, input=text)
    return response.data[0].embedding

In [7]:
def compute_cosine_from_embeddings(e1, e2) -> float:
    return cosine_similarity([e1], [e2])[0][0]

In [8]:
def flatten(content):
        if isinstance(content, str):
            return content
        elif isinstance(content, list):
            return " ".join(flatten(item) for item in content)
        elif isinstance(content, dict):
            return " ".join(flatten(v) for v in content.values())
        return ""

## Code


In [9]:
def extract_sectionsz(data):
    sections = {}
    
    # Helper function to find section by title
    def find_section_by_title(data, title):
        for item in data:
            if isinstance(item, dict) and item.get("title") == title:
                return item
            elif isinstance(item, list):
                for sub_item in item:
                    if isinstance(sub_item, dict) and sub_item.get("title") == title:
                        return sub_item
        return None
    
    # Helper function to find nested section by title within a parent section
    def find_nested_section(parent_section, title):
        if not parent_section or "description" not in parent_section:
            return None
        
        desc = parent_section["description"]
        if isinstance(desc, list):
            for item in desc:
                if isinstance(item, dict) and item.get("title") == title:
                    return item
        return None
    
    # Helper function to find skill areas within a section
    def find_skill_areas(parent_section):
        skill_areas_section = find_nested_section(parent_section, "Skill Areas")
        if skill_areas_section and "description" in skill_areas_section:
            rw_section = None
            math_section = None
            
            for item in skill_areas_section["description"]:
                if isinstance(item, dict):
                    if item.get("title") == "Reading & Writing":
                        rw_section = item.get("description", "")
                    elif item.get("title") == "Math":
                        math_section = item.get("description", "")
            
            return rw_section, math_section
        return None, None
    
    # Helper function to find practice methods within a section
    def find_practice_methods(parent_section):
        practice_section = find_nested_section(parent_section, "Practice Methods")
        if practice_section and "description" in practice_section:
            rw_methods = None
            math_methods = None
            
            for item in practice_section["description"]:
                if isinstance(item, dict):
                    if item.get("title") == "Reading & Writing":
                        rw_methods = item.get("description", "")
                    elif item.get("title") == "Math":
                        math_methods = item.get("description", "")
            
            return rw_methods, math_methods
        return None, None
    
    # Extract Executive Summary (from the nested list at the end)
    exec_summary = find_section_by_title(data, "Executive Summary")
    if exec_summary:
        sections["Executive Summary"] = exec_summary["description"]
    
    # Extract Key Strengths
    key_strengths = find_section_by_title(data, "Key Strengths")
    if key_strengths and "description" in key_strengths:
        desc = key_strengths["description"]
        sections["Key Strengths RW"] = desc.get("Reading & Writing", [])
        sections["Key Strengths Math"] = desc.get("Math", [])
    
    # Extract Focus Areas for Growth
    focus_areas = find_section_by_title(data, "Focus Areas for Growth")
    if focus_areas and "description" in focus_areas:
        desc = focus_areas["description"]
        sections["Focus Area RW"] = desc.get("Reading & Writing", [])
        sections["Focus Area Math"] = desc.get("Math", [])
    
    # Extract Bottom 3 Skills Resources
    resources = find_section_by_title(data, "Bottom 3 Skills Resources")
    if resources and "description" in resources:
        desc = resources["description"]
        sections["Bottom three resources RW"] = desc.get("Reading & Writing", [])
        sections["Bottom three resources Math"] = desc.get("Math", [])
    
    # Extract Foundation Phase
    foundation = find_section_by_title(data, "Foundation Phase")
    if foundation:
        # Weekly Goals
        weekly_goals = find_nested_section(foundation, "Weekly Goals")
        if weekly_goals:
            sections["foundation weekly goals"] = weekly_goals["description"]
        
        # Skill Areas
        rw_skills, math_skills = find_skill_areas(foundation)
        sections["foundation skill areas RW"] = rw_skills
        sections["foundation skill areas Math"] = math_skills
        
        # Practice Methods
        rw_methods, math_methods = find_practice_methods(foundation)
        sections["foundation practice methods RW"] = rw_methods
        sections["foundation practice methods Math"] = math_methods
        
        # Weekly Structure
        weekly_structure = find_nested_section(foundation, "Weekly Structure")
        if weekly_structure:
            sections["foundation Weekly structure"] = weekly_structure["description"]
        
        # Flex Day Activities
        flex_day = find_nested_section(foundation, "Flex Day Activities")
        if flex_day:
            sections["foundation Flex Day"] = flex_day["description"]
    
    # Extract Elevation Phase
    elevation = find_section_by_title(data, "Elevation Phase")
    if elevation:
        # Weekly Goals
        weekly_goals = find_nested_section(elevation, "Weekly Goals")
        if weekly_goals:
            sections["elevation weekly goals"] = weekly_goals["description"]
        
        # Skill Areas
        rw_skills, math_skills = find_skill_areas(elevation)
        sections["elevation skill areas RW"] = rw_skills
        sections["elevation skill areas Math"] = math_skills
        
        # Practice Methods
        rw_methods, math_methods = find_practice_methods(elevation)
        sections["elevation practice methods RW"] = rw_methods
        sections["elevation practice methods Math"] = math_methods
        
        # Weekly Structure
        weekly_structure = find_nested_section(elevation, "Weekly Structure")
        if weekly_structure:
            sections["elevation Weekly structure"] = weekly_structure["description"]
        
        # Flex Day Activities
        flex_day = find_nested_section(elevation, "Flex Day Activities")
        if flex_day:
            sections["elevation Flex Day"] = flex_day["description"]
    
    # Extract Peak Phase
    peak = find_section_by_title(data, "Peak Phase")
    if peak:
        # Weekly Goals
        weekly_goals = find_nested_section(peak, "Weekly Goals")
        if weekly_goals:
            sections["peak weekly goals"] = weekly_goals["description"]
        
        # Skill Areas
        rw_skills, math_skills = find_skill_areas(peak)
        sections["peak skill areas RW"] = rw_skills
        sections["peak skill areas Math"] = math_skills
        
        # Practice Methods
        rw_methods, math_methods = find_practice_methods(peak)
        sections["peak practice methods RW"] = rw_methods
        sections["peak practice methods Math"] = math_methods
        
        # Weekly Structure
        weekly_structure = find_nested_section(peak, "Weekly Structure")
        if weekly_structure:
            sections["peak Weekly structure"] = weekly_structure["description"]
        
        # Flex Day Activities
        flex_day = find_nested_section(peak, "Flex Day Activities")
        if flex_day:
            sections["peak Flex Day"] = flex_day["description"]
    
    # Extract Tips to Make It Work Best
    tips = find_section_by_title(data, "Tips to Make It Work Best")
    if tips:
        sections["tips"] = tips["description"]
    
    # Extract Words of Encouragement
    encouragement = find_section_by_title(data, "Words of Encouragement")
    if encouragement:
        sections["words of encouragement"] = encouragement["description"]
    
    return sections

# Test the function with your data
# sections = extract_sections(your_json_data)
# print(sections)

In [10]:
def process_executive_summary(data):
    result = {}

    for item in data.get("description", []):
        title = item.get("title", "")
        desc = item.get("description", "")

        if title == "Summary":
            result["Executive_summary_summary"] = desc

        elif title == "Key Strengths":
            rw = desc.get("Reading & Writing", [])
            math = desc.get("Math", [])
            result["Key_strengths_RW"] = ", ".join(rw)
            result["Key_strengths_Math"] = ", ".join(math)

        elif title == "Focus Areas for Growth":
            rw = desc.get("Reading & Writing", [])
            math = desc.get("Math", [])
            result["FocusAreas_RW"] = ", ".join(rw)
            result["FocusAreas_Math"] = ", ".join(math)

        elif title == "Study Plan":
            result["StudyPlan"] = desc

    return result

In [11]:
def process_foundation_phase(data):
    foundation = {}

    # Get Weekly Goals
    for section in data["description"]:
        if section["title"] == "Weekly Goals":
            foundation["foundation_WeeklyGoals"] = section["description"]

    # Get Skill Areas
    for section in data["description"]:
        if section["title"] == "Skill Areas":
            for skill in section["description"]:
                if skill["title"] == "Reading & Writing":
                    foundation["SkillAreas_RW"] = skill["description"]
                elif skill["title"] == "Math":
                    foundation["SkillAreas_Math"] = skill["description"]

    # Get Practice Methods
    for section in data["description"]:
        if section["title"] == "Practice Methods":
            for method in section["description"]:
                if method["title"] == "Reading & Writing":
                    foundation["foundation_PracticeMethods_RW"] = method["description"]
                elif method["title"] == "Math":
                    foundation["foundation_PracticeMethods_Math"] = method["description"]

    # Get Weekly Structure
    for section in data["description"]:
        if section["title"] == "Weekly Structure":
            foundation["foundation_weekly_structure"] = section["description"]

    # Get Flex Day Activities
    for section in data["description"]:
        if section["title"] == "Flex Day Activities":
            foundation["foundation_flexday_activities"] = section["description"]

    return foundation


In [12]:
def process_elevation_phase(data):
    elevation = {}

    for section in data.get("description", []):
        if section["title"] == "Weekly Goals":
            elevation["elevation_WeeklyGoals"] = section["description"]

        elif section["title"] == "Skill Areas":
            for skill in section["description"]:
                if skill["title"] == "Reading & Writing":
                    elevation["elevation_SkillAreas_RW"] = skill["description"]
                elif skill["title"] == "Math":
                    elevation["elevation_SkillAreas_Math"] = skill["description"]

        elif section["title"] == "Practice Methods":
            for method in section["description"]:
                if method["title"] == "Reading & Writing":
                    elevation["elevation_PracticeMethods_RW"] = method["description"]
                elif method["title"] == "Math":
                    elevation["elevation_PracticeMethods_Math"] = method["description"]

        elif section["title"] == "Weekly Structure":
            elevation["elevation_WeeklyStructure"] = section["description"]

        elif section["title"] == "Flex Day Activities":
            elevation["elevation_FlexDayActivities"] = section["description"]

    return elevation


In [13]:
def process_peak_phase(data):
    peak = {}

    for section in data.get("description", []):
        if section["title"] == "Weekly Goals":
            peak["peak_WeeklyGoals"] = section["description"]

        elif section["title"] == "Skill Areas":
            for skill in section["description"]:
                if skill["title"] == "Reading & Writing":
                    peak["peak_SkillAreas_RW"] = skill["description"]
                elif skill["title"] == "Math":
                    peak["peak_SkillAreas_Math"] = skill["description"]

        elif section["title"] == "Practice Methods":
            for method in section["description"]:
                if method["title"] == "Reading & Writing":
                    peak["peak_PracticeMethods_RW"] = method["description"]
                elif method["title"] == "Math":
                    peak["peak_PracticeMethods_Math"] = method["description"]

        elif section["title"] == "Weekly Structure":
            peak["peak_WeeklyStructure"] = section["description"]

        elif section["title"] == "Flex Day Activities":
            peak["peak_FlexDayActivities"] = section["description"]

    return peak


In [14]:
def process_tips_to_make_it_work_best(data):
    tips_list = data.get("description", [])
    return "\n".join(tips_list)

In [15]:
def process_this_journey_of_yours(data):
    return data.get("description", "")

In [16]:
def process_footer(data):
    return data.get("description", [])

In [17]:
def process_static_cta(data): 
    return data.get("description", [])

In [18]:
def compute_bleu_openai_cosine_scores(user_path: str):
    SECTION_PROCESSORS = {
        "EXECUTIVE_SUMMARY": process_executive_summary,
        "FOUNDATION_PHASE": process_foundation_phase,
        "ELEVATION_PHASE": process_elevation_phase,
        "PEAK_PHASE": process_peak_phase,
        "TIPS_TO_MAKE_IT_WORK_BEST": process_tips_to_make_it_work_best,
        "THIS_JOURNEY_OF_YOURS": process_this_journey_of_yours,
        "STATIC_CTA": process_static_cta,
        "FOOTER": process_footer
    }

    plans = [
        f for f in os.listdir(user_path)
        if os.path.isdir(os.path.join(user_path, f)) and f.startswith("Detailed_Study_Plan")
    ]

    if len(plans) < 2:
        raise ValueError("Need at least two study plans to compare.")

    all_data = {}

    for plan in plans:
        plan_path = os.path.join(user_path, plan)
        section_files = [f for f in os.listdir(plan_path) if f.endswith(".json")]

        merged_sections = {}
        for section_file in section_files:
            section_path = os.path.join(plan_path, section_file)
            with open(section_path, 'r', encoding='utf-8') as f:
                section_data = json.load(f)

            section_name = os.path.splitext(section_file)[0].upper()  # Normalize case

            if section_name in SECTION_PROCESSORS:
                processed_text = SECTION_PROCESSORS[section_name](section_data)
                merged_sections[section_name] = clean_text(processed_text)
            else:
                print(f"[Warning] No processing function defined for section: {section_name}")
                continue

        all_data[plan] = merged_sections

    all_sections = sorted({title for d in all_data.values() for title in d.keys()})
    smoothing = SmoothingFunction().method1
    rows = []

    for plan1, plan2 in combinations(plans, 2):
        row = {'Plan1': plan1, 'Plan2': plan2}
        full1, full2 = [], []

        for section in all_sections:
            t1 = all_data[plan1].get(section, "")
            t2 = all_data[plan2].get(section, "")

            print("\nSection:", section)
            print("Text1:", t1)
            print("Text2:", t2)

            full1.append(t1)
            full2.append(t2)

            bleu = sentence_bleu([t1.split()], t2.split(), smoothing_function=smoothing)
            row[f"{section}_BLEU"] = round(bleu, 4)

            e1 = get_embedding(t1)
            e2 = get_embedding(t2)
            cos = compute_cosine_from_embeddings(e1, e2)
            row[f"{section}_Cosine"] = round(cos, 4)

        joined1 = " ".join(full1)
        joined2 = " ".join(full2)

        bleu_full = sentence_bleu([joined1.split()], joined2.split(), smoothing_function=smoothing)
        e1_full = get_embedding(joined1)
        e2_full = get_embedding(joined2)
        cos_full = compute_cosine_from_embeddings(e1_full, e2_full)

        row["FULL_TEXT_BLEU"] = round(bleu_full, 4)
        row["FULL_TEXT_Cosine"] = round(cos_full, 4)

        rows.append(row)

    return pd.DataFrame(rows)


In [19]:
USERS = ["GovindPotti", "Ishan", "IshanaPotti", "Jevinn", "Lakshmi", "Meenakshi", "RohanBharathwaj", "SaiSaahas", "Tara", "Toni", "Vaishnavi", "Zoha"]

In [20]:
#### Single user example to test the code:

for user in USERS:
    path = r"C:\Users\Manideep S\Downloads\L@\SAT Paid Report\Users_data\{user}".format(user=user)
    df = pd.DataFrame()

    path_user = path
    if os.path.isdir(path_user):
        print(f"Processing user folder: {path_user}")
        try:
            results = compute_bleu_openai_cosine_scores(path_user)
            results['User'] = user  # Add a column to track which user's folder it was
            df = pd.concat([df, results], ignore_index=True)
        except Exception as e:
            print(f"Error processing {user}: {e}")
    # Define output file path
    output_path = os.path.join(path, 'bleu_cosine_results.csv')

    # Delete the file if it already exists
    if os.path.exists(output_path):
        os.remove(output_path)
        print(f"Deleted existing file: {output_path}")

    # Save the new results to CSV
    df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")

Processing user folder: C:\Users\Manideep S\Downloads\L@\SAT Paid Report\Users_data\GovindPotti

Section: ELEVATION_PHASE
Text1: elevation weeklygoals achieve measurable improvement in accuracy and reasoning across prioritized reading writing and math skills steadily reduce careless and repeat mistakes through structured reflection and enhance cognitive flexibility under moderate time pressure strengthen command of evidence and algebraic problem solving while mastering geometry and proportional reasoning build resilience by embracing rapid learning cycles and consistently hitting incremental performance and learning milestones toward the 1590 goal elevation skillareas rw command of evidence textual transitions text structure and purpose elevation skillareas math linear equations in one variable lines angles and triangles area and volume ratios rates proportional relationships and units percentages elevation practicemethods rw command of evidence textual engage in timed sets of challeng

In [21]:
import pandas as pd
from IPython.display import display

# Calculate the mean for each column (excluding non-numeric columns)
df_avg = df.select_dtypes(include='number').mean().to_frame().T

# Convert to percentages (multiply by 100)
df_avg_percent = df_avg * 100

# Beautiful print


pd.set_option('display.float_format', '{:,.2f}'.format)
display(df_avg_percent)

Unnamed: 0,ELEVATION_PHASE_BLEU,ELEVATION_PHASE_Cosine,EXECUTIVE_SUMMARY_BLEU,EXECUTIVE_SUMMARY_Cosine,FOOTER_BLEU,FOOTER_Cosine,FOUNDATION_PHASE_BLEU,FOUNDATION_PHASE_Cosine,PEAK_PHASE_BLEU,PEAK_PHASE_Cosine,STATIC_CTA_BLEU,STATIC_CTA_Cosine,THIS_JOURNEY_OF_YOURS_BLEU,THIS_JOURNEY_OF_YOURS_Cosine,TIPS_TO_MAKE_IT_WORK_BEST_BLEU,TIPS_TO_MAKE_IT_WORK_BEST_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
0,46.11,98.18,98.47,99.98,100.0,100.0,40.98,98.38,46.2,98.61,100.0,100.0,28.22,92.18,64.92,98.11,55.1,98.77


In [22]:
# Group by 'User' and calculate mean for each user
df_user_avg = df.groupby('User').mean(numeric_only=True)

# Convert to percentages
df_user_avg_percent = df_user_avg * 100

# Calculate overall average for each column
df_overall_avg_percent = df_user_avg_percent.mean().to_frame().T
df_overall_avg_percent.index = ['Overall Average']

# Display user-wise averages and overall average
pd.set_option('display.float_format', '{:,.2f}'.format)
display(df_user_avg_percent)
display(df_overall_avg_percent)

Unnamed: 0_level_0,ELEVATION_PHASE_BLEU,ELEVATION_PHASE_Cosine,EXECUTIVE_SUMMARY_BLEU,EXECUTIVE_SUMMARY_Cosine,FOOTER_BLEU,FOOTER_Cosine,FOUNDATION_PHASE_BLEU,FOUNDATION_PHASE_Cosine,PEAK_PHASE_BLEU,PEAK_PHASE_Cosine,STATIC_CTA_BLEU,STATIC_CTA_Cosine,THIS_JOURNEY_OF_YOURS_BLEU,THIS_JOURNEY_OF_YOURS_Cosine,TIPS_TO_MAKE_IT_WORK_BEST_BLEU,TIPS_TO_MAKE_IT_WORK_BEST_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Zoha,46.11,98.18,98.47,99.98,100.0,100.0,40.98,98.38,46.2,98.61,100.0,100.0,28.22,92.18,64.92,98.11,55.1,98.77


Unnamed: 0,ELEVATION_PHASE_BLEU,ELEVATION_PHASE_Cosine,EXECUTIVE_SUMMARY_BLEU,EXECUTIVE_SUMMARY_Cosine,FOOTER_BLEU,FOOTER_Cosine,FOUNDATION_PHASE_BLEU,FOUNDATION_PHASE_Cosine,PEAK_PHASE_BLEU,PEAK_PHASE_Cosine,STATIC_CTA_BLEU,STATIC_CTA_Cosine,THIS_JOURNEY_OF_YOURS_BLEU,THIS_JOURNEY_OF_YOURS_Cosine,TIPS_TO_MAKE_IT_WORK_BEST_BLEU,TIPS_TO_MAKE_IT_WORK_BEST_Cosine,FULL_TEXT_BLEU,FULL_TEXT_Cosine
Overall Average,46.11,98.18,98.47,99.98,100.0,100.0,40.98,98.38,46.2,98.61,100.0,100.0,28.22,92.18,64.92,98.11,55.1,98.77


# Cosine Similarity checker


In [23]:
text1 = "Aarthi, as a Persistent Hiker, you bring a calm, balanced approach to your SAT preparation that values steady progress and emotional well-being. Your thoughtful and deliberate mindset helps you maintain harmony in your studies, allowing you to stay resilient even when challenges arise. You naturally prioritize mental health and sustainable growth, which is a powerful strength in managing long-term goals. At times, you might find yourself avoiding intense academic pressure or difficult topics, preferring comfort over stretch, but this awareness is the first step toward embracing growth opportunities. With gentle encouragement to take on micro-challenges, you can continue to build momentum while preserving your inner balance."
text2 = "Aarthi, as a Persistent Hiker, you embody a balanced and thoughtful approach to learning that values steady progress and emotional well-being. Your calm and deliberate mindset helps you maintain harmony in your studies, allowing you to navigate academic challenges with maturity and self-awareness. While you naturally avoid stress and prefer comfort, this same tendency can sometimes hold you back from pushing beyond your limits. Embracing small, manageable challenges will help you grow without disrupting your inner balance, turning moments of discomfort into meaningful steps forward."

In [24]:
# t1="have you been to the new restaurant in town"
# t2="have you visited the new eatery in the city"
e1 = get_embedding(text1)
e2 = get_embedding(text2)
cos2 = TfidfVectorizer().fit_transform([text1, text2])
print("\n\ncosine:", cosine_similarity(cos2[0:1], cos2[1:2])[0][0])
cos = compute_cosine_from_embeddings(e1, e2)
print("\ncosine:", cos)



cosine: 0.5585502046006118

cosine: 0.9444101683553968
