In [1]:
import glob
import json
import os
from datetime import datetime

from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI


In [2]:
# Load variables from the local .env file into the process environment.
load_dotenv()
# os.environ only accepts strings: the previous unconditional
# `os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")` raised TypeError
# whenever the key was absent (for example in an OpenAI-only setup), so the
# value is re-exported only when it is actually present.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key is not None:
    os.environ["GROQ_API_KEY"] = groq_api_key

In [3]:
def initialize_llm():
    """Build the chat model selected by the ``MODEL_PROVIDER`` environment variable.

    ``MODEL_PROVIDER`` is matched case-insensitively after stripping
    surrounding whitespace:

    * ``GROQ``   -> ``ChatGroq`` using ``GROQ_MODEL`` and ``GROQ_API_KEY``
    * ``OPENAI`` -> ``ChatOpenAI`` using ``OPENAI_MODEL`` and ``OPENAI_API_KEY``

    Both providers are constructed with the same sampling settings.

    Returns:
        The constructed ``ChatGroq`` or ``ChatOpenAI`` instance.

    Raises:
        ValueError: if ``MODEL_PROVIDER`` is unset or names an unsupported
            provider (previously this surfaced as an UnboundLocalError on
            ``return llm``).
    """
    raw_provider = os.getenv("MODEL_PROVIDER")
    provider = (raw_provider or "").strip().upper()

    # Sampling settings shared by both providers.
    common_kwargs = dict(
        temperature=0.1,
        max_tokens=5000,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    if provider == "GROQ":
        return ChatGroq(
            model=os.getenv("GROQ_MODEL"),
            api_key=os.getenv("GROQ_API_KEY"),
            **common_kwargs,
        )

    if provider == "OPENAI":
        return ChatOpenAI(
            model=os.getenv("OPENAI_MODEL"),
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            **common_kwargs,
        )

    raise ValueError(
        f"Unsupported MODEL_PROVIDER {raw_provider!r}; expected 'GROQ' or 'OPENAI'."
    )

In [4]:
# Instantiate the chat model once; the summarization cells below reuse `llm`.
llm = initialize_llm()

In [None]:
# Prompt template sent to the LLM for each skill. `{URLS}` is the only
# placeholder; template_generator() fills it in with str.format().
# NOTE(review): only the URLs themselves are placed in the prompt -- no visible
# cell downloads the page contents, so confirm the model can actually read them.
PROMPT = """
You are an expert educational content summarizer. Given multiple URLs of educational webpages, read the content from each URL carefully and generate a comprehensive, clear summary suitable for a student preparing for an exam.

Important: Your entire summary must be written in your own words with zero direct copying from any source - rewrite every concept using your own explanations and language.

**Instructions:**
1. Read and analyze the content from each provided URL
2. Identify common themes, concepts, and techniques across all sources
3. Synthesize the information into a cohesive summary that covers all important points
4. If sources cover different aspects of the same topic, integrate them seamlessly
5. If sources cover completely different topics, organize them into separate sections

The summary should include:  
 - A brief overview of the topic(s) or skill(s) covered across all sources
 - The key concepts or techniques explained in the lessons, consolidated from all URLs
 - Actionable study tips to help the student understand and apply these concepts effectively
 - Types of exercises the student should practice based on these topics — describe them clearly
 - Common mistakes or pitfalls to avoid while applying the concepts

Do not copy text verbatim from any source; rewrite all ideas in your own words. Keep the summary structured with headings or bullet points if appropriate, and ensure it is easy to understand.

**URLs to analyze:**
{URLS}
"""

In [6]:
def template_generator(urls: list):
    """Return the PROMPT text with its ``{URLS}`` placeholder filled in.

    Args:
        urls: value substituted for ``{URLS}``; it is rendered through
            ``str.format``, so a list appears as its Python string form.

    Returns:
        The fully formatted prompt string.
    """
    return PROMPT.format(URLS=urls)

In [9]:
# Path of the Input_data.json whose skill entries supply the educational
# resource links that get summarized below.
# NOTE(review): machine-specific absolute path; inside a raw string the `\\`
# sequences stay as two literal backslashes -- confirm this is intended.
links_resource = r"C:\Users\Manideep S\Downloads\L@\SAT Paid Report\Users_data\\RohanByali\\Input_data.json"

In [10]:
# Parse the reference report so its skill lists can be read in the next cell.
with open(links_resource, encoding="utf-8") as json_file:
    json_data = json.load(json_file)

In [11]:
# Both skill lists sit under the tab at index 3 of the readiness report:
# subject 0 feeds rw_skills and subject 1 feeds math_skills.
report_subjects = json_data["report"]["sat_readiness_report"]["tabs"][3]["subjects"]
rw_skills = report_subjects[0]["sections"][0]["section_details"]
math_skills = report_subjects[1]["sections"][0]["section_details"]

In [12]:
# Ask the LLM for one summary per entry in rw_skills, keeping the source
# links next to the generated text.
rw_skills_links_summaries = []
for skill in rw_skills:
    resources = skill["educational_resources"]
    reply = llm.invoke(template_generator(resources))
    rw_skills_links_summaries.append(
        {
            "name": skill["name"],
            "link": resources,
            # Fall back to the raw reply when it exposes no `.content` attribute.
            "summary": getattr(reply, "content", reply),
        }
    )

In [14]:
# Persist the rw summaries as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open("rw_skills_links_summaries.json", "w", encoding="utf-8") as out_file:
    json.dump(rw_skills_links_summaries, out_file, ensure_ascii=False, indent=4)

In [13]:
# Ask the LLM for one summary per entry in math_skills, keeping the source
# links next to the generated text.
math_skills_links_summaries = []
for skill in math_skills:
    resources = skill["educational_resources"]
    reply = llm.invoke(template_generator(resources))
    math_skills_links_summaries.append(
        {
            "name": skill["name"],
            "link": resources,
            # Fall back to the raw reply when it exposes no `.content` attribute.
            "summary": getattr(reply, "content", reply),
        }
    )

In [15]:
# Persist the math summaries as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open("math_skills_links_summaries.json", "w", encoding="utf-8") as out_file:
    json.dump(math_skills_links_summaries, out_file, ensure_ascii=False, indent=4)

In [20]:
# Root folder that is searched (one directory level down) for Input_data.json files.
# NOTE(review): machine-specific absolute path -- consider making it configurable.
input_files_path = r"C:\Users\Manideep S\Downloads\L@\SAT Paid Report\Users_data"

In [21]:
# Collect every Input_data.json found exactly one directory level below
# input_files_path. glob returns matches in arbitrary order, so the result is
# sorted to keep the processing order reproducible across runs and machines.
# (`glob` is imported with the other modules in the notebook's import cell.)
input_files = sorted(glob.glob(os.path.join(input_files_path, "*", "Input_data.json")))

In [22]:
# Display the discovered file paths.
print(input_files)

['C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Aarthi\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Abhinav\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Amrita\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Anirudh\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\GouriPradeep\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\GovindPotti\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Ishan\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\IshanaPotti\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Jevinn\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SAT Paid Report\\Users_data\\Lakshmi\\Input_data.json', 'C:\\Users\\Manideep S\\Downloads\\L@\\SA

In [23]:
def _attach_summaries(section_details, summaries):
    """Overwrite ``educational_resources`` on every skill that has a summary.

    Args:
        section_details: list of skill dicts from one report; each is matched
            on its ``"name"`` value and modified in place.
        summaries: list of dicts with ``"name"``, ``"link"`` and ``"summary"``
            keys, as built by the summarization cells.

    Skills whose name has no entry in ``summaries`` are left untouched. When
    several summaries share a name, the last one wins.
    """
    by_name = {entry["name"]: entry for entry in summaries}
    for data_skill in section_details:
        entry = by_name.get(data_skill["name"])
        if entry is None:
            continue
        data_skill["educational_resources"] = [
            {
                "name": entry["name"],
                "link": entry["link"],
                "summary": entry["summary"],
            }
        ]


for file_path in input_files:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    subjects = data["report"]["sat_readiness_report"]["tabs"][3]["subjects"]
    # RW
    _attach_summaries(subjects[0]["sections"][0]["section_details"], rw_skills_links_summaries)
    # MATH
    _attach_summaries(subjects[1]["sections"][0]["section_details"], math_skills_links_summaries)

    # Serialize fully before reopening the file for writing: if serialization
    # fails, the original input file is left untouched instead of being
    # partially overwritten.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)

In [None]:
# Generate one LLM summary per entry of key_value_array and store it under the
# entry's first element.
# NOTE(review): `key_value_array` is not defined in any visible cell, so this
# cell would raise NameError on a fresh kernel unless it is created elsewhere.
# Presumably each entry is a (skill name, resource links) pair -- TODO confirm.
content_array = {}  # a dict despite the name: each_skill[0] -> summary
for each_skill in key_value_array:
    # Element 1 of the entry is what gets substituted into the prompt.
    prompt_template = template_generator(each_skill[1])
    response = llm.invoke(prompt_template)
    # Use `.content` when the response has it, otherwise keep the raw response.
    content = response.content if hasattr(response, "content") else response
    content_array[each_skill[0]] = content

In [17]:
# print(len(content_array))

In [None]:
# Write each collected summary to its own UTF-8 text file, named after its key.
for skill_key, summary_text in content_array.items():
    with open("math_"+skill_key+"_summary.txt", "w", encoding="utf-8") as out_file:
        out_file.write(summary_text)