In [1]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser, StrOutputParser
import boto3
import os
import re
import json
from datetime import datetime
from pydantic import BaseModel
import importlib
import shutil
from utils import utils
import pandas as pd
from utils.time_converter import TimeConverter
importlib.reload(utils)

# Pull variables from a local .env file into the process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

# Fail fast when the DeepSeek key is missing. With a None key the OpenAI client
# stack may fall back to OPENAI_API_KEY from the environment, which would send
# that credential to api.deepseek.com instead of failing clearly.
if not DEEPSEEK_API_KEY:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; add it to the environment or the .env file."
    )

# Shared chat model for the analysis cells below: the OpenAI-compatible client
# pointed at the DeepSeek endpoint.
LLM = ChatOpenAI(
    model='deepseek-chat',
    openai_api_key=DEEPSEEK_API_KEY,
    openai_api_base='https://api.deepseek.com',
)

In [16]:
# Lesson under analysis: the subject label and the room identifier together
# form the object prefix "<subject>/<room_id>".
subject, room_id = '수학', '67514d9c4c8ca68c745c1fdf'

# Look up the items stored under that prefix in 'pagecall-text' and fetch them
# into ./downloads.
# NOTE(review): presumably an S3 bucket (boto3 is imported above) — confirm in utils.
file_keys = utils.get_items('pagecall-text', f'{subject}/{room_id}')
utils.download_items('pagecall-text', file_keys, './downloads')

In [17]:
# Build `raw_data` from the contents of ./downloads (presumably one merged
# transcript — confirm against utils.merge_files).
raw_data = utils.merge_files('./downloads')
# Remove the download directory. Re-running this cell without re-running the
# download cell raises, because shutil.rmtree fails on a missing directory.
shutil.rmtree('./downloads')

In [18]:
# Per-speaker preprocessing: extract each speaker's records from the raw data,
# split them into sentences, then attach timing via utils.mapping_time.
teacher_extracted_data = utils.extract_speaker(raw_data, speaker='teacher')
teacher_splited_data = utils.split_sentences(teacher_extracted_data)
student_extracted_data = utils.extract_speaker(raw_data, speaker='student')
student_splited_data = utils.split_sentences(student_extracted_data)

teacher_splited_data = utils.mapping_time(teacher_extracted_data, teacher_splited_data)
student_splited_data = utils.mapping_time(student_extracted_data, student_splited_data)

# Speaker-specific column names let both speakers share one timeline frame.
teacher_df = pd.DataFrame(teacher_splited_data).rename(
    columns={"idx": "teacher_idx", "text": "teacher_text"}
)
student_df = pd.DataFrame(student_splited_data).rename(
    columns={"idx": "student_idx", "text": "student_text"}
)

# Combined timeline: ordered by start, then by the per-speaker indices; the
# index columns become nullable Int64 because each row belongs to one speaker.
df = (
    pd.concat([teacher_df, student_df], ignore_index=True)
    .sort_values(by=["start", "teacher_idx", "student_idx"])
    .reset_index(drop=True)
    .astype({'teacher_idx': 'Int64', 'student_idx': 'Int64'})
)[['start', 'end', 'teacher_idx', 'student_idx', 'time', 'teacher_text', 'student_text']]

# Narrow (idx, text) views per speaker, taken from the sorted timeline.
teacher_df = (
    df.loc[df['teacher_text'].notna(), ['teacher_idx', 'teacher_text']]
    .rename(columns={"teacher_idx": "idx", "teacher_text": "text"})
    .reset_index(drop=True)
)
student_df = (
    df.loc[df['student_text'].notna(), ['student_idx', 'student_text']]
    .rename(columns={"student_idx": "idx", "student_text": "text"})
    .reset_index(drop=True)
)
df

Unnamed: 0,start,end,teacher_idx,student_idx,time,teacher_text,student_text
0,2024-12-11 09:47:16.58,2024-12-11 09:47:16.88,,0,0m 0.0s ~ 0m 0.3s,,지금까지 뉴스 스토리였습니다.
1,2024-12-11 10:00:12.97,2024-12-11 10:00:18.37,0,,12m 56.4s ~ 13m 1.8s,안녕하세요.,
2,2024-12-11 10:00:18.47,2024-12-11 10:00:24.37,1,,13m 1.9s ~ 13m 7.8s,일단 오늘은 첫 수업이니까 제 소개를 좀 하면 저는 김진 선생님이고요.,
3,2024-12-11 10:00:26.47,2024-12-11 10:00:39.27,2,,13m 9.9s ~ 13m 22.7s,저랑 수업할 거고 아마 개념 설명하고 문제풀이하는 방식으로 수업할 것 같고 모르는 ...,
4,2024-12-11 10:00:31.28,2024-12-11 10:00:54.88,,1,13m 14.7s ~ 13m 38.3s,,안녕하세요
...,...,...,...,...,...,...,...
705,2024-12-11 11:04:27.57,2024-12-11 11:04:44.67,483,,77m 11.0s ~ 77m 28.1s,할 수 있을 만큼 내줄 거고 그렇게 해서 한번 수업을 잘 해보도록 합시다 그러면 다...,
706,2024-12-11 11:04:27.57,2024-12-11 11:04:44.67,484,,77m 11.0s ~ 77m 28.1s,월요일에 보도록 합시다 수고하셨어요,
707,2024-12-11 11:04:27.57,2024-12-11 11:04:44.67,485,,77m 11.0s ~ 77m 28.1s,네,
708,2024-12-11 11:04:46.28,2024-12-11 11:05:02.18,,222,77m 29.7s ~ 77m 45.6s,,내 내 내 내 내 내


In [19]:
from prompt import question_checker#QuestionChecker
from prompt import question_classifier#QuestionClassifier
from prompt import teacher_digging
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser



def question_check(chunks_with_overlap, subject, user):
    """Run the question-checker prompt over every chunk and return the indices found.

    The system prompt comes from ``question_checker.QuestionChecker``; each
    chunk is sent as the user message through the module-level ``LLM`` and the
    string replies are handed to ``utils.extract_question_indices``.

    Args:
        chunks_with_overlap: Iterable of chunks; each one becomes a user message.
        subject: Forwarded to ``QuestionChecker``.
        user: Forwarded to ``QuestionChecker``.

    Returns:
        Whatever ``utils.extract_question_indices`` returns for the batch of replies.
    """
    checker = question_checker.QuestionChecker(subject=subject, user=user)
    template = ChatPromptTemplate.from_messages(
        [('system', checker.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | StrOutputParser()

    payloads = [{"user_message": piece} for piece in chunks_with_overlap]
    return utils.extract_question_indices(pipeline.batch(payloads))

def learning_question_check(question_context, subject, user):
    """Run the question-classifier prompt over each context and return the flagged indices.

    The system prompt comes from ``question_classifier.QuestionClassifier``;
    replies from the module-level ``LLM`` are parsed as JSON and passed to
    ``utils.extract_True_indices``.

    Args:
        question_context: Iterable of contexts; each one becomes a user message.
        subject: Forwarded to ``QuestionClassifier``.
        user: Forwarded to ``QuestionClassifier``.

    Returns:
        Whatever ``utils.extract_True_indices`` returns for the parsed replies.
    """
    classifier = question_classifier.QuestionClassifier(subject=subject, user=user)
    template = ChatPromptTemplate.from_messages(
        [('system', classifier.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | JsonOutputParser()

    payloads = [{"user_message": item} for item in question_context]
    return utils.extract_True_indices(pipeline.batch(payloads))

def digging_question_check(learning_question_context, subject):
    """Run the teacher "digging" prompt over each context and return the flagged indices.

    The system prompt comes from ``teacher_digging.Digging(subject)``; replies
    from the module-level ``LLM`` are parsed as JSON and passed to
    ``utils.extract_True_indices``.

    Args:
        learning_question_context: Iterable of contexts; each one becomes a user message.
        subject: Forwarded to ``Digging``.

    Returns:
        Whatever ``utils.extract_True_indices`` returns for the parsed replies.
    """
    digging = teacher_digging.Digging(subject)
    template = ChatPromptTemplate.from_messages(
        [('system', digging.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | JsonOutputParser()

    payloads = [{"user_message": item} for item in learning_question_context]
    return utils.extract_True_indices(pipeline.batch(payloads))

# Teacher pipeline. Each stage narrows the previous stage's candidates:
#   1. chunk the teacher rows (chunk_size=30, overlap=5) and run question_check;
#   2. build a context per candidate and keep those learning_question_check flags;
#   3. rebuild the contexts and keep those digging_question_check flags.
# NOTE(review): the trailing 5 passed to get_question_context_v1/_v2 is
# presumably a context window size — confirm in utils.
chunks_with_overlap = utils.split_with_overlap(teacher_df, chunk_size=30, overlap=5)
question_indices = question_check(chunks_with_overlap, subject, '선생님')
question_context = utils.get_question_context_v1(df, question_indices, 'teacher', 5)
learning_question_indices = learning_question_check(question_context, subject, '선생님')    
learning_question_context = utils.get_question_context_v2(df, learning_question_indices, 'teacher', 5)
digging_question_indices = digging_question_check(learning_question_context, subject)   
digging_question_context = utils.get_question_context_v2(df, digging_question_indices, 'teacher', 5)   


# Persist the final contexts as JSON. The file has a .txt extension and is
# written with the "utf-8-sig" codec, so it starts with a UTF-8 BOM.
with open(f"{room_id}_선생님.txt", "w", encoding="utf-8-sig") as file:
    json.dump(digging_question_context, file, ensure_ascii=False, indent=4)




In [20]:
# Display the teacher contexts that were just written to "<room_id>_선생님.txt".
digging_question_context

[{'idx': 44,
  'question': '왜 19지?',
  'context': [{'time': '16m 43.9s ~ 16m 44.9s',
    'teacher_text': '1.3은 어디 있을까요?'},
   {'time': '16m 55.8s ~ 17m 20.4s', 'student_text': '요기요.'},
   {'time': '17m 3.7s ~ 17m 5.4s', 'teacher_text': '그치?'},
   {'time': '17m 21.8s ~ 17m 30.5s',
    'teacher_text': '마이너스 1을 분모가 8인 분수로 바꿔보면 위에 분자는 얼마죠?'},
   {'time': '17m 25.8s ~ 17m 37.1s', 'student_text': '중소를 잘 모르겠어요.'},
   {'time': '17m 38.4s ~ 17m 38.7s', 'teacher_text': '왜 19지?'},
   {'time': '17m 50.6s ~ 17m 56.1s', 'teacher_text': '어...'},
   {'time': '17m 50.6s ~ 17m 56.1s', 'teacher_text': '보자.'},
   {'time': '17m 53.4s ~ 17m 53.9s', 'student_text': '식구'},
   {'time': '17m 56.2s ~ 18m 3.0s', 'teacher_text': '분수...'},
   {'time': '17m 56.2s ~ 18m 3.0s', 'teacher_text': '분수의 개념은 알잖아.'}]},
 {'idx': 61,
  'question': '더 작다고 생각한 이유는? 음...',
  'context': [{'time': '18m 40.5s ~ 18m 46.8s',
    'teacher_text': '얘가 마이너스 8분의 8인 거 여기까지 이해되나요?'},
   {'time': '18m 47.4s ~ 18m 51.3s',
    'teacher_text': '

In [21]:
from prompt import question_checker#QuestionChecker
from prompt import question_classifier#QuestionClassifier
from prompt import student_concretizing
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser


# NOTE(review): a function with this name is already defined in an earlier
# cell; running this cell rebinds the name.
def question_check(chunks_with_overlap, subject, user):
    """Run the question-checker prompt over every chunk and return the indices found.

    The system prompt comes from ``question_checker.QuestionChecker``; each
    chunk is sent as the user message through the module-level ``LLM`` and the
    string replies are handed to ``utils.extract_question_indices``.

    Args:
        chunks_with_overlap: Iterable of chunks; each one becomes a user message.
        subject: Forwarded to ``QuestionChecker``.
        user: Forwarded to ``QuestionChecker``.

    Returns:
        Whatever ``utils.extract_question_indices`` returns for the batch of replies.
    """
    checker = question_checker.QuestionChecker(subject=subject, user=user)
    template = ChatPromptTemplate.from_messages(
        [('system', checker.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | StrOutputParser()

    payloads = [{"user_message": piece} for piece in chunks_with_overlap]
    return utils.extract_question_indices(pipeline.batch(payloads))

# NOTE(review): a function with this name is already defined in an earlier
# cell; running this cell rebinds the name.
def learning_question_check(question_context, subject, user):
    """Run the question-classifier prompt over each context and return the flagged indices.

    The system prompt comes from ``question_classifier.QuestionClassifier``;
    replies from the module-level ``LLM`` are parsed as JSON and passed to
    ``utils.extract_True_indices``.

    Args:
        question_context: Iterable of contexts; each one becomes a user message.
        subject: Forwarded to ``QuestionClassifier``.
        user: Forwarded to ``QuestionClassifier``.

    Returns:
        Whatever ``utils.extract_True_indices`` returns for the parsed replies.
    """
    classifier = question_classifier.QuestionClassifier(subject=subject, user=user)
    template = ChatPromptTemplate.from_messages(
        [('system', classifier.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | JsonOutputParser()

    payloads = [{"user_message": item} for item in question_context]
    return utils.extract_True_indices(pipeline.batch(payloads))

def concretizing_question_check(learning_question_context, subject):
    """Run the student "concretizing" prompt over each context and return the flagged indices.

    The system prompt comes from ``student_concretizing.concretizing(subject)``;
    replies from the module-level ``LLM`` are parsed as JSON and passed to
    ``utils.extract_True_indices``.

    Args:
        learning_question_context: Iterable of contexts; each one becomes a user message.
        subject: Forwarded to ``concretizing``.

    Returns:
        Whatever ``utils.extract_True_indices`` returns for the parsed replies.
    """
    concretizer = student_concretizing.concretizing(subject)
    template = ChatPromptTemplate.from_messages(
        [('system', concretizer.prompt), ('user', "{user_message}")]
    )
    pipeline = template | LLM | JsonOutputParser()

    payloads = [{"user_message": item} for item in learning_question_context]
    return utils.extract_True_indices(pipeline.batch(payloads))

# Student pipeline. Each stage narrows the previous stage's candidates:
#   1. chunk the student rows (chunk_size=30, overlap=5) and run question_check;
#   2. build a context per candidate and keep those learning_question_check flags;
#   3. rebuild the contexts and keep those concretizing_question_check flags.
# This rebinds chunks_with_overlap, question_indices, question_context and the
# learning_* names that the teacher cell also assigns.
# NOTE(review): the trailing 5 passed to get_question_context_v1/_v2 is
# presumably a context window size — confirm in utils.
chunks_with_overlap = utils.split_with_overlap(student_df, chunk_size=30, overlap=5)
question_indices = question_check(chunks_with_overlap, subject, '학생')
question_context = utils.get_question_context_v1(df, question_indices, 'student', 5)
learning_question_indices = learning_question_check(question_context, subject, '학생')    
learning_question_context = utils.get_question_context_v2(df, learning_question_indices, 'student', 5)
concretizing_question_indices = concretizing_question_check(learning_question_context, subject)   
concretizing_question_context = utils.get_question_context_v2(df, concretizing_question_indices, 'student', 5)   


# Persist the final contexts as JSON. The file has a .txt extension and is
# written with the "utf-8-sig" codec, so it starts with a UTF-8 BOM.
with open(f"{room_id}_학생.txt", "w", encoding="utf-8-sig") as file:
    json.dump(concretizing_question_context, file, ensure_ascii=False, indent=4)


In [22]:
# Display the student contexts that were just written to "<room_id>_학생.txt".
concretizing_question_context

[{'idx': 262,
  'question': '근데 이게 순서대로 계산해야 되는지 아니면은 마이너스는 마이너스끼리 이렇게 계산해야 되는지 헷갈려서',
  'context': [{'time': '36m 45.2s ~ 37m 2.3s',
    'teacher_text': '그러면 이 부분에서는 최대공략수 최소공배수 구하는 방법이랑 걔네를 이렇게 구했으면 이 공식 최대공략수랑 최소공배수를 곱한 값이 원래 두수를 곱한 값과 같다'},
   {'time': '36m 45.2s ~ 37m 24.9s',
    'teacher_text': '는 그 공식 요거 알아야 문제를 풀 수 있었던 거고요 9번은 단순 계산일텐데 얘는 혹시 한번 다시 풀어볼래요?'},
   {'time': '37m 33.6s ~ 37m 34.3s', 'teacher_text': '상관없어요.'},
   {'time': '37m 36.1s ~ 37m 36.1s', 'teacher_text': '응.'},
   {'time': '37m 36.4s ~ 37m 48.0s',
    'teacher_text': '하고 싶은 대로 순서대로 해도 되고 마이너스는 마이너스끼리 해도 되고 어떻게 하든 답은 똑같이 나와요.'},
   {'time': '37m 43.0s ~ 37m 57.8s',
    'student_text': '근데 이게 순서대로 계산해야 되는지 아니면은 마이너스는 마이너스끼리 이렇게 계산해야 되는지 헷갈려서'},
   {'time': '37m 43.0s ~ 37m 57.8s', 'student_text': '아 그래요?'},
   {'time': '37m 57.8s ~ 37m 57.9s', 'student_text': '음 네'},
   {'time': '39m 11.6s ~ 39m 36.8s', 'teacher_text': '작은게 큰거에요'},
   {'time': '39m 11.6s ~ 39m 36.8s', 'teacher_text': '그치 응 맞아요'},
   {'time': '39

In [30]:
import json
import operator
import os
import shutil
from typing import Annotated, Any, Sequence, TypedDict

import boto3
import pandas as pd
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_openai import ChatOpenAI
from langgraph.graph import Graph, StateGraph

from prompt import question_checker, question_classifier, teacher_digging, student_concretizing
from utils import utils

# Define state types
class ConversationState(TypedDict):
    subject: str
    room_id: str
    raw_data: str | None
    df: pd.DataFrame | None
    teacher_df: pd.DataFrame | None
    student_df: pd.DataFrame | None
    teacher_chunks: list | None
    student_chunks: list | None
    teacher_context: list | None
    student_context: list | None
    llm: any
    
def setup_llm():
    """Create the DeepSeek-backed chat model used by the graph nodes.

    Loads variables from a local .env file, reads ``DEEPSEEK_API_KEY`` and
    returns a ``ChatOpenAI`` client pointed at the DeepSeek endpoint.

    Returns:
        A ``ChatOpenAI`` instance for the 'deepseek-chat' model.

    Raises:
        RuntimeError: If ``DEEPSEEK_API_KEY`` is unset or empty.
    """
    load_dotenv()
    api_key = os.getenv("DEEPSEEK_API_KEY")

    # Fail fast: with a None key the OpenAI client stack may fall back to
    # OPENAI_API_KEY from the environment, which would send that credential to
    # api.deepseek.com instead of failing clearly.
    if not api_key:
        raise RuntimeError(
            "DEEPSEEK_API_KEY is not set; add it to the environment or the .env file."
        )

    return ChatOpenAI(
        model='deepseek-chat',
        openai_api_key=api_key,
        openai_api_base='https://api.deepseek.com',
    )

def process_raw_data(state: ConversationState) -> ConversationState:
    """Download the lesson's files, merge them and store the result in the state.

    The storage prefix is "<subject>/<room_id>". Files are fetched into
    ./downloads, merged by ``utils.merge_files``, and the directory is removed
    afterwards whether or not the download or merge succeeded.

    Args:
        state: Graph state; reads ``subject`` and ``room_id``.

    Returns:
        The same state object with ``raw_data`` set.
    """
    download_dir = './downloads'
    object_prefix = f'{state["subject"]}/{state["room_id"]}'

    # Drop leftovers from an earlier, interrupted run so that files belonging
    # to another room cannot be merged into this one.
    shutil.rmtree(download_dir, ignore_errors=True)
    try:
        file_keys = utils.get_items('pagecall-text', object_prefix)
        utils.download_items('pagecall-text', file_keys, download_dir)
        raw_data = utils.merge_files(download_dir)
    finally:
        # Always clean up; ignore_errors keeps a cleanup failure from masking
        # the original exception.
        shutil.rmtree(download_dir, ignore_errors=True)

    state['raw_data'] = raw_data
    return state

def create_dataframe(state: ConversationState) -> ConversationState:
    """Build the combined teacher/student timeline and store it under ``df``.

    For each speaker the raw data is run through ``utils.extract_speaker``,
    ``utils.split_sentences`` and ``utils.mapping_time``; the two resulting
    frames are concatenated, ordered by ``start`` and the per-speaker indices,
    and reduced to a fixed column order.

    Args:
        state: Graph state; reads ``raw_data``.

    Returns:
        The same state object with ``df`` set.
    """
    raw_data = state['raw_data']

    def speaker_frame(speaker):
        """Return one speaker's rows with speaker-prefixed idx/text columns."""
        extracted = utils.extract_speaker(raw_data, speaker=speaker)
        sentences = utils.split_sentences(extracted)
        timed = utils.mapping_time(extracted, sentences)
        return pd.DataFrame(timed).rename(
            columns={"idx": f"{speaker}_idx", "text": f"{speaker}_text"}
        )

    teacher_rows = speaker_frame('teacher')
    student_rows = speaker_frame('student')

    # Each row belongs to one speaker, so the other speaker's index is missing;
    # nullable Int64 keeps the indices integral.
    timeline = (
        pd.concat([teacher_rows, student_rows], ignore_index=True)
        .sort_values(by=["start", "teacher_idx", "student_idx"])
        .reset_index(drop=True)
        .astype({'teacher_idx': 'Int64', 'student_idx': 'Int64'})
    )
    column_order = ['start', 'end', 'teacher_idx', 'student_idx', 'time', 'teacher_text', 'student_text']

    state['df'] = timeline[column_order]
    return state

def prepare_teacher_student_df(state: ConversationState) -> ConversationState:
    """Derive per-speaker (idx, text) frames and their overlapping chunks.

    Rows of the combined frame are filtered to one speaker, the timing columns
    and the other speaker's columns are dropped, and the remaining columns are
    renamed to ``idx`` / ``text``. Each frame is then chunked with
    ``utils.split_with_overlap`` (chunk_size=30, overlap=5).

    Args:
        state: Graph state; reads ``df``.

    Returns:
        The same state object with ``teacher_df``, ``student_df``,
        ``teacher_chunks`` and ``student_chunks`` set.
    """
    df = state['df']
    timing_columns = ['start', 'end', 'time']

    # Teacher view and its chunks.
    teacher_rows = df[df['teacher_text'].notnull()]
    teacher_df = (
        teacher_rows
        .drop(columns=['student_text', 'student_idx', *timing_columns])
        .rename(columns={"teacher_idx": "idx", "teacher_text": "text"})
        .reset_index(drop=True)
    )
    teacher_chunks = utils.split_with_overlap(teacher_df, chunk_size=30, overlap=5)

    # Student view and its chunks.
    student_rows = df[df['student_text'].notnull()]
    student_df = (
        student_rows
        .drop(columns=['teacher_text', 'teacher_idx', *timing_columns])
        .rename(columns={"student_idx": "idx", "student_text": "text"})
        .reset_index(drop=True)
    )
    student_chunks = utils.split_with_overlap(student_df, chunk_size=30, overlap=5)

    state.update(
        teacher_df=teacher_df,
        student_df=student_df,
        teacher_chunks=teacher_chunks,
        student_chunks=student_chunks,
    )
    return state

def analyze_teacher_questions(state: ConversationState) -> ConversationState:
    """Run the three teacher-side LLM stages and store the final contexts.

    Stages, each narrowing the previous one's candidates:
      1. question checker over the teacher chunks (string replies);
      2. question classifier over the candidate contexts (JSON replies);
      3. "digging" prompt over the remaining contexts (JSON replies).

    Args:
        state: Graph state; reads ``df``, ``teacher_chunks``, ``subject`` and ``llm``.

    Returns:
        The same state object with ``teacher_context`` set.
    """
    df = state['df']
    chunks = state['teacher_chunks']
    subject = state['subject']
    llm = state['llm']

    def run_stage(system_prompt, parser, messages):
        """Batch every message through system prompt -> llm -> parser."""
        template = ChatPromptTemplate.from_messages([
            ('system', system_prompt),
            ('user', "{user_message}"),
        ])
        return (template | llm | parser).batch(
            [{"user_message": message} for message in messages]
        )

    # Stage 1: candidate questions.
    question_replies = run_stage(
        question_checker.QuestionChecker(subject=subject, user='선생님').prompt,
        StrOutputParser(),
        chunks,
    )
    question_hits = utils.extract_question_indices(question_replies)
    question_context = utils.get_question_context_v1(df, question_hits, 'teacher', 5)

    # Stage 2: keep what the classifier flags.
    learning_replies = run_stage(
        question_classifier.QuestionClassifier(subject=subject, user='선생님').prompt,
        JsonOutputParser(),
        question_context,
    )
    learning_hits = utils.extract_True_indices(learning_replies)
    learning_context = utils.get_question_context_v2(df, learning_hits, 'teacher', 5)

    # Stage 3: keep what the digging prompt flags.
    digging_replies = run_stage(
        teacher_digging.Digging(subject).prompt,
        JsonOutputParser(),
        learning_context,
    )
    digging_hits = utils.extract_True_indices(digging_replies)

    state['teacher_context'] = utils.get_question_context_v2(df, digging_hits, 'teacher', 5)
    return state

def analyze_student_questions(state: ConversationState) -> ConversationState:
    """Run the three student-side LLM stages and store the final contexts.

    Stages, each narrowing the previous one's candidates:
      1. question checker over the student chunks (string replies);
      2. question classifier over the candidate contexts (JSON replies);
      3. "concretizing" prompt over the remaining contexts (JSON replies).

    Args:
        state: Graph state; reads ``df``, ``student_chunks``, ``subject`` and ``llm``.

    Returns:
        The same state object with ``student_context`` set.
    """
    df = state['df']
    chunks = state['student_chunks']
    subject = state['subject']
    llm = state['llm']

    def run_stage(system_prompt, parser, messages):
        """Batch every message through system prompt -> llm -> parser."""
        template = ChatPromptTemplate.from_messages([
            ('system', system_prompt),
            ('user', "{user_message}"),
        ])
        return (template | llm | parser).batch(
            [{"user_message": message} for message in messages]
        )

    # Stage 1: candidate questions.
    question_replies = run_stage(
        question_checker.QuestionChecker(subject=subject, user='학생').prompt,
        StrOutputParser(),
        chunks,
    )
    question_hits = utils.extract_question_indices(question_replies)
    question_context = utils.get_question_context_v1(df, question_hits, 'student', 5)

    # Stage 2: keep what the classifier flags.
    learning_replies = run_stage(
        question_classifier.QuestionClassifier(subject=subject, user='학생').prompt,
        JsonOutputParser(),
        question_context,
    )
    learning_hits = utils.extract_True_indices(learning_replies)
    learning_context = utils.get_question_context_v2(df, learning_hits, 'student', 5)

    # Stage 3: keep what the concretizing prompt flags.
    concretizing_replies = run_stage(
        student_concretizing.concretizing(subject).prompt,
        JsonOutputParser(),
        learning_context,
    )
    concretizing_hits = utils.extract_True_indices(concretizing_replies)

    state['student_context'] = utils.get_question_context_v2(df, concretizing_hits, 'student', 5)
    return state

def save_results(state: ConversationState) -> ConversationState:
    """Write the teacher and student contexts to JSON files in the working directory.

    Two files named after ``room_id`` are written, teacher first, using the
    "utf-8-sig" codec (the output therefore starts with a UTF-8 BOM).

    Args:
        state: Graph state; reads ``room_id``, ``teacher_context`` and ``student_context``.

    Returns:
        The state object, unchanged.
    """
    targets = (
        (f"{state['room_id']}_선생님.txt", 'teacher_context'),
        (f"{state['room_id']}_학생.txt", 'student_context'),
    )
    for path, context_key in targets:
        with open(path, "w", encoding="utf-8-sig") as file:
            json.dump(state[context_key], file, ensure_ascii=False, indent=4)

    return state

def create_conversation_graph() -> Graph:
    """Assemble and compile the linear analysis workflow.

    The nodes run strictly in the order listed below, from ``process_raw_data``
    (entry point) to ``save_results`` (finish point).

    Returns:
        The result of ``StateGraph.compile()``.
        NOTE(review): the annotation says ``Graph``; confirm that it matches
        the type returned by ``compile()``.
    """
    steps = (
        ("process_raw_data", process_raw_data),
        ("create_dataframe", create_dataframe),
        ("prepare_teacher_student_df", prepare_teacher_student_df),
        ("analyze_teacher_questions", analyze_teacher_questions),
        ("analyze_student_questions", analyze_student_questions),
        ("save_results", save_results),
    )
    workflow = StateGraph(ConversationState)

    # Register every node first, then chain each one to its successor.
    for node_name, node_fn in steps:
        workflow.add_node(node_name, node_fn)

    node_names = [node_name for node_name, _ in steps]
    for upstream, downstream in zip(node_names, node_names[1:]):
        workflow.add_edge(upstream, downstream)

    workflow.set_entry_point(node_names[0])
    workflow.set_finish_point(node_names[-1])

    return workflow.compile()

def main(subject: str, room_id: str):
    """Run the whole analysis graph for one lesson.

    Builds the initial state (only ``subject``, ``room_id`` and ``llm`` are
    populated), compiles the graph and invokes it. The final state returned by
    the graph is discarded; the results are the files written by ``save_results``.

    Args:
        subject: Subject label; first component of the storage prefix.
        room_id: Room identifier; second component of the storage prefix.
    """
    pending_keys = (
        "raw_data", "df", "teacher_df", "student_df",
        "teacher_chunks", "student_chunks",
        "teacher_context", "student_context",
    )
    initial_state: ConversationState = {
        "subject": subject,
        "room_id": room_id,
        # Filled in by the graph nodes as the run progresses.
        **dict.fromkeys(pending_keys, None),
        "llm": setup_llm(),
    }

    create_conversation_graph().invoke(initial_state)


In [None]:

# Run the graph-based pipeline end to end for one example lesson.
subject = "수학"  # Example subject; first component of the storage prefix
room_id = "67514d9c4c8ca68c745c1fdf"  # Example room_id; second component of the storage prefix
main(subject, room_id)