In [1]:
from models import *
from utils import *

In [31]:
# Paths handed to process_form as its second argument, one per image set.
student_img_path = "../data/student_images"
instructor_img_path = "../data/instructor_images"

# Prompt file passed as the first argument to both process_form calls below.
prompt_path = "../prompts/csv_promt.txt"
# NOTE(review): csv_output is not referenced by any visible cell; the following
# cells pass the literal "out.csv" instead — confirm which file name is intended.
csv_output = "output.csv"

# Run the same prompt over each image set. process_form comes from a star
# import (models/utils), so its return type is not visible in this notebook.
student_response = process_form(prompt_path, student_img_path)
instructor_response = process_form(prompt_path, instructor_img_path)

In [32]:
# Hands the student results to dict_to_csv together with the file name "out.csv"
# (helper comes from a star import; presumably it writes that CSV — verify).
# NOTE(review): the config cell defines csv_output = "output.csv", which is not used here.
dict_to_csv(student_response, "out.csv")

In [33]:
# Hands the instructor results and the same "out.csv" name used in the
# dict_to_csv cell to add_instruct_answer_to_csv.
# presumably this adds the instructor answers to the existing file — verify against the helper.
add_instruct_answer_to_csv(instructor_response, "out.csv")

In [7]:
def _verdict_is_correct(reply):
    """Return True only when `reply` carries an unambiguous '(correct:1)' marker."""
    if not isinstance(reply, str):
        # A missing or non-text completion is treated like "unsure": not correct.
        return False
    # Ignore whitespace and letter case so variants such as "(Correct: 1)" match.
    compact = "".join(reply.split()).lower()
    # A reply that echoes both markers is ambiguous, so it is not accepted.
    return "(correct:1)" in compact and "(correct:0)" not in compact


def evaluate_question(question, student_answer, instructor_answer,
                      router_model="gpt-3.5-turbo-0125",
                      reviewer_model="gpt-4-1106-preview"):
    """Grade a student's answer with a two-stage model cascade.

    The router model is asked for a '(correct:1)' / '(correct:0)' verdict.
    Only when that verdict is not a clear '(correct:1)' is the reviewer model
    asked to explain why the answer is wrong.

    Args:
        question: Text of the question being graded.
        student_answer: The student's answer, inserted verbatim into the prompts.
        instructor_answer: The reference answer, inserted verbatim into the prompts.
        router_model: Model name passed to TextModel for the first pass.
        reviewer_model: Model name passed to TextModel for the explanation pass.

    Returns:
        The string "Correct answer" when the router marks the answer correct,
        otherwise whatever the reviewer model's `complete` call returns.
    """
    # Adjacent f-strings are used instead of a backslash continuation: the
    # continuation embedded the next line's indentation into the prompt text.
    router_prompt = (
        f"Given the question: '{question}', evaluate if the student's answer: '{student_answer}' "
        f"is 100% correct against the instructor's answer: '{instructor_answer}'. "
        f"Return '(correct:1)' if the student's answer is fully correct, or '(correct:0)' "
        f"if the answer is incorrect or if you are unsure."
    )
    first_evaluation = TextModel(model_name=router_model).complete(prompt=router_prompt, role="user")
    if _verdict_is_correct(first_evaluation):
        return "Correct answer"

    # Second pass only runs for answers the router did not clearly accept.
    cot_prompt = (
        f"Given the question: '{question}', evaluate step by step and succinctly answer why the student's answer: '{student_answer}' "
        f"is partially or completely wrong against the instructor's answer: '{instructor_answer}'. "
    )
    return TextModel(model_name=reviewer_model).complete(cot_prompt)

In [6]:
# Scratch cell: constructs a TextModel so the notebook displays its repr;
# the instance is not bound to any name.
TextModel(model_name="gpt-4-1106-preview")

<models.TextModel at 0x117742170>

In [5]:
import PyPDF2
def extract_questions(
    pdf_path,
    keywords=('Solve:', 'Evaluate:', 'Simplify:', 'Use the', 'Find an',
              'Graph the', 'Expand:'),
):
    """Collect the lines of a PDF that look like questions.

    A line is kept when, after stripping, it ends with '?' or when the raw
    line contains any of `keywords`.

    Args:
        pdf_path: Path of the PDF file, opened in binary mode.
        keywords: Substrings that mark a line as a question. The default
            reproduces the markers this notebook originally hard-coded.

    Returns:
        A list of the matching lines, stripped, in page and line order.
    """
    questions = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # A page with no extractable text may yield None or ''; treat both as empty.
            text = page.extract_text() or ''
            for line in text.split('\n'):
                stripped = line.strip()
                if stripped.endswith('?') or any(keyword in line for keyword in keywords):
                    questions.append(stripped)
    return questions

# Path to the PDF file (relative to the notebook's working directory)
pdf_path = 'Questions.pdf'  # Change this to the actual path of your PDF file
# `questions` holds the stripped lines extract_questions returned; a later
# cell builds questions_df from it.
questions = extract_questions(pdf_path)

In [6]:
from openai import OpenAI
import pandas as pd

# Initialize the OpenAI client
# NOTE(review): OpenAI() is called with no arguments, so no credentials appear in
# this notebook — presumably they come from the environment (e.g. OPENAI_API_KEY); confirm.
client = OpenAI()

# Function to get embeddings
def get_embedding(text, model="text-embedding-3-small", api_client=None):
    """Return the embedding vector for `text`.

    Args:
        text: String to embed; newlines are replaced with spaces before sending.
        model: Embedding model name passed to the embeddings endpoint.
        api_client: Optional client object exposing `embeddings.create`. When
            omitted, the notebook-level `client` is used.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    # Reuse the notebook-level client instead of constructing a fresh OpenAI()
    # on every call, which the per-row .apply() over questions would multiply.
    active_client = client if api_client is None else api_client
    cleaned = text.replace("\n", " ")
    response = active_client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding

# Assuming 'questions' is your list of question strings
questions_df = pd.DataFrame(questions, columns=['question'])

# Apply the embedding function to each question (one get_embedding call per row).
# NOTE(review): top_k_matched_questions reads 'embedded_questions.csv', but no
# visible cell writes questions_df to that file — confirm where it is produced.
questions_df['embedding'] = questions_df['question'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))


In [7]:
import pandas as pd
import numpy as np

def read_embeddings(csv_path):
    """Load a CSV and decode its 'embedding' column into NumPy arrays.

    Each cell of that column is parsed as comma-separated numbers after any
    leading/trailing square brackets are stripped.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with 'embedding' replaced by the parsed arrays.
    """
    def _decode(cell):
        # "[0.1, 0.2]" -> "0.1, 0.2" -> array([0.1, 0.2])
        return np.fromstring(cell.strip("[]"), sep=',')

    frame = pd.read_csv(csv_path)
    frame['embedding'] = frame['embedding'].apply(_decode)
    return frame

# Function to manually compute cosine similarity
def cosine_similarity_manual(vec1, vec2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector (array-like of numbers).

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector has
        zero norm.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # The similarity is undefined for a zero vector. Returning 0.0 avoids a
        # NaN, which argsort places last and a "take the tail" ranking would
        # then treat as the best match.
        return 0.0
    return dot_product / norm_product

# Function to compute cosine similarity and return top k matches
def top_k_matched_questions(query, k=5, csv_path='embedded_questions.csv'):
    """Return the `k` stored questions most similar to `query`.

    Args:
        query: Text to embed and compare against the stored questions.
        k: Maximum number of questions to return; values <= 0 give an empty list.
        csv_path: CSV file with 'question' and 'embedding' columns, loaded
            through read_embeddings.

    Returns:
        A list of question strings ordered from most to least similar.
    """
    if k <= 0:
        # `argsort()[-0:]` selects the whole array, so without this guard
        # k=0 would return every question instead of none.
        return []

    questions_df = read_embeddings(csv_path)

    # Get embedding for the query
    query_embedding = get_embedding(query)

    # Compute one similarity score per stored question
    similarities = np.array([
        cosine_similarity_manual(query_embedding, embedding)
        for embedding in questions_df['embedding']
    ])
    # argsort places NaN last, which the tail slice below would rank first;
    # push undefined scores to the bottom instead.
    similarities = np.where(np.isnan(similarities), -np.inf, similarities)

    # Indices of the k highest scores, best first
    top_k_indices = similarities.argsort()[-k:][::-1]

    # Return the top k matched questions
    return questions_df.iloc[top_k_indices]['question'].tolist()

In [8]:
# Example query; with the default k=5 the cell displays the five returned question strings.
top_k_matched_questions("Simplify the expression 2x^2 - 8 / x - 2")

['7. Simplify: 8y -2-3(y-4)',
 '6. Use the distributive property to simplify. -3(x-10)+x',
 '3. Simplify: 6 – 2* 2 + (2^5)',
 '13. Solve: 3(x−5)  < x−8',
 '2. Simplify: 3+4*5 -6']