In [1]:
import numpy as np
# !pip install openai
# from openai import ChatCompletion
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from transformers import (
    AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments
)
import datasets
from tqdm import tqdm
import requests
import os
import re

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/kaggle-eedi/sample_submission.csv
/kaggle/input/kaggle-eedi/misconception_mapping.csv
/kaggle/input/kaggle-eedi/train.csv
/kaggle/input/kaggle-eedi/test.csv


In [2]:
try:
    response = requests.get("https://huggingface.co", timeout=5)
    print("Internet access is available!")
    ONLINE = True
except requests.exceptions.RequestException as e:
    print(f"No internet access: {e}")
    ONLINE = False


Internet access is available!


In [3]:
TRAIN = True
SAVE_PRETRAINED = True
# Load the datasets
input_folder = '/kaggle/input/'

data_folder = input_folder + '/kaggle-eedi'# '/content/drive/MyDrive/eedi-mining-misconceptions-in-mathematics'
train_df = pd.read_csv(f'{data_folder}/train.csv')
test_df = pd.read_csv(f'{data_folder}/test.csv')
misconception_mapping = pd.read_csv(f'{data_folder}/misconception_mapping.csv')
# Create a dictionary to map MisconceptionId to its name
misconception_dict = dict(zip(
    misconception_mapping["MisconceptionId"], 
    misconception_mapping["MisconceptionName"]
))

In [4]:
# All Known Misconceptions:
#     {misconception_dict}
        
# Function to create a prompt for a single question
def generate_input_prompt(row):
    question_text = row["QuestionText"]
    options = [
        f"A: {row['AnswerAText']}",
        f"B: {row['AnswerBText']}",
        f"C: {row['AnswerCText']}",
        f"D: {row['AnswerDText']} (Correct)"
    ]
    # Match misconceptions to distractors in a multiple-choice Diagnostic Question.
    prompt = f"""
        Question:
        {question_text}
        
        Options:
        {options[0]}
        {options[1]}
        {options[2]}
        {options[3]}
        
        Correct Answer:
        {row['CorrectAnswer']}
        
        Task: Predict the most likely misconception IDs for each distractor (B, C, D) and rank them by probability.
        
        Output format:
        A - MisconceptionId - MisconceptionName, if it is not the correct answer
        B - MisconceptionId - MisconceptionName, if it is not the correct answer
        C - MisconceptionId - MisconceptionName, if it is not the correct answer
        D - MisconceptionId - MisconceptionName, if it is not the correct answer
    """
    return prompt


def generate_output_prompt(row):
    prompt = """"""
    for ans in list(set(['A', 'B', 'C', 'D'])-set(row['CorrectAnswer'])):
        misconc_id = row[f'Misconception{ans}Id']
        # print("misconc_id = ", misconc_id)
        try:
            prompt += f"""{ans} - {misconc_id} - {misconception_dict[misconc_id]}
                """
        except:
            pass
        # else:
        #     prompt += f"""{ans} - Unavailable
        #         """
        
    return prompt

tqdm.pandas()
train_df['input'] = train_df.progress_apply(generate_input_prompt, axis=1)
train_df['output'] = train_df.progress_apply(generate_output_prompt, axis=1)


100%|██████████| 1869/1869 [00:00<00:00, 47355.05it/s]
100%|██████████| 1869/1869 [00:00<00:00, 54941.05it/s]


In [None]:
model_name = "EleutherAI/gpt-neo-125M"
# Load and tokenize the dataset
# train_data = datasets.load_dataset("csv", data_files=f'{data_folder}/train.csv')["train"]
train_data = datasets.Dataset.from_pandas(train_df)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Assign the existing `eos_token` as the `pad_token`
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    return tokenizer(
        examples["input"], 
        text_target=examples["output"], 
        max_length=512, 
        padding="max_length",
        truncation=True
    )

tokenized_data = train_data.map(preprocess_function, batched=True)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    per_device_train_batch_size=8,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Map:   0%|          | 0/1869 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


In [None]:
# # Function to predict misconceptions using the OpenAI API
# def predict_misconceptions(test_df, misconception_dict):
#     predictions = []
    
#     for _, row in test_df.iterrows():
#         prompt = generate_prompt(row, misconception_dict)
        
#         try:
#             # Call OpenAI API
#             response = ChatCompletion.create(
#                 model="gpt-4",
#                 messages=[{"role": "user", "content": prompt}]
#             )
#             output = response["choices"][0]["message"]["content"].strip()
#             predictions.append((row["QuestionId"], output))
        
#         except Exception as e:
#             print(f"Error processing QuestionId {row['QuestionId']}: {e}")
#             predictions.append((row["QuestionId"], "Error"))
    
#     return predictions

# # Save predictions to the submission format
# def save_submission(predictions, filename="submission.csv"):
#     submission_df = pd.DataFrame(predictions, columns=["QuestionId_Answer", "MisconceptionId"])
#     submission_df.to_csv(filename, index=False)
#     print(f"Submission saved to {filename}")

# # Main logic
# # if __name__ == "__main__":
# # Generate predictions
# predictions = predict_misconceptions(test_df, misconception_dict)

# # Save to submission file
# save_submission(predictions)
