In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from transformers import (
    GPT2Tokenizer, GPT2Model, BertTokenizer, BertModel,TrainingArguments,
    AutoTokenizer, AutoModelForSequenceClassification, Trainer,
    DistilBertTokenizer, DistilBertModel
)
# from pycaret.classification import *
# !pip install -U sentence-transformers
# from sentence_transformers import SentenceTransformer, InputExample, losses
# !pip install datasets==2.6.1
# from datasets import Dataset
from tqdm import tqdm
import requests
import os


In [2]:
try:
    response = requests.get("https://huggingface.co", timeout=5)
    print("Internet access is available!")
    ONLINE = True
except requests.exceptions.RequestException as e:
    print(f"No internet access: {e}")
    ONLINE = False


Internet access is available!


In [3]:
TRAIN = True
# Load the datasets
data_folder = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'# '/content/drive/MyDrive/eedi-mining-misconceptions-in-mathematics'
train_df = pd.read_csv(f'{data_folder}/train.csv')
test_df = pd.read_csv(f'{data_folder}/test.csv')
indicator_mapping = pd.read_csv(f'{data_folder}/misconception_mapping.csv')

In [4]:
def preprocess_dataframe(df, misconception=False):
  id_vars = ["QuestionText", 'QuestionId']
  var_name = 'Answer'
  answer_melted = df.melt(
      id_vars=id_vars,
      value_vars=[f'Answer{let}Text' for let in ['A', 'B', 'C', 'D']],
      value_name='AnswerText',
      var_name=var_name
  )
  for strr in ['Answer', 'Text']:
    answer_melted[var_name] = answer_melted[var_name].str.replace(strr, '')

  if misconception:
    misconception_melted = df.melt(
        id_vars=id_vars, value_vars=[f'Misconception{let}Id' for let in ['A', 'B', 'C', 'D']],
        value_name='MisconceptionId', var_name=var_name
    )
    for strr in ['Misconception', 'Id']:
      misconception_melted[var_name] = misconception_melted[var_name].str.replace(strr, '')

    df = pd.merge(answer_melted, misconception_melted, on=['QuestionText', var_name])
  else:
    df = answer_melted
  df['Q&A'] = df['QuestionText'] + ' ' + df['AnswerText']

  if misconception:
    df = pd.merge(df, indicator_mapping, on='MisconceptionId')

  df['label'] = 1 # 1 if the texts are related, 0 otherwise
  df = df.dropna(subset=['Q&A'])

  return df

In [5]:
# Define Dataset
class TextPairDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.qa_texts = dataframe['Q&A'].tolist()
        self.misconception_texts = dataframe['MisconceptionName'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.qa_texts)

    def __getitem__(self, idx):
        # Handle both single and batch indices
        if isinstance(idx, int):
            idx = [idx]  # Convert single index to list for uniform processing
    
        # Process batch
        qa_texts = [self.qa_texts[i] for i in idx]
        misconception_texts = [self.misconception_texts[i] for i in idx]
    
        # Tokenize batch of texts
        qa_encodings = self.tokenizer(qa_texts, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        misconception_encodings = self.tokenizer(misconception_texts, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
    
        # Return as a dictionary to avoid BatchEncoding issues
        return {
            "qa_input_ids": qa_encodings["input_ids"],
            "qa_attention_mask": qa_encodings["attention_mask"],
            "misconception_input_ids": misconception_encodings["input_ids"],
            "misconception_attention_mask": misconception_encodings["attention_mask"],
        }

# Define Contrastive Model
class ContrastiveLearningModel(nn.Module):
    def __init__(self, model_name):
        super(ContrastiveLearningModel, self).__init__()
        self.bert_model = DistilBertModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert_model.config.hidden_size, 128)  # Project to a smaller dimension

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        return self.fc(cls_output)

# Define Contrastive Loss
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, embedding_a, embedding_b, label):
        # Label: 1 for similar, 0 for dissimilar
        cosine_similarity = nn.functional.cosine_similarity(embedding_a, embedding_b)
        loss = label * (1 - cosine_similarity) + (1 - label) * torch.clamp(cosine_similarity - self.margin, min=0)
        return loss.mean()

df = preprocess_dataframe(train_df, misconception=True)

# dataset_train = Dataset.from_pandas(df[:len(df)-200])
# dataset_eval = Dataset.from_pandas(df[len(df)-200:])
pre_trained_model_name = "distilbert-base-uncased"
local_pre_trained_name = f"./{pre_trained_model_name}"

# tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)
tokenizer = DistilBertTokenizer.from_pretrained(pre_trained_model_name)

# if SAVE_PRETRAINED:
#     tokenizer.save_pretrained(local_pre_trained_name)
#     model.save_pretrained(local_pre_trained_name)
        
# Prepare DataLoader
# tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True)
# tokenized_dataset_eval = dataset_eval.map(tokenize_function, batched=True)
dataset = TextPairDataset(df, tokenizer)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
print('data loader is ready')

# Initialize Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ContrastiveLearningModel(pre_trained_model_name).to(device)
loss_fn = ContrastiveLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training Loop
def train_contrastive_model(data_loader, model, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        qa_input_ids = batch["qa_input_ids"].squeeze(1).to(device)
        qa_attention_mask = batch["qa_attention_mask"].squeeze(1).to(device)
        misconception_input_ids = batch["misconception_input_ids"].squeeze(1).to(device)
        misconception_attention_mask = batch["misconception_attention_mask"].squeeze(1).to(device)
        
        # # qa_encodings, misconception_encodings = batch
        # input_ids_qa = qa_encodings['input_ids'].squeeze(1).to(device)
        # attention_mask_qa = qa_encodings['attention_mask'].squeeze(1).to(device)
        # input_ids_mis = misconception_encodings['input_ids'].squeeze(1).to(device)
        # attention_mask_mis = misconception_encodings['attention_mask'].squeeze(1).to(device)

        # Forward pass
        embeddings_qa = model(qa_input_ids, qa_attention_mask)
        embeddings_mis = model(misconception_input_ids, misconception_attention_mask)
        # Create labels: 1 for matched pairs
        labels = torch.ones(embeddings_qa.size(0)).to(device)
        loss = loss_fn(embeddings_qa, embeddings_mis, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

if TRAIN:
    # Train Model
    for epoch in range(1):  # Run for 5 epochs
        print(f'epoch {epoch} ...')
        avg_loss = train_contrastive_model(data_loader, model, loss_fn, optimizer, device)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

data loader is ready


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

epoch 0 ...


100%|██████████| 279/279 [01:36<00:00,  2.89it/s]

Epoch 1, Loss: 0.01280074413969285





In [10]:
if ONLINE:
    # Save tokenizer and model
    output_dir = "./fine_tuned_distilbert"
    tokenizer.save_pretrained("./distilbert_tokenizer")
    # Save the model's state_dict
    torch.save(model.state_dict(), './contrastive_learning_model.pth')

    # model.save_pretrained(output_dir)
    # # Load the fine-tuned model and tokenizer
    # tokenizer = AutoTokenizer.from_pretrained(local_pre_trained_name, local_files_only=True)
    # model_name = "/kaggle/working/fine_tuned_model"  # Path to your fine-tuned model
    # model = AutoModelForSequenceClassification.from_pretrained(model_name)
else:
    # Recreate the model instance
    model = ContrastiveLearningModel(pre_trained_model_name).to(device)
    
    # Load the saved state_dict into the model
    model.load_state_dict(torch.load('/kaggle/input/my_fine_tuned/transformers/default/1/contrastive_learning_model.pth'))
    
    # Set the model to evaluation mode (optional, if you're in inference mode)
    model.eval()

    # model_name = "/kaggle/input/my_fine_tuned/transformers/default/1/fine_tuned_distilbert"  # Path to your fine-tuned model
    # tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    # model = ContrastiveLearningModel(model_name).to(device)

    # model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # tokenizer_path = "/kaggle/input/my_distilbert_model/pytorch/default/1/distilbert-base-uncased"
    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    

In [11]:
# Compute similarity with all misconception texts
misconception_embeddings = []
misconception_texts = indicator_mapping['MisconceptionName'].tolist()
miconception_ids = indicator_mapping['MisconceptionId'].tolist()

for text in tqdm(misconception_texts):
    encodings = tokenizer(text, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    misconception_embeddings.append(model(input_ids, attention_mask).detach())


100%|██████████| 2587/2587 [00:18<00:00, 141.33it/s]


In [12]:
# Inference: Find Top 25 Matches
def find_top_matches(test_qa, model, tokenizer, device, top_k=25):
    # Tokenize and encode the test Q&A
    test_encodings = tokenizer(test_qa, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
    input_ids = test_encodings['input_ids'].to(device)
    attention_mask = test_encodings['attention_mask'].to(device)
    
    test_embedding = model(input_ids, attention_mask).detach()

    
    similarities = [nn.functional.cosine_similarity(test_embedding, embedding, dim=1).item() for embedding in misconception_embeddings]
    sorted_indices = torch.argsort(torch.tensor(similarities), descending=True)
    
    return [miconception_ids[i] for i in sorted_indices[:top_k]]


# Test with a new Q&A
test_qa = "Sample question and answer text here."
top_matches = find_top_matches(test_qa, model, tokenizer, device)
print("Top Matches:", top_matches)


Top Matches: [114, 2136, 2409, 261, 791, 2182, 793, 329, 2106, 2475, 930, 388, 1544, 1619, 955, 2284, 1432, 2187, 824, 2406, 2162, 285, 1765, 2112, 2130]


In [17]:
# Inference loop
# prepare test data
df_test_prep = preprocess_dataframe(test_df, misconception=False)

tqdm.pandas()
# Append the predictions to the original DataFrame
df_test_prep['MisconceptionId'] = df_test_prep.progress_apply(
    lambda x: list(find_top_matches(x['Q&A'], model, tokenizer, device)), axis=1
)

# Print the DataFrame with predictions
# display(df_test_prep)

df_test_prep['QuestionId_Answer'] = df_test_prep['QuestionId'].astype(str) + '_' + df['Answer']
# df_test_prep = df_test_prep.sort_values(by=['QuestionId_Answer', 'prediction'], ascending=[True, False])
# df_out = df_test_prep.groupby(['QuestionId_Answer', 'Answer']).apply(
#     lambda x: x['MisconceptionId'].head(25).tolist()
# ).reset_index().rename(columns={0: 'MisconceptionId'})
df_out = df_test_prep
display(df_out)
df_out[['QuestionId_Answer', 'MisconceptionId']].to_csv('./v11_submission.csv', index=False)


100%|██████████| 12/12 [00:03<00:00,  3.67it/s]


Unnamed: 0,QuestionText,QuestionId,Answer,AnswerText,Q&A,label,MisconceptionId,QuestionId_Answer
0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1869,A,\( 3 \times(2+4)-5 \),\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1,"[94, 303, 839, 429, 937, 480, 994, 519, 1342, ...",1869_A
1,"Simplify the following, if possible: \( \frac{...",1870,A,\( m+1 \),"Simplify the following, if possible: \( \frac{...",1,"[429, 2469, 925, 1360, 811, 2116, 272, 2139, 3...",1870_A
2,Tom and Katie are discussing the \( 5 \) plant...,1871,A,Only\nTom,Tom and Katie are discussing the \( 5 \) plant...,1,"[584, 480, 566, 1413, 2137, 2564, 2251, 2293, ...",1871_A
3,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1869,B,\( 3 \times 2+(4-5) \),\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1,"[429, 2475, 1042, 265, 1759, 1982, 2000, 2221,...",1869_A
4,"Simplify the following, if possible: \( \frac{...",1870,B,\( m+2 \),"Simplify the following, if possible: \( \frac{...",1,"[429, 620, 2174, 1239, 2400, 925, 887, 363, 20...",1870_A
5,Tom and Katie are discussing the \( 5 \) plant...,1871,B,Only\nKatie,Tom and Katie are discussing the \( 5 \) plant...,1,"[2096, 908, 57, 2257, 2385, 1196, 475, 179, 21...",1871_A
6,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1869,C,\( 3 \times(2+4-5) \),\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1,"[736, 429, 2446, 1342, 303, 546, 2335, 711, 16...",1869_A
7,"Simplify the following, if possible: \( \frac{...",1870,C,\( m-1 \),"Simplify the following, if possible: \( \frac{...",1,"[429, 329, 1432, 1991, 578, 1821, 2054, 405, 8...",1870_A
8,Tom and Katie are discussing the \( 5 \) plant...,1871,C,Both Tom and Katie,Tom and Katie are discussing the \( 5 \) plant...,1,"[272, 937, 887, 2068, 1536, 2372, 2096, 2385, ...",1871_A
9,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1869,D,Does not need brackets,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,1,"[1110, 1994, 285, 2068, 1817, 889, 365, 1042, ...",1869_A


In [None]:
print(os.listdir("/kaggle/input/"))
print(os.listdir("/kaggle/input/my_distilbert_model/pytorch/default/1/distilbert-base-uncased"))