In [24]:
import os
# Disable parallelism inside the Hugging Face `tokenizers` library.
# The variable name must be spelled exactly "TOKENIZERS_PARALLELISM"; a misspelled
# key is silently ignored. NOTE(review): this should run before the tokenizer is
# first used, so keep this cell at the top of the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import re

# Load the tokenizer and encoder for the checkpoint named below.
# `tokenizer` and `model` are module-level globals that get_embeddings() reads
# directly. A later cell in this notebook rebinds both names to a different
# checkpoint, so which model get_embeddings() uses depends on cell execution order.
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # You can choose a different model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example function to preprocess text (clean up text using regex)
def preprocess_text(text):
    """Lower-case `text` and delete every character that is not a letter, digit or whitespace.

    Whitespace is left untouched (no collapsing or stripping).

    NOTE: a later cell in this notebook redefines `preprocess_text` with a
    pattern that also removes digits; once that cell has run, this definition
    is shadowed.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # Single expression: normalise case first, then filter characters.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

# Function to get embeddings from the model
def get_embeddings(texts):
    """Return one mean-pooled sentence embedding per input text.

    Uses the module-level `tokenizer` and `model`. Texts are tokenized as a
    single padded batch, run through the model without gradients, and the
    token states are averaged into one vector per text.

    The average is weighted by the attention mask so that padding tokens do
    not contribute. Without the mask, a short text batched with a longer one
    would have its embedding diluted by pad positions, making the result
    depend on what else is in the batch.

    Args:
        texts: A string or list of strings accepted by the tokenizer.

    Returns:
        A float32 numpy array with one row per input text.
    """
    # Tokenize input texts (batch of texts)
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Move model and inputs to the same device (GPU when available, else CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass only; no gradients are needed for inference
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask supplied by the tokenizer: fall back to a plain average.
        embeddings = token_states.mean(dim=1)
    else:
        # Zero out padded positions, then divide by the number of real tokens.
        weights = attention_mask.unsqueeze(-1).to(token_states.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        embeddings = (token_states * weights).sum(dim=1) / token_counts

    # numpy has no bfloat16 dtype, so cast to float32 before converting
    embeddings = embeddings.to(torch.float32)
    return embeddings.cpu().numpy()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [3]:
import pandas as pd

# Read the four competition CSV files into DataFrames.
train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
sample_submission = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv")

# Preview the first rows of each frame. One print call with a newline separator
# emits each heading followed by its preview, with a blank line before every
# heading except the first.
print(
    "Train DataFrame:",
    train.head(),
    "\nTest DataFrame:",
    test.head(),
    "\nMisconception Mapping DataFrame:",
    misconception_mapping.head(),
    "\nSample Submission DataFrame:",
    sample_submission.head(),
    sep="\n",
)

Train DataFrame:
   QuestionId  ConstructId                                      ConstructName  \
0           0          856  Use the order of operations to carry out calcu...   
1           1         1612  Simplify an algebraic fraction by factorising ...   
2           2         2774            Calculate the range from a list of data   
3           3         2377  Recall and use the intersecting diagonals prop...   
4           4         3387  Substitute positive integer values into formul...   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   
3         88                       Properties of Quadrilaterals             C   
4         67                          Substitution into Formula             A   

         

In [4]:
import pandas as pd
import re

def preprocess_text(text):
    """Normalise text to lower-case ASCII letters separated by single spaces.

    Steps: lower-case, delete every character that is not a letter or
    whitespace (digits and punctuation are removed), then collapse runs of
    whitespace and strip the ends.

    Non-string values (e.g. None or NaN from a missing cell) are returned
    unchanged instead of raising AttributeError, so that a downstream
    `dropna()` can still discard them.

    Args:
        text: The value to clean; normally a string.

    Returns:
        The cleaned string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Missing values flow through untouched; there is nothing to clean.
        return text
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    return ' '.join(text.split())

def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `all_question_text` column built from construct name and question text.

    The two source columns are joined with a single space and the result is
    passed through `preprocess_text`. The frame is modified in place and the
    same object is returned.

    Args:
        df: Frame containing `ConstructName` and `QuestionText` columns.

    Returns:
        `df`, now carrying the `all_question_text` column.
    """
    joined = df["ConstructName"] + " " + df["QuestionText"]
    df["all_question_text"] = joined.apply(preprocess_text)
    return df

# Add the cleaned `all_question_text` column to the test DataFrame.
# make_all_question_text() modifies the frame it receives and returns that same
# object, so the reassignment keeps `test` bound to the updated frame.
test = make_all_question_text(test)

# Display the first rows of the modified test DataFrame as a sanity check
print(test.head())

   QuestionId  ConstructId                                      ConstructName  \
0        1869          856  Use the order of operations to carry out calcu...   
1        1870         1612  Simplify an algebraic fraction by factorising ...   
2        1871         2774            Calculate the range from a list of data   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   

                                        QuestionText            AnswerAText  \
0  \[\n3 \times 2+4-5\n\]\nWhere do the brackets ...  \( 3 \times(2+4)-5 \)   
1  Simplify the following, if possible: \( \frac{...              \( m+1 \)   
2  Tom and Katie are discussing the \( 5 \) plant...              Only\nTom   

              AnswerBText        

In [5]:
import pandas as pd

def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the four answer-text columns into one row per answer option.

    Only the identifier columns and the four `Answer?Text` columns are kept.
    Each input row becomes four output rows: the name of the originating
    answer column goes into `Answer` and its cell content into `value`.

    Args:
        df: Wide frame with `QuestionId`, `all_question_text`, `CorrectAnswer`
            and `AnswerAText` .. `AnswerDText` columns.

    Returns:
        A new long-format DataFrame with columns
        `QuestionId`, `all_question_text`, `CorrectAnswer`, `Answer`, `value`.
    """
    key_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    # Restrict to the relevant columns first so that melt unpivots exactly the
    # four answer columns (everything that is not an id column).
    selected = df[key_columns + answer_columns]
    return selected.melt(id_vars=key_columns, var_name='Answer', value_name='value')

# Convert the test DataFrame to long format: one row per question and answer
# option. Requires the `all_question_text` column added by the previous cell.
test_long = wide_to_long(test)

# Display the first few rows of the long format DataFrame
print(test_long.head())

   QuestionId                                  all_question_text  \
0        1869  use the order of operations to carry out calcu...   
1        1870  simplify an algebraic fraction by factorising ...   
2        1871  calculate the range from a list of data tom an...   
3        1869  use the order of operations to carry out calcu...   
4        1870  simplify an algebraic fraction by factorising ...   

  CorrectAnswer       Answer                   value  
0             A  AnswerAText   \( 3 \times(2+4)-5 \)  
1             D  AnswerAText               \( m+1 \)  
2             B  AnswerAText               Only\nTom  
3             A  AnswerBText  \( 3 \times 2+(4-5) \)  
4             D  AnswerBText               \( m+2 \)  


In [6]:
import pandas as pd

def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `all_text` column joining question text with the cleaned answer text.

    `all_question_text` is used as-is when present; `value` is passed through
    `preprocess_text` when present. For each row the available, non-missing
    parts are joined with a single space. The frame is modified in place and
    the same object is returned.

    Args:
        df: Frame with an `all_question_text` and/or a `value` column.

    Returns:
        `df`, now carrying the `all_text` column.
    """
    def _join_present(row):
        # Skip missing parts, stringify the rest, and glue them with spaces.
        return ' '.join(row.dropna().astype(str))

    pieces = []
    available = df.columns
    if "all_question_text" in available:
        pieces.append(df["all_question_text"])
    if "value" in available:
        cleaned_answers = df["value"].apply(preprocess_text)
        pieces.append(cleaned_answers)

    # Place the parts side by side so each row can be joined independently.
    side_by_side = pd.concat(pieces, axis=1)
    df["all_text"] = side_by_side.apply(_join_present, axis=1)
    return df

# Add the combined `all_text` column (question text + cleaned answer text).
# make_all_text() modifies the frame in place and returns the same object.
test_long = make_all_text(test_long)

# Sort by 'QuestionId' and 'Answer', then reset the index to 0..n-1.
# Row order matters: later cells pair rows of test_long with rows of the
# embedding / similarity arrays purely by position.
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)

# Display the first few rows of the resulting DataFrame
print(test_long.head())

   QuestionId                                  all_question_text  \
0        1869  use the order of operations to carry out calcu...   
1        1869  use the order of operations to carry out calcu...   
2        1869  use the order of operations to carry out calcu...   
3        1869  use the order of operations to carry out calcu...   
4        1870  simplify an algebraic fraction by factorising ...   

  CorrectAnswer       Answer                   value  \
0             A  AnswerAText   \( 3 \times(2+4)-5 \)   
1             A  AnswerBText  \( 3 \times 2+(4-5) \)   
2             A  AnswerCText   \( 3 \times(2+4-5) \)   
3             A  AnswerDText  Does not need brackets   
4             D  AnswerAText               \( m+1 \)   

                                            all_text  
0  use the order of operations to carry out calcu...  
1  use the order of operations to carry out calcu...  
2  use the order of operations to carry out calcu...  
3  use the order of operations to 

In [7]:
# Sort by 'QuestionId' and 'Answer', then reset the index.
# NOTE(review): this repeats the sort already performed in the previous cell;
# the frame is already in this order at that point, so this looks redundant.
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)

# Display the sorted DataFrame (prints the whole frame, not just the head)
print(test_long)

    QuestionId                                  all_question_text  \
0         1869  use the order of operations to carry out calcu...   
1         1869  use the order of operations to carry out calcu...   
2         1869  use the order of operations to carry out calcu...   
3         1869  use the order of operations to carry out calcu...   
4         1870  simplify an algebraic fraction by factorising ...   
5         1870  simplify an algebraic fraction by factorising ...   
6         1870  simplify an algebraic fraction by factorising ...   
7         1870  simplify an algebraic fraction by factorising ...   
8         1871  calculate the range from a list of data tom an...   
9         1871  calculate the range from a list of data tom an...   
10        1871  calculate the range from a list of data tom an...   
11        1871  calculate the range from a list of data tom an...   

   CorrectAnswer       Answer                   value  \
0              A  AnswerAText   \( 3 \times(2

In [8]:
# Spot-check one combined text: positional row 3 of the sorted frame
# (0-based, so the fourth row).
all_text_value = test_long["all_text"].iloc[3]

# Display the full, untruncated string
print(all_text_value)

use the order of operations to carry out calculations involving powers times where do the brackets need to go to make the answer equal does not need brackets


In [9]:
# All misconception names as a numpy array, in the row order of misconception_mapping.
# NOTE(review): `labels` is not referenced by any later cell visible here — confirm it is still needed.
labels = misconception_mapping.loc[:, 'MisconceptionName'].values

In [10]:
from transformers import AutoTokenizer, AutoModel
import torch

# Pick the first CUDA device when available, otherwise run on the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer from a local directory.
# This rebinds the global names `tokenizer` and `model` that were first bound
# in the opening cell, so any function reading those globals (get_embeddings)
# uses this checkpoint from here on.
model_path = '/kaggle/input/bge-small-en-v1.5/transformers/bge/2'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Set the model to evaluation mode and move it to the device
model.eval()
model.to(device)

print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


In [12]:
import numpy as np
import torch

def _resolve_max_length(model, tokenizer, default=1024):
    """Return `default` capped by any positive length limit the tokenizer or model config advertises."""
    limits = [default]
    advertised = (
        getattr(tokenizer, "model_max_length", None),
        getattr(getattr(model, "config", None), "max_position_embeddings", None),
    )
    for limit in advertised:
        if isinstance(limit, int) and not isinstance(limit, bool) and limit > 0:
            limits.append(limit)
    return min(limits)


def generate_embeddings(texts, model, tokenizer, device, batch_size=8, max_length=None):
    """Encode texts into L2-normalised first-token (CLS) embeddings, batch by batch.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...). It is
            materialised into a list so that batches are always plain lists.
        model: Encoder called as `model(**inputs)`; its result must expose
            `last_hidden_state`.
        tokenizer: Tokenizer called on a list of strings; its result must
            support `.to(device)` and `**` unpacking.
        device: Device the tokenized inputs are moved to. The model is
            expected to already live there.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length in tokens. When None, 1024 is used,
            capped by `tokenizer.model_max_length` and
            `model.config.max_position_embeddings` when those are available,
            so truncated sequences never exceed what the encoder advertises.

    Returns:
        A float32 numpy array with one unit-length row per input text. For
        empty input the array has zero rows and `model.config.hidden_size`
        columns (0 columns if that attribute is unavailable).
    """
    texts = list(texts)
    if max_length is None:
        max_length = _resolve_max_length(model, tokenizer)

    if not texts:
        # np.concatenate rejects an empty list, so return an empty matrix directly.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        width = hidden_size if isinstance(hidden_size, int) and hidden_size > 0 else 0
        return np.empty((0, width), dtype=np.float32)

    all_embeddings = []

    # Process texts in batches
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize the batch, padding to the longest member and truncating to max_length
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Take the first token's state as the sentence vector; cast to float32
        # because numpy cannot represent bfloat16
        embeddings = outputs.last_hidden_state[:, 0, :].to(torch.float32)

        # Scale every row to unit L2 norm
        normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        all_embeddings.append(normalized_embeddings.cpu().numpy())

    # Stack the per-batch arrays into a single matrix
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for test texts: one row per row of test_long, in the same order
test_texts = test_long['all_text'].tolist()  # Convert to list for processing
all_text_vector = generate_embeddings(test_texts, model, tokenizer, device)

# Generate embeddings for misconception names: one row per row of
# misconception_mapping, in the same order
misconception_names = misconception_mapping['MisconceptionName'].tolist()  # Convert to list
all_ctx_vector = generate_embeddings(misconception_names, model, tokenizer, device)

In [13]:
# Cosine similarity between every test text (rows) and every misconception
# name (columns)
test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)

# For each test text, order the misconception column positions from most to
# least similar (negating the scores turns argsort's ascending order into descending)
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [18]:
# Keep the 25 best-ranked misconception positions for each row.
# NOTE(review): the next cell slices test_sorted_indices[:, :25] again instead
# of using this variable — confirm whether it is still needed.
top_25_sorted_indices = test_sorted_indices[:, :25]

In [19]:
# Rows of test_sorted_indices are matched to rows of test_long purely by
# position. This cell filters test_long at the end, so running it a second time
# would silently attach the wrong rankings to the remaining rows; fail loudly instead.
if len(test_long) != test_sorted_indices.shape[0]:
    raise ValueError(
        "test_long has %d rows but test_sorted_indices has %d; "
        "re-run the notebook from the cell that builds test_long."
        % (len(test_long), test_sorted_indices.shape[0])
    )

# Extract the answer letter from the melted column name, e.g. "AnswerBText" -> "B"
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')

# Create a unique identifier for each QuestionId and Answer, e.g. "1869_B"
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype(str) + "_" + test_long["Answer_alphabet"]

# The ranked values are row positions within misconception_mapping (the order in
# which the misconception names were embedded). Translate them to the real
# MisconceptionId values when that column is available, so the output stays
# correct even if ids are not simply 0..n-1 in file order; otherwise fall back
# to the positions themselves.
top_positions = test_sorted_indices[:, :25]
if "MisconceptionId" in misconception_mapping.columns:
    top_misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()[top_positions]
else:
    top_misconception_ids = top_positions

# One list of 25 ids per row of test_long
misconception_ids = top_misconception_ids.tolist()
misconception_ids_flat = misconception_ids  # same content; name kept for compatibility

# Store the ranking as a space-separated string of ids
test_long["MisconceptionId"] = [' '.join(map(str, ids)) for ids in misconception_ids_flat]

# Drop the rows for the correct answer; only wrong answers carry a misconception
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]

# Prepare the submission DataFrame with relevant columns
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

# Save the submission DataFrame to a CSV file
submission.to_csv('/kaggle/working/submission.csv', index=False)