In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Install required packages
!pip install -q -U immutabledict sentencepiece
!pip install -U transformers
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension


import os
import shutil
import torch
import contextlib
import pandas as pd
import numpy as np

# Paths for model storage and repository cloning
working_dir = '/kaggle/working'
submission_dir = '/kaggle/working/submission/lib/gemma'
repo_url = 'https://github.com/google/gemma_pytorch.git'
weights_dir = '/kaggle/input/gemma/pytorch/2b-it/2/'  # Adjust according to weight locations

# Remove existing directories if they exist
for path in [f'{working_dir}/gemma_pytorch', submission_dir]:
    if os.path.exists(path):
        shutil.rmtree(path)

# Clone the repository
!git clone {repo_url} > /dev/null

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Create necessary directories and move files
os.makedirs(submission_dir, exist_ok=True)
shutil.move(f'{working_dir}/gemma_pytorch/gemma', submission_dir)

# Check if files were moved correctly
print("Files in gemma directory:", os.listdir(submission_dir))
print("Files in the weights directory:", os.listdir(weights_dir))

# Set up system path
import sys
sys.path.append(submission_dir)

from gemma.config import GemmaConfig, get_config_for_7b, get_config_for_2b
from gemma.model import GemmaForCausalLM
from gemma.tokenizer import Tokenizer

# Function to set default tensor type
@contextlib.contextmanager
def _set_default_tensor_type(dtype: torch.dtype):
    torch.set_default_dtype(dtype)
    yield
    torch.set_default_dtype(torch.float)


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Cloning into 'gemma_pytorch'...
remote: Enumerating objects: 239, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 239 (delta 86), reused 58 (delta 55), pack-reused 116 (from 1)[K
Receiving objects: 100% (239/239), 2.18 MiB | 8.07 MiB/s, done.
Resolving deltas: 100% (135/135), done.
Files in gemma directory: ['gemma']
Files in the weights directory: ['config.json', 'gemma-2b-it.ckpt', 'tokenizer.model']


In [5]:
# Define model variant and machine type (using 'cpu' here)
VARIANT = "2b-it"
MACHINE_TYPE = "cpu"

# Load model configuration for 2b variant (since 7b is not available in your directory)
model_config = get_config_for_2b()

# Tokenizer path setup
model_config.tokenizer = os.path.join(weights_dir, "tokenizer.model")
if not os.path.isfile(model_config.tokenizer):
    raise FileNotFoundError(f"Tokenizer model not found at {model_config.tokenizer}")

device = torch.device(MACHINE_TYPE)

# Load model weights and set to evaluation mode
with _set_default_tensor_type(model_config.get_dtype()):
    model = GemmaForCausalLM(model_config)
    
    # Update the checkpoint path for the 2b model
    ckpt_path = os.path.join(weights_dir, f'gemma-{VARIANT}.ckpt')
    
    # Load the model weights
    model.load_weights(ckpt_path)
    model = model.to(device).eval()

print(f"Model {VARIANT} loaded successfully and is now in evaluation mode.")

Model 2b-it loaded successfully and is now in evaluation mode.


In [6]:
print(dir(model.tokenizer))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'bos_id', 'decode', 'encode', 'eos_id', 'n_words', 'pad_id', 'sp_model']


In [7]:
import pandas as pd

# Define the mapping DataFrame
data = {
    'QuestionId_Answer': [
        '1869_A', '1869_B', '1869_C', '1870_B'
    ],
    'MisconceptionId': [
        '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25',
        '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25',
        '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25',
        '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25'
    ]
}

mapping_df = pd.DataFrame(data)

# Verify the DataFrame
print(mapping_df)

  QuestionId_Answer                                    MisconceptionId
0            1869_A  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1            1869_B  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
2            1869_C  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3            1870_B  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...


In [8]:
import pandas as pd
import re
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the data
train_data = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
test_data = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
misconception_mapping = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')
sample_submission = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv')

# Check the shape of the data
print("Train data shape before processing:", train_data.shape)

# Reset index to ensure unique index values
train_data.reset_index(drop=True, inplace=True)

# Fill missing values in misconception IDs with a placeholder and convert to int
train_data[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']] = train_data[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']].fillna(-1).astype(int)

# Combine all misconception IDs into one column for one-hot encoding
all_misconceptions = pd.concat([train_data[['MisconceptionAId']], train_data[['MisconceptionBId']], train_data[['MisconceptionCId']], train_data[['MisconceptionDId']]], axis=0)

# One-hot encode the combined misconception IDs
one_hot = pd.get_dummies(all_misconceptions, prefix='MisconceptionId', drop_first=False)

# Aggregate one-hot encoded columns and join them back to the original DataFrame
one_hot_encoded = one_hot.groupby(level=0).max()
train_data = pd.concat([train_data, one_hot_encoded], axis=1)

# Drop original columns after one-hot encoding
train_data.drop(columns=['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'], inplace=True)

# Check the shape after processing
print("Train data shape after processing:", train_data.shape)

# Define or load your model and tokenizer
model_name = "google/gemma-2b-it"  # Replace with the actual model name you are using
model = AutoModelForCausalLM.from_pretrained(model_name, token="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")
tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")  # Replace with the actual tokenizer name if different

# Define device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Define the generate_predictions function
def generate_predictions(model, question_row, device, max_new_tokens=50):
    # Combine the question and answer options into the prompt
    prompt = f"<start_of_turn>user\nQuestion: {question_row['QuestionText']}\n" \
             f"Options: A) {question_row['AnswerAText']} B) {question_row['AnswerBText']} " \
             f"C) {question_row['AnswerCText']} D) {question_row['AnswerDText']}\n<end_of_turn>\n" \
             "<start_of_turn>model\n"
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    
    # Generate the output using the model
    try:
        output = model.generate(inputs['input_ids'], max_new_tokens=max_new_tokens)
    except TypeError as e:
        print(f"Error with generate method: {e}")
        return None

    # Decode the output
    try:
        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    except TypeError as e:
        print(f"Error with tokenizer.decode method: {e}")
        return None

    return decoded_output.strip()  # Strip any extraneous whitespace

# Generate predictions for each row in the test data
predictions = {}

for index, row in test_data.iterrows():
    # Iterate over each answer option
    for option in ['A', 'B', 'C', 'D']:
        # Create the column name based on the option
        answer_column = f'Answer{option}Text'
        # Construct the question_id_answer string
        question_id_answer = f"{row['QuestionId']}_{option}"
        
        # Generate predictions
        predicted_output = generate_predictions(model, row, device)
        
        # Store predictions
        predictions[question_id_answer] = predicted_output

# Print the first few predictions to debug
print("First few predictions:", dict(list(predictions.items())[:10]))

# Define a simplified map_predictions function
def map_predictions(predictions, misconception_mapping):
    """
    Map predictions to MisconceptionIds based on available columns in the misconception_mapping DataFrame.

    Args:
    predictions (dict): Dictionary with QuestionId_Answer as keys and list of predicted MisconceptionIds as values.
    misconception_mapping (pd.DataFrame): DataFrame with MisconceptionId and MisconceptionName.

    Returns:
    dict: Dictionary with QuestionId_Answer and space-delimited MisconceptionIds.
    """
    mapped_predictions = {}
    
    for qid_answer, preds in predictions.items():
        # Use a placeholder if mapping isn't available
        mapped_predictions[qid_answer] = ' '.join(map(str, preds))  # Join the list of predicted MisconceptionIds into a single string
    
    return mapped_predictions

# Apply the updated mapping function to predictions
mapped_predictions = map_predictions(predictions, misconception_mapping)

# Fill missing predictions with all misconception IDs as a placeholder (adjust if needed)
default_misconception = ' '.join(map(str, range(1, 26)))  # All misconception IDs from 1 to 25

all_predictions = {f"{row['QuestionId']}_{option}": default_misconception
                    for _, row in test_data.iterrows() 
                    for option in ['A', 'B', 'C', 'D']}

# Update with available predictions
all_predictions.update(mapped_predictions)

# Prepare predictions for submission
misconception_ids = [all_predictions.get(f"{row['QuestionId']}_{option}", default_misconception) 
                     for _, row in test_data.iterrows() 
                     for option in ['A', 'B', 'C', 'D']]

# Adjust for length mismatch
if len(misconception_ids) < len(sample_submission):
    misconception_ids.extend([default_misconception] * (len(sample_submission) - len(misconception_ids)))
elif len(misconception_ids) > len(sample_submission):
    misconception_ids = misconception_ids[:len(sample_submission)]

# Update the sample_submission DataFrame
sample_submission['MisconceptionId'] = misconception_ids

# Save the updated submission file
sample_submission.to_csv('submission.csv', index=False)

Train data shape before processing: (1869, 15)
Train data shape after processing: (1869, 11)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

First few predictions: {'1869_A': 'user\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nOptions: A) \\( 3 \\times(2+4)-5 \\) B) \\( 3 \\times 2+(4-5) \\) C) \\( 3 \\times(2+4-5) \\) D) Does not need brackets\n\nmodel\nThe correct option is A) \\( 3 \\times(2+4)-5 \\)\n\nThe brackets need to be placed to group the numbers 2 and 4 together, so that the multiplication operation is performed before the subtraction operation.', '1869_B': 'user\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nOptions: A) \\( 3 \\times(2+4)-5 \\) B) \\( 3 \\times 2+(4-5) \\) C) \\( 3 \\times(2+4-5) \\) D) Does not need brackets\n\nmodel\nThe correct option is A) \\( 3 \\times(2+4)-5 \\)\n\nThe brackets need to be placed to group the numbers 2 and 4 together, so that the multiplication operation is performed before the subtraction operation.', '1869_C': 'user\nQuestion: \\[\n3 \\times 2+4-