In [10]:
import logging
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import torch.nn as nn

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class EssayDataset(Dataset):
    """Map-style dataset that tokenizes one essay each time an index is requested.

    Args:
        tokenizer: Object exposing ``encode_plus``; it is called once per lookup.
        essays: Sized, integer-indexable collection of essay texts. Each entry is
            passed through ``str()`` before tokenization.
        max_length: Forwarded to the tokenizer as ``max_length``; inputs are
            requested with ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, tokenizer, essays, max_length):
        self.tokenizer = tokenizer
        self.texts = essays
        self.max_length = max_length

    def __len__(self):
        """Return how many essays the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the essay at ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'``, each the tokenizer's
            tensor for that key flattened to one dimension.
        """
        essay = str(self.texts[idx])

        # Tokenizer settings are gathered in one place; no token-type ids are requested.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The second positional argument (the text pair) is explicitly None: single-text input.
        encoded = self.tokenizer.encode_plus(essay, None, **encode_options)

        # flatten() drops the leading dimension that return_tensors='pt' adds
        # (presumably size 1 for a single text), so the DataLoader can stack items.
        return {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}

class BertRegressor(nn.Module):
    """BERT encoder with a one-unit linear head for regression.

    Args:
        pre_trained_model_name: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, pre_trained_model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(pre_trained_model_name)
        # The head attribute must be called 'out' so its parameters match the
        # keys of the saved state dict.
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its pooled output to a single value per input.

        Args:
            input_ids: Passed to the encoder as ``input_ids``.
            attention_mask: Passed to the encoder as ``attention_mask``.

        Returns:
            The linear head applied to the encoder's ``pooler_output``
            (presumably shape ``(batch, 1)`` — confirm against the encoder).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(encoded.pooler_output)

# End-to-end inference: load the fine-tuned regressor, score every essay in the
# test CSV, and write the frame (with a new integer 'score' column) back to disk.
# Any failure is logged with its traceback instead of propagating.
try:
    # Parameters
    PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
    BATCH_SIZE = 16
    MAX_LEN = 256
    # Built once and reused for checkpoint loading, the model and every batch.
    DEVICE = torch.device('cpu')

    # All files live under one project directory; keeping the prefix in a single
    # place means relocating the project is a one-line change.
    PROJECT_DIR = r"C:\Users\nickr\OneDrive\Υπολογιστής\Repositories\Kaggle_Competitions\Learning Agency Lab - Automated Essay Scoring 2.0"
    DATA_DIR = PROJECT_DIR + r"\learning-agency-lab-automated-essay-scoring-2"

    # Load Model
    model_path = PROJECT_DIR + r"\bert_regressor.pth"
    model = BertRegressor(PRE_TRAINED_MODEL_NAME)
    # NOTE(review): torch.load unpickles the file; only load checkpoints you
    # trust (consider weights_only=True if the installed torch supports it).
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()  # evaluation mode for inference
    logging.info("Model loaded and set to evaluation mode.")

    # Load Data
    test_data_path = DATA_DIR + r"\test.csv"
    test_df = pd.read_csv(test_data_path)
    logging.info("Test data loaded successfully.")

    # Data Processing
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    test_dataset = EssayDataset(tokenizer, test_df['full_text'].tolist(), MAX_LEN)
    # shuffle=False keeps predictions aligned with the rows of test_df.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    logging.info("Data processing setup complete.")

    # Prediction and rounding off
    predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}  # Ensure tensors are on the correct device
            outputs = model(**batch)
            # Round to the nearest integer and store as an integer dtype so the
            # CSV holds e.g. 3 rather than 3.0.
            # NOTE(review): values are not clamped to any score range — confirm
            # whether the target scale requires clipping.
            rounded_outputs = torch.round(outputs.flatten()).long()
            predictions.extend(rounded_outputs.tolist())
    test_df['score'] = predictions
    logging.info("Predictions made and assigned to DataFrame.")

    # Save Results
    output_file_path = DATA_DIR + r"\predictions.csv"
    test_df.to_csv(output_file_path, index=False)
    logging.info(f"Scores are saved to {output_file_path}.")

except Exception as e:
    # logging.exception logs at ERROR level and appends the traceback, so the
    # failing step can be identified; the notebook still does not crash.
    logging.exception(f"An error occurred: {str(e)}")

2024-04-22 21:28:28,664 - INFO - Model loaded and set to evaluation mode.
2024-04-22 21:28:28,668 - INFO - Test data loaded successfully.
2024-04-22 21:28:28,902 - INFO - Data processing setup complete.
2024-04-22 21:28:29,970 - INFO - Predictions made and assigned to DataFrame.
2024-04-22 21:28:30,054 - INFO - Scores are saved to C:\Users\nickr\OneDrive\Υπολογιστής\Repositories\Kaggle_Competitions\Learning Agency Lab - Automated Essay Scoring 2.0\learning-agency-lab-automated-essay-scoring-2\predictions.csv.


In [6]:
from transformers import BertTokenizer, BertModel

# One-time setup: this part downloads from the hub, so run it on a machine
# that has Internet access.
HUB_MODEL_NAME = 'bert-base-uncased'
LOCAL_TOKENIZER_DIR = './local_bert_base_uncased_tokenizer/'
LOCAL_MODEL_DIR = './local_bert_base_uncased_model/'

tokenizer = BertTokenizer.from_pretrained(HUB_MODEL_NAME)
model = BertModel.from_pretrained(HUB_MODEL_NAME)

# Write both artifacts to local directories
tokenizer.save_pretrained(LOCAL_TOKENIZER_DIR)
model.save_pretrained(LOCAL_MODEL_DIR)

# In the submission notebook, point from_pretrained at those directories so
# nothing has to be fetched over the network
tokenizer = BertTokenizer.from_pretrained(LOCAL_TOKENIZER_DIR)
model = BertModel.from_pretrained(LOCAL_MODEL_DIR)


In [7]:
# Expects the trained model instance to already be bound to 'model'.
# NOTE(review): 'model' is whatever an earlier cell last assigned — one cell in
# this notebook binds a plain pretrained BertModel and another binds a
# BertRegressor; confirm which object is meant to be saved here.
MY_MODEL_DIR = './my_model_directory/'
model.save_pretrained(MY_MODEL_DIR)

# Reload from the local directory; no internet connection is required
model = BertModel.from_pretrained(MY_MODEL_DIR)
