# In [None]:  (Jupyter notebook cell marker)
import kagglehub
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

def init_distributed(rank, world_size, master_addr="192.168.1.1", master_port="12355"):
    """Join the NCCL process group and select this process's CUDA device.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index passed to ``torch.cuda.set_device``.
        world_size: Total number of processes taking part in the group.
        master_addr: Host that all processes rendezvous on. Written to the
            ``MASTER_ADDR`` environment variable. The default keeps the
            address this script previously hard-coded.
        master_port: Port on ``master_addr`` used for the rendezvous. Written
            to ``MASTER_PORT``; accepts ``int`` or ``str``.
    """
    # Environment variables must be strings, so normalise whatever was passed.
    os.environ["MASTER_ADDR"] = str(master_addr)
    os.environ["MASTER_PORT"] = str(master_port)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

# Download dataset
path = kagglehub.dataset_download("mazlumi/ielts-writing-scored-essays-dataset")
print("Path to dataset files:", path)

# Check the files in the directory
for root, dirs, files in os.walk(path):
    print(f"Root: {root}")
    print(f"Directories: {dirs}")
    print(f"Files: {files}")

# Correct file name
csv_file = os.path.join(path, "ielts_writing_dataset.csv")

# Load the dataset into a DataFrame
try:
    df = pd.read_csv(csv_file)
except FileNotFoundError:
    print(f"File not found: {csv_file}. Please verify the dataset structure.")
    # Stop here: without `df` every following statement would fail with a
    # misleading NameError instead of the real cause.
    raise
else:
    print("Dataset loaded successfully!")
    print(df.head())  # Display first few rows

# Drop rows where 'Overall' column is NaN
df = df.dropna(subset=["Overall"])

# Combine Question and Essay for model input
df["combined_text"] = df["Question"] + " " + df["Essay"]

# Check dataset size after dropping rows
print(f"Dataset size after dropping rows with NaN in 'Overall': {df.shape}")

# Train-test split: 80% train, then the remaining 20% is halved into
# validation and test (10% each). The fixed seed keeps the split reproducible.
train_texts, temp_texts, train_scores, temp_scores = train_test_split(
    df["combined_text"], df["Overall"], test_size=0.2, random_state=42
)
val_texts, test_texts, val_scores, test_scores = train_test_split(
    temp_texts, temp_scores, test_size=0.5, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data: every split goes through the same settings (truncation
# and padding enabled, at most 512 tokens), in train / val / test order.
train_encodings, val_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True, max_length=512)
    for split_texts in (train_texts, val_texts, test_texts)
)

class EssayDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with a float score label.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``scores`` is an indexable collection of the same
    length holding one numeric score per example.
    """

    def __init__(self, encodings, scores):
        self.encodings = encodings
        self.scores = scores

    def __len__(self):
        # The number of examples is defined by the score list.
        return len(self.scores)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``, with the score under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.scores[idx], dtype=torch.float)
        return sample

# Create datasets: one EssayDataset per split, built in train / val / test
# order from the matching encodings and a plain list of scores.
train_dataset, val_dataset, test_dataset = (
    EssayDataset(split_encodings, split_scores.tolist())
    for split_encodings, split_scores in (
        (train_encodings, train_scores),
        (val_encodings, val_scores),
        (test_encodings, test_scores),
    )
)

# Distributed setup
# The launcher (e.g. torchrun) exports RANK and WORLD_SIZE for each process.
# Fall back to a single-process group when the script is started directly.
rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))
init_distributed(rank, world_size)

# Initialize model
# num_labels=1 gives a single-output head, i.e. score regression.
device = torch.device(f"cuda:{rank}")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1).to(device)

# Wrap the model in DistributedDataParallel
# NOTE(review): `rank` is used as the CUDA device index, which only holds on a
# single node; multi-node runs would need LOCAL_RANK here - confirm the target
# topology. Also confirm whether Trainer should receive the unwrapped model,
# since it can manage DDP wrapping itself.
model = DDP(model, device_ids=[rank])

# Define training arguments, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log every 10 steps.
    logging_steps=10,
    # Add local rank for distributed training.
    local_rank=rank,
)

# Define custom compute_metrics function
def compute_metrics(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)`` of array-likes with one
            predicted value and one target value per example.

    Returns:
        A dict with plain-float ``"rmse"``, ``"mae"`` and ``"r2"`` entries.
    """
    predictions, labels = eval_pred
    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
    # single-example (1, 1) prediction into a 0-d array, which the sklearn
    # metric functions reject.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
    mae = float(mean_absolute_error(labels, predictions))
    r2 = float(r2_score(labels, predictions))
    return {"rmse": rmse, "mae": mae, "r2": r2}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

try:
    # Train the model.
    # Every rank must call train(): the model is wrapped in DDP, whose
    # gradient synchronisation is a collective operation. Running it on rank 0
    # only leaves that rank waiting for peers that never join.
    trainer.train()

    # Evaluate on validation set
    results = trainer.evaluate()
    if rank == 0:  # Only the master process prints the summary
        print("Validation Results:", results)
finally:
    # Shutdown distributed processes, even if training or evaluation failed.
    dist.destroy_process_group()


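# Notebook cell output: ModuleNotFoundError: No module named 'datasets'
# (the `datasets` package imported at the top of this cell is not installed in
# the environment that produced this output; install it with `pip install datasets`).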