# 00 - Environment Setup and Configuration
This notebook is responsible for:
- Recording environment and dependency versions
- Setting random seeds to ensure reproducibility
- Detecting GPU information
- Creating the project directory structure
- Saving configuration information

## 1. Import Required Libraries

In [1]:
import sys
import os
import random
import numpy as np
import torch
import json
from datetime import datetime
from pathlib import Path

import transformers
import datasets
import evaluate
import peft
import accelerate

## 2. Record Environment Information

In [2]:
# Collect Environment Information
env_info = {
    "python_version": sys.version,
    "pytorch_version": torch.__version__,
    "transformers_version": transformers.__version__,
    "datasets_version": datasets.__version__,
    "evaluate_version": evaluate.__version__,
    "peft_version": peft.__version__,
    "accelerate_version": accelerate.__version__,
    "cuda_available": torch.cuda.is_available(),
    "cuda_version": torch.version.cuda if torch.cuda.is_available() else "N/A",
    "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

# Print environment information
for key, value in env_info.items():
    print(f"{key}: {value}")

python_version: 3.10.18 (main, Jun  5 2025, 13:14:17) [GCC 11.2.0]
pytorch_version: 2.6.0+cu124
transformers_version: 4.57.1
datasets_version: 4.2.0
evaluate_version: 0.4.6
peft_version: 0.17.1
accelerate_version: 1.10.1
cuda_available: True
cuda_version: 12.4
device_count: 1
timestamp: 2025-10-17 22:13:29


## 3. GPU Information Detection

In [3]:
# Detailed GPU Information
if torch.cuda.is_available():
    print("GPU Information:")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"\nGPU {i}:")
        print(f"  Name: {torch.cuda.get_device_name(i)}")
        print(f"  Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
        print(f"  Compute Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
    
    # Current memory usage
    print(f"\nCurrently Allocated Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"Currently Reserved Memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
else:
    print("No GPU detected. The CPU will be used for training (which may be slower).")

# Set default device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nDefault Device: {device}")


GPU Information:
Number of GPUs: 1

GPU 0:
  Name: Tesla V100-SXM2-32GB
  Total Memory: 31.73 GB
  Compute Capability: 7.0

Currently Allocated Memory: 0.00 GB
Currently Reserved Memory: 0.00 GB

Default Device: cuda


## 4. Set Random Seeds (Ensure Reproducibility)

In [4]:
# Fix random seed
RANDOM_SEED = 42

def set_seed(seed=RANDOM_SEED):
    """Set all random seeds to ensure reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # Ensure deterministic behavior in PyTorch
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set environment variable
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(RANDOM_SEED)
print(f"Random seed has been set to: {RANDOM_SEED}")
print("This ensures experiment reproducibility.")

Random seed has been set to: 42
This ensures experiment reproducibility.


## 5. Create Project Directory Structure

In [5]:
# Define project directory structure
PROJECT_ROOT = Path(".")
DIRS = {
    "data": PROJECT_ROOT / "data",
    "data_processed": PROJECT_ROOT / "data" / "processed",
    "outputs": PROJECT_ROOT / "outputs",
    "outputs_roberta_full": PROJECT_ROOT / "outputs" / "roberta_full",
    "outputs_roberta_lora": PROJECT_ROOT / "outputs" / "roberta_lora",
    "outputs_distilbert_full": PROJECT_ROOT / "outputs" / "distilbert_full",
    "outputs_distilbert_lora": PROJECT_ROOT / "outputs" / "distilbert_lora",
    "logs": PROJECT_ROOT / "logs",
    "reports": PROJECT_ROOT / "reports",
    "configs": PROJECT_ROOT / "configs"
}

# Create all directories
for name, path in DIRS.items():
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {path}")

print("\nProject directory structure has been successfully created.")


Created directory: data
Created directory: data/processed
Created directory: outputs
Created directory: outputs/roberta_full
Created directory: outputs/roberta_lora
Created directory: outputs/distilbert_full
Created directory: outputs/distilbert_lora
Created directory: logs
Created directory: reports
Created directory: configs

Project directory structure has been successfully created.


## 6. Define Project Configuration Parameters

In [6]:
# Project configuration
PROJECT_CONFIG = {
    # Basic information
    "project_name": "Legal QA Fine-tuning",
    "random_seed": RANDOM_SEED,
    "device": str(device),
    
    # Data configuration
    "dataset_name": "isaacus/LegalQAEval",
    "train_split_ratio": 0.9,
    "valid_split_ratio": 0.1,
    
    # Model configuration
    "models": {
        "roberta": "roberta-base",
        "distilbert": "distilbert-base-uncased"
    },
    
    # Preprocessing configuration
    "max_length": 384,  # Common maximum length for QA tasks
    "doc_stride": 128,
    "max_answer_length": 30,
    
    # Training configuration
    "training": {
        "learning_rate": 3e-5,
        "batch_size": 8,
        "num_epochs": 3,
        "warmup_steps": 500,
        "weight_decay": 0.01,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "logging_steps": 100,
        "fp16": torch.cuda.is_available()  # Use mixed precision only when GPU is available
    },
    
    # LoRA configuration
    "lora": {
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["query", "value"],
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "QUESTION_ANS"
    },
    
    # Directory paths
    "paths": {k: str(v) for k, v in DIRS.items()}
}

print("Project Configuration Parameters:")
print(json.dumps(PROJECT_CONFIG, indent=2, ensure_ascii=False))

Project Configuration Parameters:
{
  "project_name": "Legal QA Fine-tuning",
  "random_seed": 42,
  "device": "cuda",
  "dataset_name": "isaacus/LegalQAEval",
  "train_split_ratio": 0.9,
  "valid_split_ratio": 0.1,
  "models": {
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased"
  },
  "max_length": 384,
  "doc_stride": 128,
  "max_answer_length": 30,
  "training": {
    "learning_rate": 3e-05,
    "batch_size": 8,
    "num_epochs": 3,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_steps": 100,
    "fp16": true
  },
  "lora": {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": [
      "query",
      "value"
    ],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "QUESTION_ANS"
  },
  "paths": {
    "data": "data",
    "data_processed": "data/processed",
    "outputs": "outputs",
    "outputs_roberta_full": "outputs/roberta_full",
    "outputs_roberta_lora": "o

## 7. Save Configuration to File

In [7]:
# Save project configuration file
config_path = DIRS["configs"] / "project_config.json"
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(PROJECT_CONFIG, f, indent=2, ensure_ascii=False)
print(f"Configuration has been saved to: {config_path}")

# Save environment information
env_path = DIRS["configs"] / "environment_info.json"
with open(env_path, 'w', encoding='utf-8') as f:
    json.dump(env_info, f, indent=2, ensure_ascii=False)
print(f"Environment information has been saved to: {env_path}")

Configuration has been saved to: configs/project_config.json
Environment information has been saved to: configs/environment_info.json


## 8. Create Utility Functions File

In [8]:
# Create a general utility functions file for use in later notebooks
utils_code = '''
"""General Utility Functions"""
import json
import torch
from pathlib import Path

def load_config(config_path="configs/project_config.json"):
    """Load project configuration"""
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_model_size(model):
    """Calculate model parameter size"""
    param_size = 0
    trainable_params = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
        if param.requires_grad:
            trainable_params += param.nelement()
    
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    total_size = (param_size + buffer_size) / 1024**2
    total_params = sum(p.nelement() for p in model.parameters())
    
    return {
        "total_params": total_params,
        "trainable_params": trainable_params,
        "size_mb": total_size
    }

def print_gpu_memory():
    """Print GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
    else:
        print("GPU not in use")
'''

utils_path = PROJECT_ROOT / "utils.py"
with open(utils_path, 'w', encoding='utf-8') as f:
    f.write(utils_code)
print(f"Utility functions file has been created: {utils_path}")

Utility functions file has been created: utils.py
