In [2]:
!pip install gdown



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import logging
from datetime import datetime
import os
import csv
import gdown

# Set up logging
log_dir = '/kaggle/working/logs'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f'training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration for hyperparameters (modify these for tuning)
CONFIG = {
    'random_seed': 42,
    'test_size': 0.2,
    'validation_split': 0.2,
    'batch_size': 32,
    'epochs': 100,
    'early_stopping_patience': 10,
    'learning_rate': 0.001,
    'hidden_layers': [64, 32],  # Number of neurons in each hidden layer
    'dropout_rate': 0.3,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'log_metrics_file': '/kaggle/working/metrics.csv',
    'gdrive_file_id': '16IH03soaKK15gvOO4t84ohCP-n2abCYV',  # Replace with your Google Drive file ID
    'csv_file_name': 'dataset_subset.csv',  # Name of the CSV file after download
}

# Set random seed for reproducibility
torch.manual_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])
if CONFIG['device'] == 'cuda':
    torch.cuda.manual_seed(CONFIG['random_seed'])

class TennisOutcomeNN(nn.Module):
    """Feedforward neural network for tennis outcome prediction."""
    def __init__(self, input_dim, num_classes, hidden_layers, dropout_rate):
        super(TennisOutcomeNN, self).__init__()
        layers = []
        prev_dim = input_dim
        
        # Hidden layers
        for units in hidden_layers:
            layers.append(nn.Linear(prev_dim, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            prev_dim = units
        
        # Output layer
        layers.append(nn.Linear(prev_dim, num_classes))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def load_and_preprocess_data():
    """Download CSV from Google Drive and preprocess the dataset."""
    logger.info("Downloading CSV from Google Drive...")
    
    # Download file using gdown
    file_id = CONFIG['gdrive_file_id']
    output_path = os.path.join('/kaggle/working', CONFIG['csv_file_name'])
    gdown.download(f'https://drive.google.com/uc?id={file_id}', output_path, quiet=False)
    
    if not os.path.exists(output_path):
        logger.error(f"Failed to download file to {output_path}")
        raise FileNotFoundError(f"File {output_path} not found after download")
    
    logger.info("Loading dataset...")
    df = pd.read_csv(output_path)
    
    # Select features and target
    numerical_features = ['rally_length', 'serve_depth']
    categorical_features = [
        'serve_type', 'serve_direction', 'shot_1_type', 'shot_1_direction',
        'shot_2_type', 'shot_2_direction', 'last_shot_type', 'last_shot_direction'
    ]
    target = 'outcome'
    
    # Handle missing values
    df[numerical_features] = df[numerical_features].fillna(0)
    df[categorical_features] = df[categorical_features].fillna('unknown')
    
    # Convert categorical columns to strings to avoid type mismatch
    logger.info("Converting categorical columns to strings...")
    for col in categorical_features:
        # Log data types and unique values for debugging
        logger.info(f"Column {col} data types: {df[col].apply(type).value_counts().to_dict()}")
        logger.info(f"Column {col} unique values: {df[col].unique()}")
        df[col] = df[col].astype(str)
    
    # Verify data types after conversion
    logger.info("Data types after conversion:")
    for col in categorical_features:
        logger.info(f"Column {col} data type: {df[col].dtype}")
    
    # Encode target variable
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target])
    
    # Log class distribution
    class_counts = pd.Series(y).value_counts()
    class_names = label_encoder.inverse_transform(class_counts.index)
    logger.info("Class distribution:")
    for name, count in zip(class_names, class_counts):
        logger.info(f"{name}: {count}")
    
    # Features
    X = df[numerical_features + categorical_features]
    
    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ])
    
    # Split data
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=CONFIG['test_size'] + CONFIG['validation_split'],
        random_state=CONFIG['random_seed'], stratify=y
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=CONFIG['test_size'] / (CONFIG['test_size'] + CONFIG['validation_split']),
        random_state=CONFIG['random_seed'], stratify=y_temp
    )
    
    # Fit and transform
    X_train = preprocessor.fit_transform(X_train)
    X_val = preprocessor.transform(X_val)
    X_test = preprocessor.transform(X_test)
    
    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)
    
    # Create DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)
    
    logger.info(f"Training data shape: {X_train.shape}")
    logger.info(f"Validation data shape: {X_val.shape}")
    logger.info(f"Test data shape: {X_test.shape}")
    
    return train_loader, val_loader, test_loader, label_encoder, preprocessor

def train_model(model, train_loader, val_loader, criterion, optimizer, device):
    """Train the model with early stopping."""
    best_val_loss = float('inf')
    patience_counter = 0
    metrics = []
    
    # Initialize metrics CSV
    with open(CONFIG['log_metrics_file'], 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Epoch', 'Train Loss', 'Train Accuracy', 'Val Loss', 'Val Accuracy'])
    
    logger.info("Starting model training...")
    for epoch in range(CONFIG['epochs']):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        
        train_loss /= train_total
        train_accuracy = train_correct / train_total
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        val_loss /= val_total
        val_accuracy = val_correct / val_total
        
        # Log metrics
        metrics.append([epoch + 1, train_loss, train_accuracy, val_loss, val_accuracy])
        with open(CONFIG['log_metrics_file'], 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([epoch + 1, train_loss, train_accuracy, val_loss, val_accuracy])
        
        logger.info(
            f"Epoch {epoch+1}/{CONFIG['epochs']} - "
            f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}"
        )
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), '/kaggle/working/best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= CONFIG['early_stopping_patience']:
                logger.info(f"Early stopping triggered after epoch {epoch+1}")
                break
    
    logger.info("Training completed.")
    return metrics

def evaluate_model(model, test_loader, label_encoder, device):
    """Evaluate the model on the test set."""
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    
    # Convert numerical labels back to class names
    y_true = label_encoder.inverse_transform(y_true)
    y_pred = label_encoder.inverse_transform(y_pred)
    
    # Classification report
    report = classification_report(y_true, y_pred)
    logger.info(f"Classification Report:\n{report}")

def main():
    """Main function to run the prediction pipeline."""
    # Load and preprocess data
    train_loader, val_loader, test_loader, label_encoder, preprocessor = load_and_preprocess_data()
    
    # Initialize model
    input_dim = next(iter(train_loader))[0].shape[1]
    num_classes = len(label_encoder.classes_)
    model = TennisOutcomeNN(
        input_dim=input_dim,
        num_classes=num_classes,
        hidden_layers=CONFIG['hidden_layers'],
        dropout_rate=CONFIG['dropout_rate']
    ).to(CONFIG['device'])
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])
    
    # Train model
    metrics = train_model(model, train_loader, val_loader, criterion, optimizer, CONFIG['device'])
    
    # Load best model
    model.load_state_dict(torch.load('/kaggle/working/best_model.pth'))
    
    # Evaluate model
    evaluate_model(model, test_loader, label_encoder, CONFIG['device'])
    
    # Save model
    torch.save(model.state_dict(), '/kaggle/working/tennis_outcome_model.pth')
    logger.info("Model saved to '/kaggle/working/tennis_outcome_model.pth'")
    
    # Instructions for tuning and Google Drive setup
    logger.info("""
    To use a CSV file from Google Drive:
    1. Right-click the CSV file in Google Drive, select 'Share', and set to 'Anyone with the link' or 'Viewer'.
    2. Copy the shareable link (e.g., https://drive.google.com/file/d/FILE_ID/view?usp=sharing).
    3. Extract the FILE_ID (string between '/d/' and '/view').
    4. Update CONFIG['gdrive_file_id'] with the FILE_ID in this script.
    
    To tune hyperparameters, modify the CONFIG dictionary. Options include:
    - learning_rate: Try [0.0001, 0.001, 0.01]
    - hidden_layers: Try different architectures, e.g., [128, 64], [32, 16]
    - dropout_rate: Try [0.2, 0.4, 0.5]
    - batch_size: Try [16, 64, 128]
    - epochs: Increase if model underfits
    Metrics are saved to 'metrics.csv' for analysis.
    Logs are saved to the 'logs' directory.
    If the TypeError persists, check the logged unique values for categorical columns
    and ensure the dataset has consistent data types.
    """)

if __name__ == "__main__":
    main()

Downloading...
From (original): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV
From (redirected): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV&confirm=t&uuid=f6cbd7f7-3e89-4f84-ad96-f5ef58574448
To: /kaggle/working/dataset_subset.csv
100%|██████████| 143M/143M [00:03<00:00, 47.0MB/s] 
  df = pd.read_csv(output_path)
  model.load_state_dict(torch.load('/kaggle/working/best_model.pth'))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
from datetime import datetime
import pickle
import logging
import warnings
import gdown
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(filename='training_log_transformer.txt', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

def log_and_print(message):
    logger.info(message)
    print(message)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
log_and_print(f"Using device: {device}")

# Load and preprocess data
def load_data(file_path):
    log_and_print("Loading dataset...")
    df = pd.read_csv(file_path)
    log_and_print(f"Dataset loaded with {len(df)} rows and {len(df.columns)} columns")
    return df

def preprocess_data(df):
    # Standardize outcome labels
    outcome_mapping = {
        'Winner': 'Winner', 'Forced Error': 'Forced Error', 'Unforced Error': 'Unforced Error',
        'Ace': 'Winner', 'Double Fault': 'Unforced Error'
    }
    df['outcome'] = df['outcome'].map(outcome_mapping)
    df = df.dropna(subset=['outcome'])
    log_and_print(f"After cleaning, dataset has {len(df)} rows")

    # Select features
    feature_columns = [
        'serve_type', 'serve_direction', 'serve_depth', 'is_second_serve',
        'rally_length', 'shot_1_type', 'shot_1_direction', 'shot_1_depth',
        'shot_2_type', 'shot_2_direction', 'shot_2_depth',
        'shot_3_type', 'shot_3_direction', 'shot_3_depth',
        'shot_4_type', 'shot_4_direction', 'shot_4_depth',
        'shot_5_type', 'shot_5_direction', 'shot_5_depth',
        'last_shot_type', 'last_shot_direction', 'last_shot_depth'
    ]
    df_features = df[feature_columns].copy()
    df_target = df['outcome']

    # Handle missing values
    for col in df_features.columns:
        if df_features[col].dtype == 'object':
            df_features[col] = df_features[col].fillna('None')
        else:
            df_features[col] = df_features[col].fillna(0)

    # Convert categorical columns to strings to avoid type mismatch
    categorical_columns = [col for col in df_features.columns if df_features[col].dtype == 'object']
    log_and_print("Converting categorical columns to strings...")
    for col in categorical_columns:
        # Log data types and unique values for debugging
        log_and_print(f"Column {col} data types: {df_features[col].apply(type).value_counts().to_dict()}")
        log_and_print(f"Column {col} unique values: {df_features[col].unique()}")
        df_features[col] = df_features[col].astype(str)
    
    # Verify data types after conversion
    log_and_print("Data types after conversion:")
    for col in categorical_columns:
        log_and_print(f"Column {col} data type: {df_features[col].dtype}")

    # Encode categorical features
    vocab_sizes = {}
    encoders = {}
    for col in categorical_columns:
        encoders[col] = LabelEncoder()
        df_features[col] = encoders[col].fit_transform(df_features[col])
        vocab_sizes[col] = len(encoders[col].classes_)

    # Encode target
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(df_target)
    log_and_print(f"Class distribution: {dict(zip(target_encoder.classes_, np.bincount(y)))}")

    return df_features, y, categorical_columns, vocab_sizes, encoders, target_encoder

# Custom Dataset
class TennisDataset(Dataset):
    def __init__(self, X, y, categorical_columns):
        self.X = X
        self.y = y
        self.categorical_columns = categorical_columns
        self.numerical_columns = [col for col in X.columns if col not in categorical_columns]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Prepare sequence: treat each feature as a token
        cat_features = torch.tensor([self.X[col].iloc[idx] for col in self.categorical_columns], dtype=torch.long)
        num_features = torch.tensor([self.X[col].iloc[idx] for col in self.numerical_columns], dtype=torch.float)
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return cat_features, num_features, label

# Transformer Model
class TennisTransformer(nn.Module):
    def __init__(self, vocab_sizes, num_numerical, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, num_classes=3, dropout=0.3):
        super(TennisTransformer, self).__init__()
        self.d_model = d_model

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(vocab_size, d_model) for col, vocab_size in vocab_sizes.items()
        })

        # Linear layer for numerical features
        self.numerical_layer = nn.Linear(num_numerical, d_model)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Output layer
        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, cat_features, num_features):
        # Embed categorical features
        cat_embeds = []
        for i, col in enumerate(self.embeddings.keys()):
            emb = self.embeddings[col](cat_features[:, i])
            cat_embeds.append(emb)
        cat_embeds = torch.stack(cat_embeds, dim=1)

        # Process numerical features
        num_embeds = self.numerical_layer(num_features).unsqueeze(1)

        # Combine embeddings
        x = torch.cat([cat_embeds, num_embeds], dim=1)  # [batch, seq_len, d_model]
        # log_and_print(f"Input to transformer shape: {x.shape}")

        # Transformer encoding
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # Global average pooling
        x = self.dropout(x)
        out = self.fc(x)
        return out

# Training function
def train_model(model, train_loader, val_loader, num_epochs=50, patience=5):
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 1.1, 1.0]).to(device))  # Slight weight for Forced Error
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    best_val_loss = float('inf')
    epochs_no_improve = 0
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for cat_features, num_features, labels in train_loader:
            cat_features, num_features, labels = cat_features.to(device), num_features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(cat_features, num_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(labels)
        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for cat_features, num_features, labels in val_loader:
                cat_features, num_features, labels = cat_features.to(device), num_features.to(device), labels.to(device)
                outputs = model(cat_features, num_features)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * len(labels)
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        scheduler.step(val_loss)
        log_and_print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'model_output_transformer/best_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                log_and_print("Early stopping triggered")
                break

    return train_losses, val_losses

# Evaluation function
def evaluate_model(model, loader, target_encoder):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for cat_features, num_features, labels in loader:
            cat_features, num_features = cat_features.to(device), num_features.to(device)
            outputs = model(cat_features, num_features)
            _, preds = torch.max(outputs, 1)
            y_true.extend(labels.numpy())
            y_pred.extend(preds.cpu().numpy())

    # Classification report
    report = classification_report(y_true, y_pred, target_names=target_encoder.classes_, output_dict=True)
    log_and_print("\nClassification Report:")
    log_and_print(classification_report(y_true, y_pred, target_names=target_encoder.classes_))

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_encoder.classes_, yticklabels=target_encoder.classes_)
    plt.title('Confusion Matrix')
    plt.savefig('model_output_transformer/confusion_matrix.png')
    plt.close()

    # Precision-Recall curves
    y_scores = []
    model.eval()
    with torch.no_grad():
        for cat_features, num_features, _ in loader:
            cat_features, num_features = cat_features.to(device), num_features.to(device)
            outputs = model(cat_features, num_features)
            y_scores.append(torch.softmax(outputs, dim=1).cpu().numpy())
    y_scores = np.vstack(y_scores)
    y_true_bin = np.eye(len(target_encoder.classes_))[y_true]

    plt.figure(figsize=(10, 8))
    for i, class_name in enumerate(target_encoder.classes_):
        precision, recall, _ = precision_recall_curve(y_true_bin[:, i], y_scores[:, i])
        plt.plot(recall, precision, label=f'{class_name} (AP={np.mean(precision):.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.savefig('model_output_transformer/precision_recall_curve.png')
    plt.close()

    return report, cm

# Main execution
def main():
    start_time = datetime.now()
    output_dir = 'model_output_transformer'
    os.makedirs(output_dir, exist_ok=True)

    CONFIG = {
        'log_metrics_file': '/kaggle/working/metrics.csv',
        'gdrive_file_id': '16IH03soaKK15gvOO4t84ohCP-n2abCYV',  # Replace with your Google Drive file ID
        'csv_file_name': 'dataset_subset.csv',  # Name of the CSV file after download
    }

    # Load and preprocess data
    file_id = CONFIG['gdrive_file_id']
    output_path = os.path.join('/kaggle/working', CONFIG['csv_file_name'])
    gdown.download(f'https://drive.google.com/uc?id={file_id}', output_path, quiet=False)
    df = load_data(output_path)
    X, y, categorical_columns, vocab_sizes, encoders, target_encoder = preprocess_data(df)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    log_and_print(f"Training set: {len(X_train)} samples, Test set: {len(X_test)} samples")

    # Create datasets and dataloaders
    train_dataset = TennisDataset(X_train, y_train, categorical_columns)
    test_dataset = TennisDataset(X_test, y_test, categorical_columns)
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

    # Initialize model
    num_numerical = len([col for col in X.columns if col not in categorical_columns])
    model = TennisTransformer(vocab_sizes, num_numerical, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, num_classes=3).to(device)
    log_and_print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")

    # Train model
    log_and_print("Starting training...")
    train_losses, val_losses = train_model(model, train_loader, test_loader)

    # Plot training history
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('model_output_transformer/training_history.png')
    plt.close()

    # Evaluate model
    log_and_print("Evaluating model...")
    report, cm = evaluate_model(model, test_loader, target_encoder)

    # Check for overfitting
    train_acc = accuracy_score(y_train, [model(torch.tensor(X_train[cat_cols].values, dtype=torch.long).to(device), 
                                            torch.tensor(X_train[num_cols].values, dtype=torch.float).to(device)).argmax(1).cpu().numpy() 
                                       for cat_cols, num_cols, _ in [next(iter(train_loader))]][0])
    test_acc = report['accuracy']
    log_and_print(f"Training Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}, Difference: {train_acc - test_acc:.4f}")

    # Save artifacts
    with open(os.path.join(output_dir, 'classification_report.pkl'), 'wb') as f:
        pickle.dump(report, f)
    with open(os.path.join(output_dir, 'target_encoder.pkl'), 'wb') as f:
        pickle.dump(target_encoder, f)
    with open(os.path.join(output_dir, 'feature_encoders.pkl'), 'wb') as f:
        pickle.dump(encoders, f)
    with open(os.path.join(output_dir, 'feature_columns.pkl'), 'wb') as f:
        pickle.dump(X.columns.tolist(), f)

    # Prediction function
    def predict(serve_data, rally_data):
        model.eval()
        with torch.no_grad():
            # Preprocess input
            input_df = pd.DataFrame([serve_data | rally_data])
            for col in X.columns:
                if col not in input_df:
                    input_df[col] = 'None' if col in categorical_columns else 0
            for col in categorical_columns:
                input_df[col] = encoders[col].transform(input_df[col])
            cat_features = torch.tensor([input_df[col].iloc[0] for col in categorical_columns], dtype=torch.long).unsqueeze(0).to(device)
            num_features = torch.tensor([input_df[col].iloc[0] for col in X.columns if col not in categorical_columns], dtype=torch.float).unsqueeze(0).to(device)
            output = model(cat_features, num_features)
            pred = torch.argmax(output, dim=1).cpu().numpy()
            return target_encoder.inverse_transform(pred)[0]

    # Save prediction function
    with open(os.path.join(output_dir, 'predict.pkl'), 'wb') as f:
        pickle.dump(predict, f)

    log_and_print(f"Training completed in {datetime.now() - start_time}")
    log_and_print("Artifacts saved in model_output_transformer/")

if __name__ == "__main__":
    main()

Using device: cuda


Downloading...
From (original): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV
From (redirected): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV&confirm=t&uuid=2e74df7f-d4ad-45b8-9513-8b89027ff1a6
To: /kaggle/working/dataset_subset.csv
100%|██████████| 143M/143M [00:01<00:00, 95.6MB/s] 


Loading dataset...
Dataset loaded with 991359 rows and 41 columns
After cleaning, dataset has 960585 rows
Converting categorical columns to strings...
Column serve_type data types: {<class 'int'>: 540242, <class 'str'>: 420343}
Column serve_type unique values: [4 5 6 0 '4' '5' '6' '0' 'g']
Column shot_1_type data types: {<class 'str'>: 960585}
Column shot_1_type unique values: ['b' 'f' 'r' 's' 'None' 'm' 'u' 'l' 'q' 'y' 'i' 't' 'w' '3' 'h' 'v' 'o'
 '7' '2' '1' 'z' 'e' '&']
Column shot_2_type data types: {<class 'str'>: 960585}
Column shot_2_type unique values: ['b' 'f' 'None' 'z' 's' 'v' 'u' 'h' 'j' 'y' 't' 'r' 'o' 'i' 'm' 'k' 'l'
 'p' 'q']
Column shot_3_type data types: {<class 'str'>: 960585}
Column shot_3_type unique values: ['b' 'None' 'f' 'm' 's' 'l' 'r' 'y' 'i' 'z' 'v' 'u' 'p' 'h' 'j' 'o' 't'
 'q' 'k']
Column shot_4_type data types: {<class 'str'>: 960585}
Column shot_4_type unique values: ['None' 'b' 'i' 'f' 'o' 'v' 'z' 'r' 'h' 's' 'm' 'y' 'u' 't' 'l' 'j' 'k'
 'q' 'p']
Column sh

KeyError: "None of [Index([   (tensor(1), tensor(6), tensor(0), tensor(0), tensor(0), tensor(0), tensor(3)),\n          (tensor(3), tensor(5), tensor(0), tensor(0), tensor(0), tensor(0), tensor(2)),\n         (tensor(2), tensor(6), tensor(2), tensor(1), tensor(13), tensor(2), tensor(4)),\n          (tensor(3), tensor(8), tensor(0), tensor(0), tensor(0), tensor(0), tensor(4)),\n       (tensor(1), tensor(15), tensor(1), tensor(13), tensor(18), tensor(1), tensor(3)),\n         (tensor(2), tensor(8), tensor(12), tensor(2), tensor(1), tensor(2), tensor(4)),\n         (tensor(3), tensor(8), tensor(2), tensor(13), tensor(2), tensor(2), tensor(4)),\n       (tensor(1), tensor(16), tensor(18), tensor(7), tensor(9), tensor(0), tensor(11)),\n          (tensor(3), tensor(5), tensor(0), tensor(0), tensor(0), tensor(0), tensor(2)),\n        (tensor(3), tensor(6), tensor(2), tensor(13), tensor(2), tensor(2), tensor(18)),\n       ...\n          (tensor(2), tensor(8), tensor(0), tensor(0), tensor(0), tensor(0), tensor(4)),\n         (tensor(1), tensor(8), tensor(2), tensor(2), tensor(12), tensor(1), tensor(4)),\n          (tensor(1), tensor(6), tensor(0), tensor(0), tensor(0), tensor(0), tensor(3)),\n         (tensor(2), tensor(8), tensor(1), tensor(1), tensor(1), tensor(2), tensor(18)),\n         (tensor(2), tensor(6), tensor(1), tensor(13), tensor(2), tensor(0), tensor(4)),\n          (tensor(1), tensor(8), tensor(2), tensor(8), tensor(5), tensor(0), tensor(7)),\n          (tensor(2), tensor(8), tensor(2), tensor(1), tensor(2), tensor(2), tensor(3)),\n         (tensor(3), tensor(6), tensor(12), tensor(1), tensor(0), tensor(0), tensor(3)),\n         (tensor(1), tensor(16), tensor(2), tensor(0), tensor(0), tensor(0), tensor(4)),\n         (tensor(3), tensor(16), tensor(2), tensor(0), tensor(0), tensor(0), tensor(4))],\n      dtype='object', length=256)] are in the [columns]"