In [1]:
!pip install gdown



In [3]:
import logging
import os
import pickle
import warnings
from datetime import datetime

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Send INFO-and-above records to a log file. `logger` is the root logger,
# so it uses the handler installed by basicConfig directly.
logging.basicConfig(
    filename='training_log_extended.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger()

def log_and_print(message):
    """Record ``message`` on the module logger at INFO level, then print it."""
    for emit in (logger.info, print):
        emit(message)

# Output layout: one sub-directory per model type under the base directory.
BASE_OUTPUT_DIR = 'model_outputs'
MODEL_TYPES = ['xgboost']
for model_type in MODEL_TYPES:
    model_dir = os.path.join(BASE_OUTPUT_DIR, model_type)
    # exist_ok keeps re-running the cell from failing on an existing tree.
    os.makedirs(model_dir, exist_ok=True)

# Load and preprocess data
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and log its dimensions.

    Args:
        file_path: Path handed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    log_and_print("Loading dataset...")
    try:
        frame = pd.read_csv(file_path)
        n_rows, n_cols = frame.shape
        log_and_print(f"Dataset loaded with {n_rows} rows and {n_cols} columns")
    except Exception as e:
        log_and_print(f"Error loading data: {str(e)}")
        raise
    return frame

def preprocess_data(df):
    """Clean the raw dataset and encode features and target for training.

    Outcome labels are collapsed through a fixed mapping ('Ace' -> 'Winner',
    'Double Fault' -> 'Unforced Error'); rows whose outcome is missing or not
    in the mapping are dropped. Missing feature values are filled ('None'
    for categorical columns, 0 for the others), and categorical features as
    well as the target are label-encoded.

    The input DataFrame is not modified.

    Args:
        df: DataFrame with an 'outcome' column and every column named in
            ``feature_columns`` below.

    Returns:
        A tuple ``(df_features, y, categorical_columns, encoders,
        target_encoder)`` where ``df_features`` is the encoded feature
        DataFrame, ``y`` the integer-encoded target array,
        ``categorical_columns`` the list of label-encoded feature names,
        ``encoders`` a dict mapping each of those names to its fitted
        ``LabelEncoder`` and ``target_encoder`` the fitted target encoder.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        # Standardize outcome labels
        outcome_mapping = {
            'Winner': 'Winner', 'Forced Error': 'Forced Error', 'Unforced Error': 'Unforced Error',
            'Ace': 'Winner', 'Double Fault': 'Unforced Error'
        }
        # Map into a separate Series rather than assigning back into `df`,
        # so the caller's DataFrame keeps its original labels.
        outcome = df['outcome'].map(outcome_mapping)
        # Positional boolean mask: avoids index alignment when filtering.
        keep = outcome.notna().to_numpy()
        df_target = outcome[keep]
        log_and_print(f"After cleaning, dataset has {int(keep.sum())} rows")

        # Select features
        feature_columns = [
            'serve_type', 'serve_direction', 'serve_depth', 'is_second_serve',
            'rally_length', 'shot_1_type', 'shot_1_direction', 'shot_1_depth',
            'shot_2_type', 'shot_2_direction', 'shot_2_depth',
            'shot_3_type', 'shot_3_direction', 'shot_3_depth',
            'shot_4_type', 'shot_4_direction', 'shot_4_depth',
            'shot_5_type', 'shot_5_direction', 'shot_5_depth',
            'last_shot_type', 'last_shot_direction', 'last_shot_depth'
        ]
        df_features = df.loc[keep, feature_columns].copy()

        # A column is categorical when it holds Python objects or uses pandas'
        # dedicated string dtype; checking only for 'object' would send
        # string-dtype columns down the numeric branch.
        categorical_columns = [
            col for col in df_features.columns
            if df_features[col].dtype == object
            or isinstance(df_features[col].dtype, pd.StringDtype)
        ]
        categorical_lookup = set(categorical_columns)

        # Fill missing values, then label-encode the categorical columns
        encoders = {}
        for col in df_features.columns:
            if col in categorical_lookup:
                # Cast to str so mixed-type object columns encode consistently.
                values = df_features[col].fillna('None').astype(str)
                encoders[col] = LabelEncoder()
                df_features[col] = encoders[col].fit_transform(values)
            else:
                df_features[col] = df_features[col].fillna(0)

        # Encode target
        target_encoder = LabelEncoder()
        y = target_encoder.fit_transform(df_target)
        log_and_print(f"Class distribution: {dict(zip(target_encoder.classes_, np.bincount(y)))}")

        return df_features, y, categorical_columns, encoders, target_encoder
    except Exception as e:
        log_and_print(f"Error in preprocessing: {str(e)}")
        raise

# Evaluation function for XGBoost
def evaluate_model(model, X_test, y_test, target_encoder, model_type):
    """Score a fitted classifier on the test split and save diagnostic plots.

    Logs a classification report and writes three figures (confusion matrix,
    per-class precision-recall curves, per-class ROC curves) to
    ``{BASE_OUTPUT_DIR}/{model_type}/``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature DataFrame for the test split.
        y_test: Integer-encoded true labels; used to index a one-hot matrix,
            so values must lie in ``range(len(target_encoder.classes_))``.
        target_encoder: Fitted encoder whose ``classes_`` supply class names.
        model_type: Name used in plot titles, log lines and the output path.

    Returns:
        A tuple ``(report, cm, y_true, y_pred, y_scores)``: the report as a
        dict, the confusion matrix, the true labels, the predicted labels and
        the predicted class probabilities.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        class_names = target_encoder.classes_
        # Pin the label set so rows/columns always line up with class_names,
        # even if a class is absent from both y_true and y_pred.
        labels = np.arange(len(class_names))

        # Prepare test data (column-wise stack keeps NumPy's dtype promotion)
        X_test_np = np.hstack([X_test[cat].values.reshape(-1, 1) for cat in X_test.columns])

        # Predictions
        y_true = y_test
        y_pred = model.predict(X_test_np)
        y_scores = model.predict_proba(X_test_np)

        # One-hot view of the true labels for the one-vs-rest curves
        y_true_bin = np.eye(len(class_names))[y_true]

        # Classification report (dict for the caller, text for the log)
        report = classification_report(y_true, y_pred, labels=labels,
                                       target_names=class_names, output_dict=True)
        log_and_print(f"\n[{model_type}] Classification Report:")
        log_and_print(classification_report(y_true, y_pred, labels=labels,
                                            target_names=class_names))

        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.title(f'Confusion Matrix - {model_type}')
        plt.savefig(f'{BASE_OUTPUT_DIR}/{model_type}/confusion_matrix.png')
        plt.close()

        # Precision-Recall curves
        plt.figure(figsize=(10, 8))
        for i, class_name in enumerate(class_names):
            precision, recall, _ = precision_recall_curve(y_true_bin[:, i], y_scores[:, i])
            # Average precision weights each precision by its recall step;
            # a plain mean over the curve points is not AP.
            ap = average_precision_score(y_true_bin[:, i], y_scores[:, i])
            plt.plot(recall, precision, label=f'{class_name} (AP={ap:.2f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {model_type}')
        plt.legend()
        plt.savefig(f'{BASE_OUTPUT_DIR}/{model_type}/precision_recall_curve.png')
        plt.close()

        # ROC curves
        plt.figure(figsize=(10, 8))
        for i, class_name in enumerate(class_names):
            fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_scores[:, i])
            auc_score = roc_auc_score(y_true_bin[:, i], y_scores[:, i])
            plt.plot(fpr, tpr, label=f'{class_name} (AUC={auc_score:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level reference diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_type}')
        plt.legend()
        plt.savefig(f'{BASE_OUTPUT_DIR}/{model_type}/roc_curve.png')
        plt.close()

        return report, cm, y_true, y_pred, y_scores
    except Exception as e:
        log_and_print(f"Error in evaluation {model_type}: {str(e)}")
        raise

# Precision-Recall imbalance analysis
def analyze_precision_recall(y_true, y_pred, y_scores, target_encoder, model_type):
    """Compare per-class precision and recall against class frequency.

    Precision and recall are computed from the hard predictions ``y_pred``
    for every class in ``target_encoder.classes_``. A grouped bar chart is
    saved to ``{BASE_OUTPUT_DIR}/{model_type}/precision_recall_comparison.png``
    and the numbers are logged.

    Args:
        y_true: Integer-encoded true labels.
        y_pred: Integer-encoded predicted labels.
        y_scores: Predicted class probabilities. Accepted for interface
            compatibility; not used by this function.
        target_encoder: Fitted encoder whose ``classes_`` supply class names.
        model_type: Name used in the plot title, log lines and output path.

    Returns:
        Dict mapping each class name to a dict with keys ``'precision'``,
        ``'recall'`` and ``'class_freq'`` (share of ``y_true`` in that class).

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        class_names = list(target_encoder.classes_)
        labels = np.arange(len(class_names))

        # Per-class precision/recall of the actual predictions. Averaging the
        # points of a precision-recall curve does not yield these metrics.
        # zero_division=0 reports 0 for a class that is never predicted or
        # never present instead of emitting a warning.
        class_precision, class_recall, _, class_support = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, zero_division=0
        )
        n_samples = len(y_true)
        analysis = {
            class_name: {
                'precision': class_precision[i],
                'recall': class_recall[i],
                'class_freq': class_support[i] / n_samples
            }
            for i, class_name in enumerate(class_names)
        }

        # Visualize precision-recall differences
        plt.figure(figsize=(10, 6))
        classes = list(analysis.keys())
        precisions = [analysis[c]['precision'] for c in classes]
        recalls = [analysis[c]['recall'] for c in classes]
        x = np.arange(len(classes))
        width = 0.35

        plt.bar(x - width/2, precisions, width, label='Precision')
        plt.bar(x + width/2, recalls, width, label='Recall')
        plt.xticks(x, classes)
        plt.ylabel('Score')
        plt.title(f'Precision vs Recall by Class - {model_type}')
        plt.legend()
        plt.savefig(f'{BASE_OUTPUT_DIR}/{model_type}/precision_recall_comparison.png')
        plt.close()

        # Log analysis
        log_and_print(f"\n[{model_type}] Precision-Recall Analysis:")
        for class_name, metrics in analysis.items():
            log_and_print(f"{class_name}: Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, Class Freq={metrics['class_freq']:.4f}")

        return analysis
    except Exception as e:
        log_and_print(f"Error in precision-recall analysis: {str(e)}")
        raise

# XGBoost implementation with hyperparameter tuning
def train_xgboost(X_train, y_train, X_test, y_test, categorical_columns, target_encoder):
    """Tune an XGBoost classifier with randomized search and return the best fit.

    Runs a 20-candidate, 3-fold ``RandomizedSearchCV`` scored by weighted F1,
    logs the winning parameters, saves a feature-importance bar chart to
    ``{BASE_OUTPUT_DIR}/xgboost/feature_importance.png`` and logs the five
    most important features.

    Args:
        X_train: Training feature DataFrame (already numerically encoded).
        y_train: Integer-encoded training labels.
        X_test, y_test, categorical_columns, target_encoder: Accepted for
            interface compatibility; not used by this function.

    Returns:
        The best estimator found by the search (``search.best_estimator_``).

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        X_train_np = np.hstack([X_train[cat].values.reshape(-1, 1) for cat in X_train.columns])

        # Hyperparameter tuning
        param_dist = {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.3],
            'n_estimators': [100, 200, 300],
            'subsample': [0.7, 0.8, 0.9],
            'colsample_bytree': [0.7, 0.8, 0.9]
        }

        xgb = XGBClassifier(random_state=42)
        # Seed the search as well: without random_state the sampled
        # candidates differ between runs even though the estimator is seeded.
        search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=3,
                                    scoring='f1_weighted', n_jobs=-1, random_state=42)
        search.fit(X_train_np, y_train)

        best_model = search.best_estimator_
        log_and_print(f"Best XGBoost parameters: {search.best_params_}")

        # Feature importance, plotted in descending order
        feature_names = X_train.columns
        importance = best_model.feature_importances_
        plt.figure(figsize=(12, 6))
        sorted_idx = np.argsort(importance)[::-1]
        plt.bar(range(len(importance)), importance[sorted_idx])
        plt.xticks(range(len(importance)), feature_names[sorted_idx], rotation=45)
        plt.title('XGBoost Feature Importance')
        plt.tight_layout()
        plt.savefig(f'{BASE_OUTPUT_DIR}/xgboost/feature_importance.png')
        plt.close()

        # Log feature importance analysis
        log_and_print("\nTop 5 important features:")
        for idx in sorted_idx[:5]:
            log_and_print(f"{feature_names[idx]}: {importance[idx]:.4f}")

        return best_model
    except Exception as e:
        log_and_print(f"Error in XGBoost training: {str(e)}")
        raise

# Main execution
def main():
    """Run the full pipeline: download, preprocess, train, evaluate, save.

    Downloads the dataset from Google Drive with ``gdown``, preprocesses it,
    makes a stratified 80/20 split, saves the test split as CSV, trains and
    evaluates the XGBoost model, then pickles the evaluation results, the
    trained model, the encoders and the feature column list under
    ``BASE_OUTPUT_DIR``.

    Raises:
        RuntimeError: If ``gdown.download`` reports failure by returning None.
        Exception: Any failure is logged and then re-raised unchanged.
    """
    start_time = datetime.now()
    CONFIG = {
        'log_metrics_file': '/kaggle/working/metrics.csv',  # not read in this function
        'gdrive_file_id': '16IH03soaKK15gvOO4t84ohCP-n2abCYV',
        'csv_file_name': 'dataset_subset.csv',
    }

    try:
        # Load and preprocess data
        file_id = CONFIG['gdrive_file_id']
        output_path = os.path.join('/kaggle/working', CONFIG['csv_file_name'])
        downloaded = gdown.download(f'https://drive.google.com/uc?id={file_id}', output_path, quiet=False)
        if downloaded is None:
            # Fail here rather than silently loading a stale file left at
            # output_path by an earlier run.
            raise RuntimeError(f"Download of Google Drive file {file_id} failed")
        df = load_data(output_path)
        X, y, categorical_columns, encoders, target_encoder = preprocess_data(df)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        log_and_print(f"Training set: {len(X_train)} samples, Test set: {len(X_test)} samples")

        # Save testing dataset as CSV (features stay encoded, labels are decoded)
        test_df = X_test.copy()
        test_df['outcome'] = target_encoder.inverse_transform(y_test)
        test_csv_path = os.path.join(BASE_OUTPUT_DIR, 'test_dataset.csv')
        test_df.to_csv(test_csv_path, index=False)
        log_and_print(f"Testing dataset saved to {test_csv_path}")

        # Train and evaluate XGBoost
        log_and_print("\nTraining XGBoost model...")
        xgb_model = train_xgboost(X_train, y_train, X_test, y_test, categorical_columns, target_encoder)
        report, cm, y_true, y_pred, y_scores = evaluate_model(xgb_model, X_test, y_test, target_encoder, 'xgboost')
        pr_analysis = analyze_precision_recall(y_true, y_pred, y_scores, target_encoder, 'xgboost')
        results = {
            'xgboost': {
                'report': report,
                'cm': cm,
                'pr_analysis': pr_analysis
            }
        }

        # Save artifacts
        output_dir = f'{BASE_OUTPUT_DIR}/xgboost'
        with open(os.path.join(output_dir, 'results.pkl'), 'wb') as f:
            pickle.dump(results['xgboost'], f)
        # Persist the fitted model alongside the encoders and column list,
        # which are of no use for later prediction without it.
        with open(os.path.join(output_dir, 'model.pkl'), 'wb') as f:
            pickle.dump(xgb_model, f)
        with open(os.path.join(BASE_OUTPUT_DIR, 'target_encoder.pkl'), 'wb') as f:
            pickle.dump(target_encoder, f)
        with open(os.path.join(BASE_OUTPUT_DIR, 'feature_encoders.pkl'), 'wb') as f:
            pickle.dump(encoders, f)
        with open(os.path.join(BASE_OUTPUT_DIR, 'feature_columns.pkl'), 'wb') as f:
            pickle.dump(X.columns.tolist(), f)

        log_and_print(f"\nTotal execution time: {datetime.now() - start_time}")
        log_and_print(f"Artifacts saved in {BASE_OUTPUT_DIR}/")

    except Exception as e:
        log_and_print(f"Error in main execution: {str(e)}")
        raise


# Run only when executed as a script or notebook cell, not when imported.
if __name__ == "__main__":
    main()

Downloading...
From (original): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV
From (redirected): https://drive.google.com/uc?id=16IH03soaKK15gvOO4t84ohCP-n2abCYV&confirm=t&uuid=1be45db0-55e3-44e4-b737-07eb6efae209
To: /kaggle/working/dataset_subset.csv
100%|██████████| 143M/143M [00:01<00:00, 120MB/s]  


Loading dataset...
Dataset loaded with 991359 rows and 41 columns
After cleaning, dataset has 960585 rows
Class distribution: {'Forced Error': 308713, 'Unforced Error': 338515, 'Winner': 313357}
Training set: 768468 samples, Test set: 192117 samples
Testing dataset saved to model_outputs/test_dataset.csv

Training XGBoost model...
Best XGBoost parameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

Top 5 important features:
is_second_serve: 0.2752
last_shot_depth: 0.1409
last_shot_type: 0.1064
last_shot_direction: 0.0842
shot_2_direction: 0.0802

[xgboost] Classification Report:
                precision    recall  f1-score   support

  Forced Error       0.84      0.72      0.77     61743
Unforced Error       0.65      0.79      0.71     67703
        Winner       0.78      0.72      0.75     62671

      accuracy                           0.74    192117
     macro avg       0.76      0.74      0.74    192117
  weighted avg  