In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
import pickle
import warnings
# Silence every Python warning for the rest of the session. NOTE(review): this
# also hides deprecation/convergence warnings from pandas, scikit-learn and
# TensorFlow, so re-enable warnings when debugging unexpected results.
warnings.filterwarnings('ignore')

# Set display options for better notebook rendering
# max_columns=None makes pandas show every column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Apply matplotlib's built-in 'ggplot' style sheet to all figures created afterwards.
plt.style.use('ggplot')

# 1. Data Loading and Exploration
def load_and_explore_data(file_path):
    """
    Load the dataset and perform initial exploration.

    Reads the CSV with ``pd.read_csv`` and reports, in order: the frame's
    shape, a summary of the ``Season`` column (when present), the positive
    count of each known target column (when present), the first rows, and the
    per-column count of missing values.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``) of
            the CSV file to load.

    Returns:
        The loaded DataFrame, exactly as returned by ``pd.read_csv``.
    """
    # ``display`` is only injected by IPython/Jupyter; fall back to ``print``
    # so the function also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    print("Loading dataset...")
    df = pd.read_csv(file_path)

    # Display basic info
    print(f"Dataset shape: {df.shape}")
    if 'Season' in df.columns:
        print(f"Number of seasons: {df['Season'].nunique()}")
        print(f"Seasons included: {sorted(df['Season'].unique())}")
    else:
        print("Warning: 'Season' column not found; skipping season summary.")

    # Display info about target variables
    target_columns = ['Tournament Winner?', 'Tournament Championship?', 'Final Four?']
    for col in target_columns:
        if col not in df.columns:
            continue
        true_count = df[col].sum()
        total_count = df[col].count()  # non-missing values only
        if total_count == 0:
            # Avoid a meaningless nan% when the column holds no values at all.
            print(f"{col}: no non-missing values")
            continue
        print(f"{col}: {true_count} out of {total_count} ({true_count/total_count*100:.2f}%)")

    # Quick look at data
    show(df.head())

    # Check for missing values
    missing_values = df.isnull().sum()
    print("\nColumns with missing values:")
    show(missing_values[missing_values > 0])

    return df

# 2. Feature Engineering and Selection
def engineer_features(df):
    """
    Engineer and select features for the model.

    Steps, all applied to a copy of ``df``:

    1. Mean-impute missing values in ``float64``/``int64`` columns. The
       target columns are deliberately left untouched: imputing a 0/1 label
       with its mean would create fractional labels.
    2. Add ratio/difference features when their source columns exist.
    3. Add ``Tournament_Experience``: for each (team, season) row, the number
       of earlier seasons in this dataset in which the same
       ``Full Team Name`` appears.
    4. One-hot encode whichever of the known categorical columns are present.
    5. Build the feature list from all remaining numeric/boolean columns,
       excluding targets and identifier columns.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        Tuple ``(df_processed, features)`` where ``df_processed`` is the
        transformed copy and ``features`` is the list of feature column names.
    """
    print("Engineering features...")

    target_columns = ['Tournament Winner?', 'Tournament Championship?', 'Final Four?']
    exclude_columns = target_columns + ['Full Team Name', 'Mapped ESPN Team Name',
                                        'Since', 'Current Coach']

    # Create copy to avoid modifying original
    df_processed = df.copy()

    # Handling missing values (features only; never impute the labels)
    numeric_columns = df_processed.select_dtypes(include=['float64', 'int64']).columns
    impute_columns = [col for col in numeric_columns if col not in target_columns]
    if impute_columns:
        df_processed[impute_columns] = df_processed[impute_columns].fillna(
            df_processed[impute_columns].mean()
        )

    # Feature engineering - Create some ratio features
    # Offensive to Defensive efficiency ratio
    if {'Adjusted Offensive Efficiency', 'Adjusted Defensive Efficiency'}.issubset(df_processed.columns):
        df_processed['Off_Def_Efficiency_Ratio'] = (
            df_processed['Adjusted Offensive Efficiency']
            / df_processed['Adjusted Defensive Efficiency']
        )

    # Experience to average height ratio (team experience level per height)
    if {'Experience', 'AvgHeight'}.issubset(df_processed.columns):
        df_processed['Experience_Height_Ratio'] = (
            df_processed['Experience'] / df_processed['AvgHeight']
        )

    # Net possession length: defensive minus offensive average possession length
    if {'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)'}.issubset(df_processed.columns):
        df_processed['Net_Possession_Length'] = (
            df_processed['Avg Possession Length (Defense)']
            - df_processed['Avg Possession Length (Offense)']
        )

    # Tournament experience feature
    if {'Season', 'Full Team Name'}.issubset(df_processed.columns):
        # groupby sorts by (team, season), so cumcount() within a team is the
        # number of earlier seasons that team appears in.
        team_appearances = (
            df_processed.groupby(['Full Team Name', 'Season']).size().reset_index(name='count')
        )
        team_appearances['Tournament_Experience'] = (
            team_appearances.groupby('Full Team Name').cumcount()
        )
        # Merge back to the main dataframe
        df_processed = df_processed.merge(
            team_appearances[['Full Team Name', 'Season', 'Tournament_Experience']],
            on=['Full Team Name', 'Season'], how='left'
        )
        # Rows whose team/season key is missing get no match; treat as no experience.
        df_processed['Tournament_Experience'] = df_processed['Tournament_Experience'].fillna(0)

    # Convert categorical features with one-hot encoding. Only encode the
    # columns that exist so a missing optional column does not raise KeyError.
    categorical_cols = [
        col for col in ['Short Conference Name', 'Region', 'Post-Season Tournament']
        if col in df_processed.columns
    ]
    if categorical_cols:
        df_processed = pd.get_dummies(df_processed, columns=categorical_cols)

    # Feature selection - excluding target variables and non-predictive columns
    candidates = [col for col in df_processed.columns if col not in exclude_columns]
    # Keep only numeric/boolean columns: leftover string columns cannot be
    # correlated or scaled downstream.
    usable = set(df_processed[candidates].select_dtypes(include=['number', 'bool']).columns)
    features = [col for col in candidates if col in usable]
    skipped = [col for col in candidates if col not in usable]
    if skipped:
        print(f"Skipping non-numeric columns: {skipped}")

    # Show feature count
    print(f"Total features after engineering: {len(features)}")

    return df_processed, features

# 3. Define target and prepare train/test sets
def prepare_train_test_data(df, features, target_column, test_size=0.2, random_state=42):
    """
    Prepare training and testing datasets for a specific target.

    The caller's ``df`` is never modified; all cleaning happens on a copy of
    the feature and target columns.

    Steps:
        1. Fill missing target values with False/0 and convert boolean
           targets to integers.
        2. Drop rows with any missing feature value.
        3. If positives are under 10% of the rows and negatives outnumber
           them more than 10:1, undersample negatives to 3x the positives.
           NOTE(review): this happens before the split, so the test set is
           undersampled too and its metrics will not reflect the original
           class distribution.
        4. Print the 20 features with the highest absolute correlation with
           the target (diagnostic only).
        5. Stratified train/test split, then standardise features with a
           scaler fitted on the training split only.

    Args:
        df: DataFrame containing the feature and target columns.
        features: List of feature column names.
        target_column: Name of the binary target column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed used for undersampling and splitting.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler,
        features)``.

    Raises:
        ValueError: If the target column is missing, no complete rows remain
            after dropping missing values, or there are no positive samples.
    """
    print(f"Preparing training and testing data for {target_column}...")

    # Make sure target column exists
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in dataset")

    # ``display`` only exists under IPython/Jupyter; fall back to ``print``.
    try:
        show = display
    except NameError:
        show = print

    feature_list = list(features)
    # Private copy so the caller's DataFrame is left untouched.
    data = df[feature_list + [target_column]].copy()

    # Handle missing values in target
    target = data[target_column]
    missing_targets = int(target.isna().sum())
    if missing_targets:
        print(f"Warning: {missing_targets} missing values in target. Filling with False.")
        # Use 0 for numeric targets so the column keeps a numeric dtype.
        fill_value = 0 if pd.api.types.is_numeric_dtype(target) else False
        target = target.fillna(fill_value)

    # An object column holding only booleans becomes a real bool column here.
    if target.dtype == object:
        target = target.infer_objects()

    # Convert boolean to int
    if target.dtype == bool:
        target = target.astype(int)
    data[target_column] = target

    # Select only complete rows
    valid_rows = data.dropna()
    if valid_rows.empty:
        raise ValueError(
            f"No complete rows available for target '{target_column}' after dropping missing values"
        )

    # Handle potential class imbalance
    positive_samples = valid_rows[valid_rows[target_column] == 1]
    negative_samples = valid_rows[valid_rows[target_column] == 0]

    print(f"Positive samples: {len(positive_samples)}, Negative samples: {len(negative_samples)}")

    if len(positive_samples) == 0:
        raise ValueError(f"No positive samples found for target '{target_column}'")

    selected_rows = valid_rows
    # If severe imbalance, consider balancing techniques
    if len(positive_samples) / len(valid_rows) < 0.1:
        print("Warning: Severe class imbalance detected")
        # Option: Undersample majority class if enough data
        if len(negative_samples) > 10 * len(positive_samples):
            neg_sample = negative_samples.sample(
                n=min(len(positive_samples) * 3, len(negative_samples)),
                random_state=random_state
            )
            selected_rows = pd.concat([positive_samples, neg_sample])
            print(f"Balanced dataset: {len(selected_rows)} samples")

    X = selected_rows[feature_list]
    y = selected_rows[target_column]

    # Identify potentially important features through correlation
    top_corr = pd.DataFrame({'feature': feature_list,
                             'correlation': [abs(X[f].corr(y)) for f in feature_list]})
    top_features = top_corr.sort_values('correlation', ascending=False).head(20)

    print("\nTop correlated features:")
    show(top_features)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Normalize features (fit on the training split only to avoid leakage)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, features

# 4. Build and train the model
def build_and_train_model(X_train, y_train, X_test, y_test, input_dim, class_weight=None):
    """
    Build and train a TensorFlow model for the given dataset.

    The network is a three-hidden-layer MLP (128 -> 64 -> 32 units, ReLU,
    batch normalisation, dropout after the first two blocks) with a single
    sigmoid output, trained with Adam and binary cross-entropy. The last 20%
    of the training data is held out by Keras as a validation set, which
    drives early stopping and learning-rate reduction.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels (binary).
        X_test: Test feature matrix, used only for the final evaluation printout.
        y_test: Test labels.
        input_dim: Number of input features (columns of ``X_train``).
        class_weight: Optional mapping of class index to weight, forwarded to
            ``model.fit``.

    Returns:
        Tuple ``(model, history)``: the trained Keras model and the History
        object returned by ``model.fit``.
    """
    print("Building and training the model...")

    # Set up callbacks for early stopping and learning rate reduction
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=10, restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001
        )
    ]

    # Build the model. The input shape is declared with an explicit Input
    # object rather than the deprecated ``input_dim`` layer argument.
    model = keras.Sequential([
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model. Metrics are named explicitly so the history keys are
    # always 'auc' / 'precision' / 'recall'; without names Keras may append a
    # counter ('auc_1', ...) when several models are built in one session,
    # which hides them from code that looks the keys up by name.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.AUC(name='auc'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
        ]
    )

    # Display model summary
    model.summary()

    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=callbacks,
        class_weight=class_weight,
        verbose=2
    )

    # Evaluate the model; results follow the compile order: loss, accuracy, ...
    test_results = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {test_results[0]:.4f}")
    print(f"Test accuracy: {test_results[1]:.4f}")

    return model, history

# 5. Visualize training history
def visualize_training_history(history):
    """
    Visualize the training and validation metrics.

    Draws one figure with accuracy and loss curves and, when an AUC metric
    was recorded, a second figure with AUC and precision/recall curves.
    Validation curves are drawn only when the matching ``val_*`` series is
    present, so histories trained without validation data still plot.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (as returned by ``model.fit``). Must contain
            ``'accuracy'`` and ``'loss'``.
    """
    metrics = history.history

    def with_validation(key):
        # Pair a training series with its validation counterpart when recorded.
        curves = [(key, 'Train')]
        if 'val_' + key in metrics:
            curves.append(('val_' + key, 'Validation'))
        return curves

    def plot_panel(ax, curves, title, ylabel):
        # Draw every (history key, legend label) pair on one axes.
        for key, label in curves:
            ax.plot(metrics[key], label=label)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        ax.legend(loc='best')

    # Plot training & validation accuracy and loss
    _, axes = plt.subplots(1, 2, figsize=(15, 5))
    plot_panel(axes[0], with_validation('accuracy'), 'Model Accuracy', 'Accuracy')
    plot_panel(axes[1], with_validation('loss'), 'Model Loss', 'Loss')
    plt.tight_layout()
    plt.show()

    # Plot AUC and other metrics if available
    auc_key = _resolve_metric_key(metrics, 'auc')
    if auc_key is None:
        return

    _, axes = plt.subplots(1, 2, figsize=(15, 5))
    plot_panel(axes[0], with_validation(auc_key), 'Model AUC', 'AUC')

    # Plot precision and recall
    precision_key = _resolve_metric_key(metrics, 'precision')
    recall_key = _resolve_metric_key(metrics, 'recall')
    if precision_key is not None and recall_key is not None:
        curves = [(precision_key, 'Precision'), (recall_key, 'Recall')]
        for key, label in ((precision_key, 'Val Precision'), (recall_key, 'Val Recall')):
            if 'val_' + key in metrics:
                curves.append(('val_' + key, label))
        plot_panel(axes[1], curves, 'Precision and Recall', 'Score')
    else:
        # Nothing to draw on the second panel; hide the empty axes.
        axes[1].axis('off')

    plt.tight_layout()
    plt.show()


def _resolve_metric_key(history_dict, base_name):
    """Return the history key for ``base_name``, accepting a numeric suffix such as ``auc_1``; None if absent."""
    if base_name in history_dict:
        return base_name
    prefix = base_name + '_'
    for key in history_dict:
        if key.startswith(prefix) and key[len(prefix):].isdigit():
            return key
    return None

# 6. Make predictions and evaluate
def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model's performance with detailed metrics and visualizations.

    Predicts probabilities for ``X_test``, thresholds them at 0.5, prints
    accuracy, the confusion matrix and the classification report, then shows
    a confusion-matrix heatmap and the ROC curve.

    Args:
        model: Trained model exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True binary labels for ``X_test``.

    Returns:
        Tuple ``(y_pred, y_pred_prob)``: thresholded integer predictions and
        the raw outputs of ``model.predict``.
    """
    print("Evaluating model performance...")

    # Raw model outputs, then hard labels at the 0.5 threshold
    probabilities = model.predict(X_test)
    labels = (probabilities > 0.5).astype(int)

    # Text metrics
    matrix = confusion_matrix(y_test, labels)
    print(f"Accuracy: {accuracy_score(y_test, labels):.4f}")
    print("\nConfusion Matrix:")
    print(matrix)
    print("\nClassification Report:")
    print(classification_report(y_test, labels))

    # Confusion-matrix heatmap
    class_names = ['Negative', 'Positive']
    _, heat_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=heat_ax)
    heat_ax.set_xlabel('Predicted')
    heat_ax.set_ylabel('Actual')
    heat_ax.set_title('Confusion Matrix')
    plt.show()

    # ROC curve with the chance diagonal for reference
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, probabilities)
    area = auc(false_pos_rate, true_pos_rate)

    _, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
                label=f'ROC curve (area = {area:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    roc_ax.legend(loc="lower right")
    plt.show()

    return labels, probabilities

# 7. Create a prediction function
def predict_team_performance(model, scaler, features, team_data):
    """
    Predict performance for new teams.

    Args:
        model: Trained model exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``.
        features: Column names to select from ``team_data``, in the order the
            scaler and model expect.
        team_data: DataFrame holding at least the ``features`` columns.

    Returns:
        Whatever ``model.predict`` returns for the scaled feature matrix.
    """
    # Select the model's columns, apply the stored scaling, then predict.
    scaled_inputs = scaler.transform(team_data[features])
    return model.predict(scaled_inputs)

# 8. Feature importance analysis
def analyze_feature_importance(model, features):
    """
    Analyze which features are most important for predictions.

    Importance is the mean absolute value of each row of the first layer's
    first weight array. This is a rough heuristic, not a model-agnostic
    importance measure.

    Args:
        model: Model whose ``layers[0].get_weights()[0]`` is a 2-D weight
            array with one row per feature (assumed to be the first dense
            layer's kernel; confirm against the model definition).
        features: Feature names, in the same order as the weight rows.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns, sorted by
        descending importance.
    """
    # Average the absolute first-layer weights over axis 1, giving one score per row
    first_layer_kernel = model.layers[0].get_weights()[0]
    mean_abs_weight = np.mean(np.abs(first_layer_kernel), axis=1)

    ranking = pd.DataFrame({'Feature': features, 'Importance': mean_abs_weight})
    ranking = ranking.sort_values('Importance', ascending=False)

    # Bar chart of the 20 highest-scoring features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=ranking.head(20))
    plt.title('Top 20 Most Important Features')
    plt.tight_layout()
    plt.show()

    return ranking

2025-04-23 00:05:18.575566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745366718.888642      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745366718.971944      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
