In [10]:
#modeling file
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             roc_curve, precision_recall_curve)
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

# Set random seed for reproducibility
RANDOM_STATE = 1234


In [11]:
# Load data
data = pd.read_csv('./thesis_data.csv')

In [12]:
data.columns

Index(['room', 'game', 'hand', 'won', 'player0_preflop_fold',
       'player0_flop_fold', 'player0_turn_fold', 'player0_river_fold',
       'player0_bet', 'player0_preflop_bet', 'player0_flop_bet',
       'player0_turn_bet', 'player0_river_bet', 'opponent_preflop_bets',
       'opponent_flop_bets', 'opponent_turn_bets', 'opponent_river_bets',
       'opponent_bet', 'starthandstrategy', 'flop_hand', 'turn_hand',
       'river_hand'],
      dtype='object')

In [13]:
# 1. Data Preparation Function
def prepare_stage_data(data, stage_features, target_column, previous_target=None):
    """
    Prepare data for a specific game stage
    """
    if previous_target:
        # Filter out players who folded in previous stages
        stage_data = data[data[previous_target] == 0].copy()
    else:
        stage_data = data.copy()

    X = stage_data[stage_features]
    y = stage_data[target_column]

    # Check class imbalance
    class_ratio = y.value_counts(normalize=True)
    print(f"Class distribution: {class_ratio.to_dict()}")

    return X, y



In [14]:
# 2. Model Training and Evaluation Function
def train_evaluate_model(X_train, X_test, y_train, y_test, model, model_name, stage_name):
    """
    Train, evaluate, and visualize model performance
    """
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Fold', 'Fold'],
                yticklabels=['Not Fold', 'Fold'])
    plt.title(f'{model_name} - {stage_name} Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig(f'{model_name}_{stage_name}_confusion_matrix.png', bbox_inches='tight')
    plt.close()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {metrics["roc_auc"]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - {stage_name} ROC Curve')
    plt.legend()
    plt.savefig(f'{model_name}_{stage_name}_roc_curve.png', bbox_inches='tight')
    plt.close()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label=f'{model_name}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_name} - {stage_name} Precision-Recall Curve')
    plt.legend()
    plt.savefig(f'{model_name}_{stage_name}_pr_curve.png', bbox_inches='tight')
    plt.close()

    return metrics, model





In [15]:
# 3. Feature Importance Analysis
def plot_feature_importance(model, feature_names, model_name, stage_name, top_n=20):
    """
    Plot feature importance for tree-based models
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[-top_n:]

        plt.figure(figsize=(10, 8))
        plt.title(f'{model_name} - {stage_name} Feature Importance')
        plt.barh(range(len(indices)), importances[indices], align='center')
        plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
        plt.xlabel('Relative Importance')
        plt.tight_layout()
        plt.savefig(f'{model_name}_{stage_name}_feature_importance.png', bbox_inches='tight')
        plt.close()

        # Return top features
        return pd.Series(importances, index=feature_names).sort_values(ascending=False).head(top_n)
    return None

In [16]:
# 4. Hyperparameter Tuning with Class Weight Handling
def tune_model(X_train, y_train, model_type='rf'):
    """
    Hyperparameter tuning with handling for class imbalance
    """
    # Calculate class weights
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    if model_type == 'lr':
        # Logistic Regression pipeline with scaling
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
        ])

        param_grid = {
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__penalty': ['l1', 'l2'],
            'clf__solver': ['liblinear'],
            'clf__class_weight': [None, class_weights]
        }

    elif model_type == 'rf':
        pipeline = RandomForestClassifier(random_state=RANDOM_STATE)
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'class_weight': [None, 'balanced', class_weights]
        }

    elif model_type == 'gb':
        pipeline = GradientBoostingClassifier(random_state=RANDOM_STATE)
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0]
        }

    # Use stratified K-Fold for imbalanced data
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best ROC AUC: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_





In [17]:
# 5. Handle Class Imbalance
def balance_data(X_train, y_train, method='smote'):
    """
    Handle class imbalance using different techniques
    """
    if method == 'smote':
        smote = SMOTE(random_state=RANDOM_STATE)
        X_res, y_res = smote.fit_resample(X_train, y_train)
    elif method == 'class_weight':
        return X_train, y_train  # Weights handled in model
    else:
        # Random oversampling
        from imblearn.over_sampling import RandomOverSampler
        ros = RandomOverSampler(random_state=RANDOM_STATE)
        X_res, y_res = ros.fit_resample(X_train, y_train)

    return X_res, y_res

In [18]:
# =============== MAIN EXECUTION ===============

allfeatures = ['room', 'game', 'hand', 'won', 'player0_preflop_fold',
       'player0_flop_fold', 'player0_turn_fold', 'player0_river_fold',
       'player0_bet', 'player0_preflop_bet', 'player0_flop_bet',
       'player0_turn_bet', 'player0_river_bet', 'opponent_preflop_bets',
       'opponent_flop_bets', 'opponent_turn_bets', 'opponent_river_bets',
       'opponent_bet', 'starthandstrategy', 'flop_hand', 'turn_hand',
       'river_hand']

# Define features for each stage
preflop_features = ['player0_preflop_bet', 'opponent_preflop_bets', 'starthandstrategy']  
flop_features = ['player0_preflop_bet', 'opponent_preflop_bets', 'starthandstrategy',
                 'player0_flop_bet', 'opponent_flop_bets', 'flop_hand']     
turn_features = ['player0_preflop_bet', 'opponent_preflop_bets', 'starthandstrategy',
                 'player0_flop_bet', 'opponent_flop_bets', 'flop_hand',
                 'player0_turn_bet', 'opponent_turn_bets', 'turn_hand']     
river_features = ['player0_preflop_bet', 'opponent_preflop_bets', 'starthandstrategy',
                 'player0_flop_bet', 'opponent_flop_bets', 'flop_hand',
                 'player0_turn_bet', 'opponent_turn_bets', 'turn_hand',
                 'player0_bet','opponent_river_bets', 'river_hand']    

stages = [
    {'name': 'Pre-flop', 'features': preflop_features,
     'target': 'player0_preflop_fold', 'previous_target': None},

    {'name': 'Flop', 'features': flop_features,
     'target': 'player0_flop_fold', 'previous_target': 'player0_preflop_fold'},

    {'name': 'Turn', 'features': turn_features,
     'target': 'player0_turn_fold', 'previous_target': 'player0_flop_fold'},

    {'name': 'River', 'features': river_features,
     'target': 'player0_river_fold', 'previous_target': 'player0_turn_fold'}
]

# Models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

In [19]:
# Store results
results = {}

for stage in stages:
    print(f"\n{'='*50}")
    print(f"PROCESSING STAGE: {stage['name']}")
    print(f"{'='*50}")

    # Prepare data
    X, y = prepare_stage_data(
        data,
        stage['features'],
        stage['target'],
        stage['previous_target']
    )

    # Train-test split (stratified for class imbalance)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Handle class imbalance
    X_train_bal, y_train_bal = balance_data(X_train, y_train, method='smote')

    stage_results = {}

    for model_name, model in models.items():
        print(f"\nTraining {model_name} for {stage['name']}")

        # Hyperparameter tuning
        model_type = 'lr' if 'Logistic' in model_name else 'rf' if 'Random' in model_name else 'gb'
        tuned_model = tune_model(X_train_bal, y_train_bal, model_type)

        # Train and evaluate
        metrics, trained_model = train_evaluate_model(
            X_train_bal, X_test, y_train_bal, y_test,
            tuned_model, model_name, stage['name']
        )

        # Store metrics
        stage_results[model_name] = metrics

        # Feature importance for tree-based models
        if 'Forest' in model_name or 'Boosting' in model_name:
            feature_imp = plot_feature_importance(
                trained_model, X.columns, model_name, stage['name']
            )
            print(f"\nTop features for {model_name} at {stage['name']}:\n{feature_imp}")

    results[stage['name']] = stage_results


PROCESSING STAGE: Pre-flop
Class distribution: {0: 0.9175642087821044, 1: 0.08243579121789561}


ValueError: could not convert string to float: 'Call'

In [None]:
# Print final results
print("\n\nFINAL RESULTS:")
for stage, stage_models in results.items():
    print(f"\n{stage} Stage:")
    for model_name, metrics in stage_models.items():
        print(f"{model_name}:")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1']:.4f}")
        print(f"  ROC AUC: {metrics['roc_auc']:.4f}")

# Save results to CSV
results_df = pd.DataFrame.from_dict({(i,j): results[i][j]
                                   for i in results.keys()
                                   for j in results[i].keys()},
                                  orient='index')
results_df.to_csv('model_results.csv')