In [10]:
!pip install pyreadr
#Import Datasets
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import time
import numpy as np
import pyreadr  # Needed to read RDS files


# Config
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]
cols_x = ['pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age']
col_y = 'outcome'
folds = 5
res = []

print(f"Running XGBoost on bootstrapped datasets\n" + "-"*50)

def run_cv(X, y):
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1
    )
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)
    t0 = time.time()
    acc = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    return np.mean(acc), time.time() - t0

for sz in sizes:
    print(f"\nDataset Size: {sz}")
    try:
        # Read the correct RDS file
        t0 = time.time()
        result = pyreadr.read_r(f"bootstrap_data_{sz}.rds")
        df = result[None]  # Extract the dataframe
        print(f"Loaded in {time.time() - t0:.2f} seconds")

        if not all(c in df.columns for c in cols_x + [col_y]):
            raise ValueError(f"Missing columns. Found columns: {df.columns.tolist()}")

        X = df[cols_x]
        y = df[col_y].astype(int)
        print("Running 5-fold Cross-Validation...")

        acc, t_cv = run_cv(X, y)
        print(f"Time: {t_cv:.2f} seconds | Accuracy: {acc:.4f}")

        res.append({"Method": "XGBoost (scikit-learn CV)", "Dataset Size": sz, "Accuracy": acc, "Time (s)": t_cv})

    except Exception as e:
        print(f"Error with dataset size {sz}: {e}")
        res.append({"Method": "XGBoost (scikit-learn CV)", "Dataset Size": sz, "Accuracy": "Error", "Time (s)": "Error"})

# Report
print("\n" + "="*50)
print("Summary of Results")
print("="*50)
df_res = pd.DataFrame(res)
print(df_res.to_string(index=False))
df_res.to_csv("xgb_summary_results.csv", index=False)
print("\nResults saved to xgb_summary_results.csv ✅")


Running XGBoost on bootstrapped datasets
--------------------------------------------------

Dataset Size: 100
Loaded in 0.00 seconds
Running 5-fold Cross-Validation...
Time: 2.76 seconds | Accuracy: 0.9300

Dataset Size: 1000
Loaded in 0.01 seconds
Running 5-fold Cross-Validation...
Time: 0.38 seconds | Accuracy: 0.9440

Dataset Size: 10000
Loaded in 0.03 seconds
Running 5-fold Cross-Validation...
Time: 1.12 seconds | Accuracy: 0.9754

Dataset Size: 100000
Loaded in 0.15 seconds
Running 5-fold Cross-Validation...
Time: 3.83 seconds | Accuracy: 0.9868

Dataset Size: 1000000
Loaded in 1.55 seconds
Running 5-fold Cross-Validation...
Time: 38.01 seconds | Accuracy: 0.9919

Dataset Size: 10000000
Loaded in 17.20 seconds
Running 5-fold Cross-Validation...




Time: 401.38 seconds | Accuracy: 0.9932

Summary of Results
                   Method  Dataset Size  Accuracy   Time (s)
XGBoost (scikit-learn CV)           100  0.930000   2.763152
XGBoost (scikit-learn CV)          1000  0.944000   0.378408
XGBoost (scikit-learn CV)         10000  0.975400   1.117135
XGBoost (scikit-learn CV)        100000  0.986760   3.834116
XGBoost (scikit-learn CV)       1000000  0.991872  38.009035
XGBoost (scikit-learn CV)      10000000  0.993176 401.379248

Results saved to xgb_summary_results.csv ✅
