In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [2]:
# Read the Week 11 dataset from the working directory into `df`, then show
# the first five rows as a quick sanity check of the columns and values.
df = pd.read_csv(
    filepath_or_buffer="Week_11_data.csv",
)
df.head(n=5)

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,1,145,74,10,156,40.5,0.123,22,0
1,7,176,50,41,84,27.5,0.496,24,1
2,6,103,78,60,278,35.9,0.718,61,1
3,1,97,58,27,326,27.7,0.452,30,0
4,3,157,58,42,81,46.8,0.285,57,1


In [3]:
# Benchmark XGBoost at increasing dataset sizes: for each size, draw a
# reproducible sample of `df`, fit a fresh classifier on an 80/20 split, and
# record the hold-out accuracy together with the time spent inside `fit`.
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]
results = []

for size in sizes:
    if size > len(df):
        # Rows are sampled without replacement, so a size larger than the
        # dataset cannot be drawn; report it and move on to the next size.
        print(f"Skipping size {size} as it exceeds the dataset size")
        continue

    # Sample dataset to the required size (always without replacement here,
    # because of the guard above).
    sampled_df = df.sample(size, random_state=42)

    # Features are everything except the target. 'Unnamed: 0' appears to be
    # the row index that was saved into the CSV (it reads 0, 1, 2, ... in the
    # df.head() output), i.e. an identifier rather than a predictor, so it is
    # kept out of the feature matrix. errors='ignore' keeps this line working
    # when the column is not present.
    X_sample = (
        sampled_df
        .drop(columns='outcome')
        .drop(columns='Unnamed: 0', errors='ignore')
    )
    y_sample = sampled_df['outcome']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42
    )

    # Train model and measure time. perf_counter() is a monotonic,
    # high-resolution clock, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    model = XGBClassifier(random_state=42)
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # Evaluate on test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results.append({
        'Size': size,
        'Test Accuracy': accuracy,
        'Training Time (s)': train_time
    })

# Print results table (only when at least one size was actually run)
if results:
    results_df = pd.DataFrame(results)
    print("\n--- Performance by Dataset Size ---")
    print(results_df.to_string(index=False))


--- Performance by Dataset Size ---
    Size  Test Accuracy  Training Time (s)
     100       0.900000           0.081132
    1000       0.940000           0.046495
   10000       0.980000           0.142963
  100000       0.988600           0.753442
 1000000       0.992205           8.827513
10000000       0.993104          83.182443
