# Setup

In [10]:
import pandas as pd
import numpy as np
import os


In [11]:
version_1_path = local_path = os.path.expanduser('~/Documents/UChicago_MADS/general_datasets/athletes.csv')

# Weights and Baises

In [None]:
# go to https://wandb.ai/authorize?ref=models to get your API key

In [5]:
import wandb


In [7]:
wandb.init(project="my_project", entity="jploshnick-university-of-chicago")

In [8]:
# Create a dataset artifact
dataset_artifact = wandb.Artifact(
    name="my_dataset",
    type="dataset",
    description="Initial version of the dataset"
)

# Add files
dataset_artifact.add_file("/Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes.csv")

# Log the artifact
wandb.log_artifact(dataset_artifact)

# Finish the run
wandb.finish()


## update the dataet, make a new commit

In [12]:
data = pd.read_csv(version_1_path)

# Remove not relevant columns
data = data.dropna(subset=['region','age','weight','height','howlong','gender','eat', \
                            'train','background','experience','schedule','howlong', \
                            'deadlift','candj','snatch','backsq','experience',\
                            'background','schedule','howlong'])
data = data.drop(columns=['affiliate','team','name','athlete_id','fran','helen','grace',\
                            'filthy50','fgonebad','run400','run5k','pullups','train'])

# Remove Outliers

data = data[data['weight'] < 1500]
data = data[data['gender'] != '--']
data = data[data['age'] >= 18]
data = data[(data['height'] < 96) & (data['height'] > 48)]

data = data[(data['deadlift'] > 0) & (data['deadlift'] <= 1105)|((data['gender'] == 'Female') \
                & (data['deadlift'] <= 636))]
data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Clean Survey Data

decline_dict = {'Decline to answer|': np.nan}
data = data.replace(decline_dict)
data = data.dropna(subset=['background','experience','schedule','howlong','eat'])

data.to_csv('/Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes_v2.csv', index=False)



In [14]:
wandb.init(project="my_project", entity="jploshnick-university-of-chicago")

# Create a dataset artifact
dataset_artifact = wandb.Artifact(
    name="my_dataset_v2",
    type="dataset",
    description="Second version of the dataset"
)

# Add files
dataset_artifact.add_file("/Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes_v2.csv")

# Log the artifact
wandb.log_artifact(dataset_artifact)

# Finish the run
wandb.finish()


## Get version 1, do eda, run model

In [26]:
import wandb
import pandas as pd

# Initialize W&B (can use mode="offline" if you want)
wandb.init(project="my_project", entity="jploshnick-university-of-chicago")

# Retrieve the dataset artifact (latest version)
artifact = wandb.use_artifact('jploshnick-university-of-chicago/my_project/my_dataset:latest', type='dataset')

# Download the artifact to a local directory
artifact_dir = artifact.download()

# Load the CSV into a pandas DataFrame
csv_path = f"{artifact_dir}/athletes.csv"
df = pd.read_csv(csv_path)

# Finish the run
wandb.finish()


[34m[1mwandb[0m: Downloading large artifact 'my_dataset:latest', 68.23MB. 1 files...
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 00:00:00.2 (293.0MB/s)


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features and outcome
features = ['age', 'height', 'weight', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'candj', 'run400', 'run5k', 'candj', 'snatch', 'pullups' ,'backsq']
outcome = ['deadlift']

df = df.dropna(subset=features + outcome)

# Split into X and y
X = df[features]
y = df[outcome]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")


R²: 0.970
RMSE: 73.165


## Get version 2, do eda, run model

In [25]:
artifact_dir


'/Users/jackploshnick/Documents/MLOps_class/assignment_1/artifacts/my_dataset_v2:v0'

In [28]:
import wandb
import pandas as pd

# Initialize W&B (can use mode="offline" if you want)
wandb.init(project="my_project", entity="jploshnick-university-of-chicago")

# Retrieve the dataset artifact (latest version)
artifact = wandb.use_artifact('jploshnick-university-of-chicago/my_project/my_dataset_v2:latest', type='dataset')

# Download the artifact to a local directory
artifact_dir = artifact.download()

# Load the CSV into a pandas DataFrame
csv_path = f"{artifact_dir}/athletes_v2.csv"
df = pd.read_csv(csv_path)

# Finish the run
wandb.finish()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features and outcome
features = ['age', 'height', 'weight', 'candj', 'snatch', 'backsq']
outcome = ['deadlift']

df = df.dropna(subset=features + outcome)

# Split into X and y
X = df[features]
y = df[outcome]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 0.838
RMSE: 39.120


V2 model is indeed worse. .83 r2 vs .97 r2

## Use tensor flow privacy library with the dataset v2 and calculate the metrics for the new DP model.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# -----------------------------
# Prepare data
# -----------------------------
features = ['age', 'height', 'weight', 'candj', 'snatch', 'backsq']
outcome = ['deadlift']

df = df.dropna(subset=features + outcome)

X = df[features].astype(np.float32)
y = df[outcome].astype(np.float32).values.reshape(-1, 1)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------
# DP Linear Regression Model
# -----------------------------
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1)
])

# DP optimizer parameters
learning_rate = 0.01
noise_multiplier = 0.5   # adjust for privacy-accuracy tradeoff
l2_norm_clip = 5.0
batch_size = 32
epochs = 100

optimizer = DPKerasSGDOptimizer(
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=batch_size,  # should match batch size
    learning_rate=learning_rate
)

# Loss function (per-example)
loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)

# -----------------------------
# Custom DP training loop
# -----------------------------
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.shuffle(buffer_size=len(X_train)).batch(batch_size)

for epoch in range(epochs):
    for x_batch, y_batch in dataset:
        with tf.GradientTape() as tape:
            predictions = model(x_batch, training=True)
            per_example_loss = loss_fn(y_batch, predictions)
            loss = tf.reduce_mean(per_example_loss)

        # Compute DP gradients (do NOT pass gradient_clip_norm)
        grads_and_vars = optimizer._compute_gradients(
            loss,
            var_list=model.trainable_variables,
            tape=tape
        )
        optimizer.apply_gradients(grads_and_vars)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs} completed")

# -----------------------------
# Evaluate on test set
# -----------------------------
y_pred = model(X_test).numpy()

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"DP Linear Regression R²: {r2:.3f}")
print(f"DP Linear Regression RMSE: {rmse:.3f}")

# -----------------------------
# Compute DP epsilon
# -----------------------------
N = X_train.shape[0]
delta = 1e-5  # typical choice

epsilon, _ = compute_dp_sgd_privacy.compute_dp_sgd_privacy(
    n=N,
    batch_size=batch_size,
    noise_multiplier=noise_multiplier,
    epochs=epochs,
    delta=delta
)


# Evaluate
r2 = r2_score(y_test, y_pred_dp)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_dp))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")



2025-10-15 20:46:24.905071: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
R²: -13.825
RMSE: 374.101


In the DP approach, the accuracy of the model has degraged to the point where it is not at all useful