# Setup

In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
version_1_path = local_path = os.path.expanduser('~/Documents/UChicago_MADS/general_datasets/athletes.csv')

# lakeFS

In [3]:
# run 'python -m lakefs.quickstart' in terminal to start lakeFS locally

In [4]:
ENDPOINT = "http://127.0.0.1:8000/"
ACCESS_KEY = "AKIAIOSFOLQUICKSTART"
SECRET_KEY = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

## Create a branch called version 1, add csv to that branch, commit the change

In [5]:
! lakectl repo create lakefs://athletes local://example

Repository: lakefs://athletes
[91merror creating repository: not unique[0m
409 Conflict


In [6]:
! lakectl branch create lakefs://athletes/assignment_1 --source lakefs://athletes/main

Source ref: lakefs://athletes/main
[91mbranch already exists: not unique[0m
409 Conflict


In [7]:
! lakectl fs upload -s /Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes.csv lakefs://athletes/assignment_1/athletes.csv

Path: [93mathletes.csv[0m
Modified Time: 2025-10-15 19:56:48 -0400 EDT
Size: 71546909 bytes
Human Size: 71.5 MB
Physical Address: local:///Users/jackploshnick/lakefs/data/block/example/data/g5fd5iiclaanklhe8940/d3o39g2claanklhe898g
Checksum: ade8057a9ad4350dfade9180f021a96d
Content-Type: application/octet-stream


In [8]:
! lakectl commit lakefs://athletes/assignment_1 -m "Add version_1 dataset"

Branch: lakefs://athletes/assignment_1
Commit for branch "assignment_1" completed.

ID: [93m5d983aea16e5b39e4852e5f700083217eb4f18c6557ac1f18be7101ee71ce849[0m
Message: Add version_1 dataset
Timestamp: 2025-10-15 19:56:49 -0400 EDT
Parents: a931d09e4064907b899d9d60be97dae2817c000ec5726bd14aa3471903c5d40c



## update the dataet, make a new commit

In [9]:
data = pd.read_csv(version_1_path)

# Remove not relevant columns
data = data.dropna(subset=['region','age','weight','height','howlong','gender','eat', \
                            'train','background','experience','schedule','howlong', \
                            'deadlift','candj','snatch','backsq','experience',\
                            'background','schedule','howlong'])
data = data.drop(columns=['affiliate','team','name','athlete_id','fran','helen','grace',\
                            'filthy50','fgonebad','run400','run5k','pullups','train'])

# Remove Outliers

data = data[data['weight'] < 1500]
data = data[data['gender'] != '--']
data = data[data['age'] >= 18]
data = data[(data['height'] < 96) & (data['height'] > 48)]

data = data[(data['deadlift'] > 0) & (data['deadlift'] <= 1105)|((data['gender'] == 'Female') \
                & (data['deadlift'] <= 636))]
data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Clean Survey Data

decline_dict = {'Decline to answer|': np.nan}
data = data.replace(decline_dict)
data = data.dropna(subset=['background','experience','schedule','howlong','eat'])

data.to_csv('/Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes_v2.csv', index=False)



In [10]:
! lakectl fs upload -s /Users/jackploshnick/Documents/UChicago_MADS/general_datasets/athletes_v2.csv lakefs://athletes/assignment_1/athletes.csv

Path: [93mathletes.csv[0m
Modified Time: 2025-10-15 19:56:52 -0400 EDT
Size: 10774960 bytes
Human Size: 10.8 MB
Physical Address: local:///Users/jackploshnick/lakefs/data/block/example/data/g5fd5iiclaanklhe8940/d3o39h2claanklhe89ag
Checksum: a813ac219d50c5af4de214f4609e5f69
Content-Type: application/octet-stream


In [11]:
! lakectl commit lakefs://athletes/assignment_1 -m "Add version_2 dataset"

Branch: lakefs://athletes/assignment_1
Commit for branch "assignment_1" completed.

ID: [93m0a80c4bb8670a5e16b1aee95add09e3eba54a9f5235bd118cd95235e3ce636c9[0m
Message: Add version_2 dataset
Timestamp: 2025-10-15 19:56:53 -0400 EDT
Parents: 5d983aea16e5b39e4852e5f700083217eb4f18c6557ac1f18be7101ee71ce849



## Get version 1, do eda, run model

In [12]:
import pandas as pd
import subprocess
from io import StringIO

# Replace <commit_id> with your actual first commit ID
commit_id = "7f295082753e2062f2aa34fe22252cd8addc1a3be5037c2454ca84ca7ce335d7"

# Cat the file from that commit
result = subprocess.run(
    ['lakectl', 'fs', 'cat', f'lakefs://athletes/{commit_id}/athletes.csv'],
    capture_output=True,
    text=True
)

# Read directly into pandas
df = pd.read_csv(StringIO(result.stdout))

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features and outcome
features = ['age', 'height', 'weight', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad', 'candj', 'run400', 'run5k', 'candj', 'snatch', 'pullups' ,'backsq']
outcome = ['deadlift']

df = df.dropna(subset=features + outcome)

# Split into X and y
X = df[features]
y = df[outcome]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")


R²: 0.970
RMSE: 73.165


## Get version 2, do eda, run model

In [14]:
import pandas as pd
import subprocess
from io import StringIO

# Replace <commit_id> with your actual first commit ID
commit_id = "7f295082753e2062f2aa34fe22252cd8addc1a3be5037c2454ca84ca7ce335d7"

# Cat the file from that commit
result = subprocess.run(
    ['lakectl', 'fs', 'cat', f'lakefs://athletes/{commit_id}/athletes.csv'],
    capture_output=True,
    text=True
)

# Read directly into pandas
df = pd.read_csv(StringIO(result.stdout))

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features and outcome
features = ['age', 'height', 'weight', 'candj', 'snatch', 'backsq']
outcome = ['deadlift']

df = df.dropna(subset=features + outcome)

# Split into X and y
X = df[features]
y = df[outcome]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")

R²: 1.000
RMSE: 200.139


V2 model is only marginally worse. .95 r2 vs .97 r2

## Use tensor flow privacy library with the dataset v2 and calculate the metrics for the new DP model.

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf


# -----------------------------
# DP model (manual DP-SGD approx)
# -----------------------------
# Hyperparameters
batch_size = 32
noise_multiplier = 1.0  # sigma
epochs = 10
learning_rate = 0.01

dp_model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

# Training with noise added to gradients
for epoch in range(epochs):
    for i in range(0, X_train.shape[0], batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train.iloc[i:i+batch_size]
        with tf.GradientTape() as tape:
            logits = dp_model(x_batch, training=True)
            loss = tf.keras.losses.binary_crossentropy(y_batch, logits)
        grads = tape.gradient(loss, dp_model.trainable_variables)
        # Add Gaussian noise to each gradient
        noisy_grads = [g + tf.random.normal(g.shape, stddev=noise_multiplier) for g in grads]
        optimizer.apply_gradients(zip(noisy_grads, dp_model.trainable_variables))

y_pred_dp = (dp_model.predict(X_test) > 0.5).astype(int)
acc_dp = accuracy_score(y_test, y_pred_dp)

# -----------------------------
# Compute approximate DP epsilon
# -----------------------------
def compute_epsilon(q, sigma, steps, delta):
    import numpy as np
    orders = np.arange(2, 64)
    rdp = np.array([steps * q**2 / (2 * sigma**2) for _ in orders])
    eps = min(rdp - np.log(delta) / (orders - 1))
    return eps

steps = epochs * (X_train.shape[0] // batch_size)
sampling_prob = batch_size / X_train.shape[0]
delta = 1e-5
epsilon = compute_epsilon(sampling_prob, noise_multiplier, steps, delta)

# -----------------------------
# Return a new DataFrame with DP predictions
# -----------------------------
df_results = df.copy()
df_results['pred_non_dp'] = np.nan
df_results['pred_dp'] = np.nan

# Assign predictions to test set rows
df_results.loc[y_test.index, 'pred_dp'] = y_pred_dp
df_results['dp_epsilon'] = epsilon


# Evaluate
r2 = r2_score(y_test, y_pred_dp)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_dp))

print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")



[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 785us/step
R²: -0.000
RMSE: 68936.267


In the DP approach, the accuracy of the model has degraged to the point where it is not at all useful