# Data Privacy Final Project

Jordan Bourdeau, Casey Forey

### Imports

In [413]:
import numpy as np
import os
import pandas as pd
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import zipfile

### Reading in the data

In [414]:
url: str = 'https://jbourde2.w3.uvm.edu/data-privacy/data.zip'
file_path: str = 'data/powerlifting-data.csv'

# If the .zip file doesn't already exist, download it from the Silk server.
if not os.path.exists('data.zip'):
    try:
        r = requests.get(url, allow_redirects=True)
        print('Downloading zip file from server')
        open('data.zip', 'wb').write(r.content)
        print('Zip file downloaded from server')
    except Exception as e:
        print(e)
        print('Unable to download zip file from remote server')
        exit(1)

# If the data folder doesn't already exist, unzip the data zip
if not os.path.exists('data/'):
    try:
        with zipfile.ZipFile('data.zip') as zip:
            zip.extractall()
        print('Zip file extracted')
        df = pd.read_csv(file_path)
        print('Data read in')
    except Exception as e:
        print(e)
        print('No zip file to extract from')
        exit(1)

print('Loading dataframe')
df: pd.DataFrame = pd.read_csv(file_path)
# Drop unneeded columns
df = df.drop(['BirthYearClass', 'Division', 'AgeClass', 'Dots', 'Wilks', 'Glossbrenner', 'Goodlift', 
                'Federation', 'MeetCountry', 'MeetState', 'MeetTown', 'WeightClassKg',
                'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg',], axis=1)
print('Dataframe loaded')


Loading dataframe


  df: pd.DataFrame = pd.read_csv(file_path)


Dataframe loaded


### Privacy Budget

In [415]:
rho: float = 1.0
rho_i: float = None

### Differential Privacy Mechanisms

In [416]:
def gaussian_mech_zCDP_vec(vec, sensitivity, rho):
    sigma = np.sqrt((sensitivity**2) / (2 * rho))
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

### Unit of privacy conversions

In [417]:
# As is, the unit of privacy is person-meet-division-age division

# 1. Remove all records which are not from a full-power division
# Drop any rows with NaN values in it.
# Fill NaN tested rows first
df['Tested'] = df['Tested'].fillna(0)
df = df.dropna()

# 2. Limit to 1 record per meet (based on meet name/date for a person)
person_meet_columns: list[str] = ['Name', 'MeetName', 'Date']
df = df.drop_duplicates(subset=person_meet_columns, keep='first')

# 3. Convert person-meet unit of privacy to person-year
# Note: Person-state would protect a person while they reside in a specific state.
# Identifying in terms of how we determine a 'unique' person, all data can be identifying
# Full name + sex + competes in powerlifting is incredibly identifying so this is a proxy for a 'person'
df['Year'] = df['Date'].map(lambda x: int(x[:4]))
identifying_columns: list[str] = ['Name', 'Sex', 'Year']
histogram = df.groupby(identifying_columns).size()
noisy_histogram = gaussian_mech_zCDP_vec(histogram, 1, rho/2)
df = df.drop(['Name'], axis=1)

# This is our scalar value to divide rho by (max number of times a person has competed in a given year)
noisy_max: float = np.max(noisy_histogram)
rho_i = rho / (2 * noisy_max)

### Convert sex and tested columns into binary values.

In [418]:
# Drop Mx columns for a simplifying assumption
df = df[df['Sex'] != 'Mx']

# Convert binary categorical columns into binary values
sex: dict = {'M': 1,'F': 0}
df['Tested'] = df['Tested'].map(lambda x: 1 if x == 'Yes' else x)
df['Sex'] = df['Sex'].map(sex)

### Convert attempts into attempt weight and success.

In [419]:
# If an attempt was missed, it has a '-' put in front of it
# Separate each attempt into the weight loaded and whether it was successful.
attempt_columns: list[str] = ['Squat1Kg', 'Squat2Kg', 'Squat3Kg',
                              'Bench1Kg', 'Bench2Kg', 'Bench3Kg',
                              'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg']

for column in attempt_columns:
    df[f"{column}Made"] = df[column].map(lambda x: 1 if x > 0 else 0)
    df[column] = np.abs(df[column])

best_attempt_columns: list[str] = ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg']

# If someone didn't hit any lifts, convert their best 3rd to 0
for column in best_attempt_columns:
    df[column] = df[column].map(lambda x: x if x > 0 else 0)


### Create One-Hot-Encodings

In [420]:
# Create one-hot encodings if they don't exist
categorical_columns: list[str] = ['Event', 'Equipment', 'ParentFederation']
if 'encoded_features' not in locals():
    # Create the One-Hot-Encoding
    encoded_features: list[pd.DataFrame] = [df[column].str.get_dummies("|") for column in categorical_columns if column in df.columns]

# Drop the categorical columns if they are in the dataframe
df = df.drop(categorical_columns, axis=1, errors='ignore')

# Concatenate one-hot-encoded columns along the column axis
for features in encoded_features:
    for column in features.columns:
        df[column] = features[column]

## Drop the remaining unneeded columns

### Machine Learning Algorithms 

In [421]:
''' 
Machine learning functions (loss, gradient, noisy gradient descent).
'''

# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the logistic loss function.
def loss(theta, xi, yi):
    exponent = - yi * (xi.dot(theta))
    return np.log(1 + np.exp(exponent))

# This is the gradient of the logistic loss
# The gradient is a vector that indicates the rate of change of the loss in each direction
def gradient(theta, xi, yi):
    exponent = yi * (xi.dot(theta))
    return - (yi*xi) / (1+np.exp(exponent))

def avg_grad(theta, X, y):
    grads = [gradient(theta, xi, yi) for xi, yi in zip(X, y)]
    return np.mean(grads, axis=0)

# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    label = np.sign(xi @ theta + bias)
    return label

def accuracy(theta, X_test, y_test):
    return np.sum(predict(X_test, theta) == y_test) / X_test.shape[0]

# L2 Clipping
def L2_clip(v, b):
    norm = np.linalg.norm(v, ord=2)
    if norm > b:
        return b * (v / norm)
    else:
        return v

def gradient_sum(theta, X, y, b):
    gradients = [L2_clip(gradient(theta, x_i, y_i), b) for x_i, y_i in zip(X,y)]
    # sum query
    # L2 sensitivity is b (by clipping performed above)
    return np.sum(gradients, axis=0)

def noisy_gradient_descent_zCDP(X_train, y_train, iterations, rho, learning_rate):
    theta = np.zeros(X_train.shape[1])
    b = 3
    rho_count = 0.05 * rho
    rho_i = 0.95 * rho / iterations
    noisy_count = gaussian_mech_zCDP_vec([X_train.shape[0]], noisy_max, rho_count)[0]
    for i in range(iterations):
        clipped_gradient_sum = gradient_sum(theta, X_train, y_train, b)
        noisy_gradient_sum = np.array(gaussian_mech_zCDP_vec(clipped_gradient_sum, b, rho_i))
        noisy_avg_gradient = noisy_gradient_sum / noisy_count
        theta = theta - noisy_avg_gradient * learning_rate
    return theta


### Implementation

In [422]:
iterations: int = 20
learning_rate: float = 0.5
num_models: int = 3
rho_i /= num_models

In [423]:
# 1. Can we predict whether someone is tested vs. untested based on their best lifts and bodyweight?
print("Can we predict whether someone is tested vs. untested based on their best lifts and weight?")

y = df['Tested'].values
X = df[['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'BodyweightKg']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=42)

print("Scikit-learn reference:")
model: LogisticRegression = LogisticRegression().fit(X_train, y_train)
print(np.sum(model.predict(X_test) == y_test) / X_test.shape[0])

print("Noisy gradient descent:")
theta = noisy_gradient_descent_zCDP(X_train, y_train, iterations, rho_i, learning_rate)
print(accuracy(theta, X_test, y_test))

Can we predict whether someone is tested vs. untested based on their best lifts and weight?
Scikit-learn reference:


0.7806108897742364
Noisy gradient descent:


  return - (yi*xi) / (1+np.exp(exponent))


0.7806108897742364


In [424]:
# 2. Can we predict whether someone competes in a tested vs. untested based on the equipment they use
print("Can we predict whether someone is tested vs. untested based on their equipment?")

equipment_columns: list[str] = ['Multi-ply', 'Raw', 'Single-ply', 'Straps', 'Unlimited', 'Wraps']
y = df['Tested'].values
X = df[equipment_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=42)

print("Scikit-learn reference:")
model: LogisticRegression = LogisticRegression().fit(X_train, y_train)
print(np.sum(model.predict(X_test) == y_test) / X_test.shape[0])

print("Noisy gradient descent:")
theta = noisy_gradient_descent_zCDP(X_train, y_train, iterations, rho_i, learning_rate)
print(accuracy(theta, X_test, y_test))

Can we predict whether someone is tested vs. untested based on their equipment?
Scikit-learn reference:
0.8090305444887118
Noisy gradient descent:
0.7806108897742364


In [425]:
# 3. Can we predict whether someone competes in a tested vs. untested based on the equipment they use?
print("Can we predict whether someone will hit their 3rd deadlift based on their previous lifts and other metrics?")

predictive_columns: list[str] = ['Squat1KgMade', 'Squat2KgMade', 'Squat3KgMade', 
                                'Bench1KgMade', 'Bench2KgMade', 'Bench3KgMade', 
                                'Deadlift1KgMade', 'Deadlift2KgMade']
y = df['Deadlift3KgMade'].values
X = df[predictive_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=42)

print("Scikit-learn reference:")
model: LogisticRegression = LogisticRegression().fit(X_train, y_train)
print(np.sum(model.predict(X_test) == y_test) / X_test.shape[0])

print("Noisy gradient descent:")
theta = noisy_gradient_descent_zCDP(X_train, y_train, iterations, rho_i, learning_rate)
print(accuracy(theta, X_test, y_test))

Can we predict whether someone will hit their 3rd deadlift based on their previous lifts and other metrics?
Scikit-learn reference:
0.6616998671978752
Noisy gradient descent:
0.6404249667994688
