In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, r2_score
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'seaborn'

In [3]:
import numpy as np
import pandas as pd
df = pd.read_csv('./Data/adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
def convert_marital_status(status):
    if status in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']:
        return 'married'
    elif status in ['Never-married', 'Separated', 'Widowed']:
        return 'single'
    else:
        return 'divorced'

df['marital-status'] = df['marital-status'].apply(convert_marital_status)

df['native-country'] = df['native-country'].replace('Outlying-US(Guam-USVI-etc)' , 'US Minor Islands')

df = df.drop(['capital-gain', 'capital-loss', 'fnlwgt'], axis=1)

income_mapping = {'<=50K': 0, '>50K': 1}
df['income'] = df['income'].map(income_mapping)

In [4]:
def fill_missing_categorical(df, column):
    df[column] = df[column].replace('?', np.nan)

    if df[column].notna().all():
        return df

    known = df[df[column].notna()]
    unknown = df[df[column].isna()]

    le = LabelEncoder()
    known[column] = le.fit_transform(known[column])
    X_known = known.drop(column, axis=1)
    y_known = known[column]

    categorical_cols = X_known.select_dtypes(include=['object']).columns

    le_cat = preprocessing.LabelEncoder()
    X_known[categorical_cols] = X_known[categorical_cols].apply(lambda col: le_cat.fit_transform(col.astype(str)))

    clf = RandomForestClassifier()
    clf.fit(X_known, y_known)

    X_unknown = unknown.drop(column, axis=1)

    X_unknown[categorical_cols] = X_unknown[categorical_cols].apply(lambda col: le_cat.fit_transform(col.astype(str)))

    unknown[column] = clf.predict(X_unknown)

    df = pd.concat([known, unknown], axis=0)

    df[column] = le.inverse_transform(df[column])
    
    return df

In [5]:
df = fill_missing_categorical(df, 'native-country')
df = fill_missing_categorical(df, 'occupation')
df = fill_missing_categorical(df, 'workclass')

In [6]:
#Remove the Outliers
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['age'] < Q1 - 1.5 * IQR) | (df['age'] > Q3 + 1.5 * IQR)]

df.drop(outliers.index, inplace=True)

In [7]:
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

In [8]:
df.drop(['age', 'hours-per-week'], axis=1, inplace=True)
df

Unnamed: 0,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,native-country,income
0,Private,11th,7,single,Machine-op-inspct,Own-child,Black,1,United-States,0
1,Private,HS-grad,9,married,Farming-fishing,Husband,White,1,United-States,0
2,Local-gov,Assoc-acdm,12,married,Protective-serv,Husband,White,1,United-States,1
3,Private,Some-college,10,married,Machine-op-inspct,Husband,Black,1,United-States,1
5,Private,10th,6,single,Other-service,Not-in-family,White,1,United-States,0
...,...,...,...,...,...,...,...,...,...,...
42577,Local-gov,7th-8th,4,single,Adm-clerical,Not-in-family,White,0,United-States,0
42644,Private,Some-college,10,married,Craft-repair,Husband,White,1,United-States,1
44475,Private,Bachelors,13,married,Prof-specialty,Not-in-family,White,1,United-States,0
44969,Private,Some-college,10,single,Adm-clerical,Own-child,White,0,United-States,0


In [9]:
df.reset_index(inplace=True)

In [10]:
df.columns

Index(['index', 'workclass', 'education', 'educational-num', 'marital-status',
       'occupation', 'relationship', 'race', 'gender', 'native-country',
       'income'],
      dtype='object')

In [11]:
columns_to_keep = ['workclass', 'educational-num', 'marital-status', 'occupation', 'gender', 'native-country', 'income']
Features = df[columns_to_keep]

In [62]:
X= Features

X = pd.get_dummies(X, columns=['workclass', 'marital-status', 'occupation', 'native-country'])

X = X.drop(columns=['income'])
y = df['income']


x_train, x_test, y_train, y_test = train_test_split(X , y, test_size = 0.2)

In [63]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)
X1 = scaler.fit_transform(X)

In [64]:

X_fil = []
Y_fil = []

ind  = y.values.sum()

counter = 0
print(ind)
for i in range(len(y.values)):
    if y.values[i] == 0:
        if counter < ind:
            X_fil.append(X1[i])
            Y_fil.append(y[i])
            counter+=1
    else:
        X_fil.append(X1[i])
        Y_fil.append(y[i])

11649


In [65]:
print('the shape of the x train data is ' + str(x_train.shape))
print('the shape of the x test data is ' + str(y_test.shape))

the shape of the x train data is (38900, 68)
the shape of the x test data is (9726,)


In [66]:
Features

Unnamed: 0,workclass,educational-num,marital-status,occupation,gender,native-country,income
0,Private,7,single,Machine-op-inspct,1,United-States,0
1,Private,9,married,Farming-fishing,1,United-States,0
2,Local-gov,12,married,Protective-serv,1,United-States,1
3,Private,10,married,Machine-op-inspct,1,United-States,1
4,Private,6,single,Other-service,1,United-States,0
...,...,...,...,...,...,...,...
48621,Local-gov,4,single,Adm-clerical,0,United-States,0
48622,Private,10,married,Craft-repair,1,United-States,1
48623,Private,13,married,Prof-specialty,1,United-States,0
48624,Private,10,single,Adm-clerical,0,United-States,0


In [67]:
model = LogisticRegression(penalty='l2')
model.fit(X_train_scaled, y_train)

In [68]:
print('model score is: ' + str(round(model.score(X_test_scaled, y_test)* 100, 0)) + '%')

model score is: 83.0%


In [69]:
elastic_net_model = LogisticRegression(penalty='elasticnet', l1_ratio=1.0, solver='saga')
elastic_net_model.fit(X_train_scaled, y_train)

In [70]:
print('model score is: ' + str(round(elastic_net_model.score(X_test_scaled, y_test)* 100, 0)) + '%')

model score is: 83.0%


In [77]:


# import torch
# x_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# x_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)


import torch
x_train,x_test,y_train,y_test = train_test_split(X_fil,Y_fil,test_size = 0.2,random_state = 42)
x_train_tensor = torch.tensor(x_train, dtype=torch.float64)
x_test_tensor = torch.tensor(x_test, dtype=torch.float64)
y_train_tensor = torch.tensor(y_train, dtype=torch.float64)
y_test_tensor = torch.tensor(y_test, dtype=torch.float64)

In [72]:
import numpy as np
import pandas as pd
# import dcMinMaxFunctions as dc
# import dcor
from scipy.misc import derivative
from sklearn.model_selection import train_test_split
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy import stats
import wandb




class Net_new(nn.Module):
    def __init__(self,p,device=torch.device('cuda')):
        super(Net_new, self ).__init__()
        self.device = device
        self.loss_reg = 0
        self.p =p 
        self.x = 0
        self.y = 0
        self.H_net1 = nn.Sequential(
            nn.Linear(68, 128),
            nn.Sigmoid(),
            nn.Linear(128, 64),
            nn.Sigmoid(),
            nn.Linear(64, 68*68).to(device)
        )
        self.X_net = nn.Sequential(
            nn.Linear(68, 1),
            # nn.ReLU(),
            # nn.Linear(128, 512),
            # nn.ReLU(),
            # nn.Linear(512, 128),
            # nn.ReLU(),
            # nn.Linear(128, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        def H_mul(z):
            H12 = self.H_net1(z)
            H12= H12.reshape(z.shape[0],d,d)
            x12 = torch.matmul(z,H12)
            return(x12)
    
        
        def batch_jacobian(func, z, create_graph=False):
            # x in shape (Batch, Length)
            def _func_sum(z):
                return func(z).sum(dim=0)
            return torch.squeeze(torch.autograd.functional.jacobian(_func_sum, z, create_graph=create_graph)).permute(1,0,2)
        
        
        device = self.device
       
        x.requires_grad =True
        p = self.p
        self.x = x
        d = x.shape[1]
        bs = x.shape[0]
        x= torch.unsqueeze(x,1)
        z = x.to(device)

        loss_reg = torch.zeros(bs,d).to(device)
        for i in range(p):
            H = self.H_net1(z).to(device)
            H = H.reshape(bs,d,d)
            z = torch.matmul(z,H).to(device)
            J = batch_jacobian(H_mul, z, create_graph=True)
            J_int =-torch.log(torch.abs(torch.det(J)))
            loss_reg = loss_reg + torch.squeeze(torch.autograd.grad(J_int, x,torch.ones_like(J_int),allow_unused=True,create_graph= True)[0]).to(device)
        self.loss_reg = loss_reg
        self.y = z
        y = self.X_net(z)
        return y



        
        
        



     


def gau_ker(u,device = torch.device('cuda')):
    return torch.pow(2*torch.tensor(torch.pi),u.shape[1]/(-2))*torch.exp(torch.bmm(u.view(u.shape[0], 1, u.shape[1]), u.view(u.shape[0],  u.shape[1],1))/(-2)).to(device)


def py_kde(x,X_t,h,device = torch.device('cuda')):
    norm = X_t.shape[0]*(h**x.shape[1])
    prob = torch.zeros(x.shape[0]).to(device)
    for i in range(len(X_t)):
        prob+= (torch.squeeze(gau_ker((x - X_t[i])/h))/norm).to(device)
    return(prob)


def py_kde_der(p_x,x,device = torch.device('cuda')):
    # x.requires_grad = True
    # p_x = py_kde(x,X_t,h)
    return (torch.autograd.grad(p_x,x,torch.ones_like(p_x),allow_unused=True,create_graph=True)[0]).to(device)


def gau_ker_der(X,h):
    N= X.shape[0]
    d = X.shape[1]
    grad = torch.zeros(X.shape)
    for n in range(N):
        for i in range(d):
            for j in range(N):
                grad[n][i]+= torch.exp(-1*torch.dot((X[n]-X[j]),(X[n]-X[j]))/(2*h*h))*(X[n][i] -X[j][i]) /(N*(h**(d+2))*((2*math.pi)**(d/2)))

    return grad

In [73]:
import torch


norms = torch.norm(x_train_tensor, dim=1)

# Find the indices of elements with norm > 10
indices = torch.where(norms > 10)[0]

# Remove elements with norm > 10 from x_train_tensor
y_train_tensor = torch.index_select(y_train_tensor, 0, indices)
x_train_tensor = torch.index_select(x_train_tensor, 0, indices)

norms = torch.norm(x_test_tensor, dim=1)

# Find the indices of elements with norm > 10
indices = torch.where(norms > 10)[0]

# Remove elements with norm > 10 from x_train_tensor
y_test_tensor = torch.index_select(y_test_tensor, 0, indices)
x_test_tensor = torch.index_select(x_test_tensor, 0, indices)
    

In [74]:
def normalize2(x,norm=1):
    n = torch.norm(x,dim=1).max()
    x_normed =norm*x /(n)
    return x_normed
x_train_tensor = normalize2(x_train_tensor)
x_test_tensor = normalize2(x_test_tensor)

In [75]:
model = LogisticRegression(penalty='l2')
model.fit(x_train_tensor, y_train_tensor)
print('model score is: ' + str(round(elastic_net_model.score(x_test_tensor, y_test_tensor)* 100, 0)) + '%')

ValueError: y should be a 1d array, got an array of shape (1310, 68) instead.

In [61]:
x_train_tensor

tensor([[-2.2852e-02, -2.6997e-02, -3.3755e-03,  ..., -6.2089e-02,
         -8.0980e-04, -4.1372e-04],
        [-8.0222e-03,  1.3398e-02, -3.3755e-03,  ..., -6.2089e-02,
         -8.0980e-04, -4.1372e-04],
        [-8.0222e-03,  1.3398e-02, -3.3755e-03,  ..., -6.2089e-02,
         -8.0980e-04, -4.1372e-04],
        ...,
        [ 2.1637e-02, -2.6997e-02, -3.3755e-03,  ..., -6.2089e-02,
         -8.0980e-04, -4.1372e-04],
        [-8.0222e-03,  1.3398e-02, -3.3755e-03,  ..., -6.2089e-02,
         -8.0980e-04, -4.1372e-04],
        [ 2.1637e-02,  1.3398e-02, -3.3755e-03,  ..., -6.2089e-02,
          4.4666e-01, -4.1372e-04]], dtype=torch.float64)

In [53]:
torch.cdist(x_train_tensor,x_train_tensor).mean()

tensor(8.9745, dtype=torch.float64)

In [26]:
import torch
import torch.nn as nn

class LogisticRegression(nn.Module):
    def __init__(self, input_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        out = self.sigmoid(out)
        return out

# Example usage
input_size = 68
model = LogisticRegression(input_size)
x = torch.randn(1, input_size)  # Example input
output = model(x)
print(output)


tensor([[0.4892]], grad_fn=<SigmoidBackward0>)


In [27]:
def test_model(model, test_loader,device=torch.device('cpu')):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            # _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            
            correct += ((outputs>0.5).squeeze() == labels.squeeze()).sum()
            

    # print('Accuracy of the network on the test images: %d %%' % (
    #     100 * correct / total))
    return(100 * correct / total)

In [28]:
def train_model_priv(net,trainloader,x_test,y_test,optimizer,epochs,h,rate=10,device= torch.device('cuda'),print_cond = True,only_reg_flag=0,lr_schedular =None,lambda_loss=1):
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    lr = lr_schedular
    net = net.to(device)
    
    
    criterion = nn.BCELoss()
    for epoch in range(epochs):  # loop over the dataset multiple times
        # scheduler.step()
        running_loss = 0.0
        running_loss_reg = 0.0
        if(lr):
       
            for groups in optimizer.param_groups:
                groups['lr'] = lr(epoch)
        # optimizer.param_groups[0]['lr'] = lr(epoch)
        
        
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            # if(i >100 ):
            #     break
            bs = len(data[0])
            
            inputs = data[0].to(device)
            inputs.requires_grad = True
            labels = data[1].to(device)
            f = py_kde(inputs,inputs,h,device = device)
            f_der = py_kde_der(f,inputs,device = device)

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            if(only_reg_flag==1):
                loss = torch.norm(f_der/f.view(f.shape[0],1)+ net.loss_reg,dim=1).sum()
            elif(only_reg_flag==2):
                loss = criterion(torch.squeeze(outputs),torch.squeeze(labels))
                
            else:
               
                loss = lambda_loss*bs*criterion(torch.squeeze(outputs),torch.squeeze(labels)) + torch.norm(f_der/f.view(f.shape[0],1)+ net.loss_reg,dim=1).sum()
            loss.backward(retain_graph=True)

            optimizer.step()
            loss = loss.detach().cpu()/len(inputs)

          
            if(epoch ==0 and i==0):
                continue
            loss_reg = torch.norm(f_der/f.view(f.shape[0],1)+ net.loss_reg,dim=1).sum().detach().cpu()/len(inputs)
            
            print(loss,loss_reg)
            # wandb.log({"loss": loss.item(),"loss_reg":loss_reg.item()})

            # print statistics
            # print(loss.sum().shape)
            running_loss += loss.item()
            running_loss_reg += torch.norm(f_der/f.view(f.shape[0],1)+ net.loss_reg,dim=1).sum().item()
            # if i % 100 == 99:    # print every 2000 mini-batches
            # if((i+1)%rate==0):
        
        outputs = net(x_test)
        acc = ((outputs>0.5).squeeze().cpu() == y_test.squeeze()).sum()/(len(y_test_tensor))
    

        
        print("Test Accuracy: ",acc)
        # wandb.log({"test acc": acc})
            
        



        
            
            

        # Assuming you have a trained model called 'model' and test data called 'X_test' and 'y_test'

        # Create an empty list to store the accuracies
        
        # Iterate over each epoch
        
            # Perform forward pass on the test data
            
        

        # Plot the test accuracies over epochs
       
                    
            #         print("Epoch: ",epoch + 1,"Loss: " ,running_loss /(rate*trainloader.batch_size),"Reg Loss: ",running_loss_reg /(rate*trainloader.batch_size))
            #         running_loss = 0.0
            #         running_loss_reg = 0.0

In [29]:
y_train_tensor

tensor([0., 0., 1.,  ..., 0., 0., 0.])

In [30]:
net = Net_new(1,device = torch.device('cuda:3'))
trainloader = torch.utils.data.DataLoader(list(zip(x_train_tensor, y_train_tensor)), batch_size=4096, shuffle=False)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
net = net.to(device=torch.device('cuda:3'))
# wandb.init(project='Adult_income_train')


In [31]:
train_model_priv(net,trainloader,x_test_tensor,y_test_tensor,optimizer,1,0.8,device= torch.device('cuda:3'),print_cond = True,only_reg_flag=0,lr_schedular =None,lambda_loss=1)

tensor(1.4966) tensor(0.8313)
tensor(1.1418) tensor(0.4782)
tensor(0.9986) tensor(0.3351)
tensor(0.9669) tensor(0.3050)
tensor(0.9249) tensor(0.2618)
tensor(0.8599) tensor(0.1979)
tensor(0.8632) tensor(0.2000)
tensor(0.8553) tensor(0.1939)
tensor(0.8466) tensor(0.1843)
Test Accuracy:  tensor(0.7582)


In [32]:
outputs = net(x_test_tensor)

In [33]:
outputs

tensor([[[0.4717]],

        [[0.4643]],

        [[0.4649]],

        ...,

        [[0.4693]],

        [[0.4692]],

        [[0.4672]]], device='cuda:3', grad_fn=<SigmoidBackward0>)

In [34]:
((outputs>0.5).squeeze().cpu() == y_test_tensor.squeeze()).sum()/(len(y_test_tensor))

tensor(0.7582)

In [38]:
1 - y_test_tensor.sum()/(len(y_test_tensor))

tensor(0.7607)

In [39]:
def create_model_embs2(net,trainloader,device= torch.device('cpu'),l=0,h=0.82):
    alpha =1/l;
    
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    X_emb = torch.zeros(l,68)
    losses = torch.zeros(l)
    bs = trainloader.batch_size


    net = net.to(device)
    # criterion = nn.CrossEntropyLoss()

        
        
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
   
        inputs = data[0].to(device)
        n = len(inputs)
        d = inputs.shape[1]
        inputs.requires_grad = True
        labels = data[1].to(device)
        f = py_kde(inputs,inputs,h,device=device)


        f_der = py_kde_der(f,inputs,device=device)
      
        ci = CI_KDE(f,n,h,d,alpha,device=device)

        output =  net(inputs)
        
        loss =torch.max(torch.linalg.norm(f_der/(f-ci).view(f.shape[0],1)+net.loss_reg,dim=1),torch.linalg.norm(f_der/(f+ci).view(f.shape[0],1)+net.loss_reg,dim=1)) 
        try:
            losses[i*bs:i*bs+len(loss)] =loss.detach().cpu()
        except:
            print(loss.detach().cpu().shape)
            print(len(loss))
            print(net.y.detach().cpu().shape)
            print(i*bs)
            print(X_emb[i*bs:i*bs+len(loss)].shape)
        X_emb[i*bs:i*bs+len(loss)] = torch.squeeze(net.y.detach().cpu())
    return(X_emb,losses)

In [40]:
def train_emb(model, train_loader, x_test,y_test,loss_fn, optimizer, num_epochs=10,device=torch.device('cpu'),test_total_loader = None,max_steps =10000):
    running_loss = 0.0
    counter = 0
    max_test_acc =0.0
    model = model.to(device)
    steps = 0
    for epoch in range(num_epochs):
        
        
        for i, data in enumerate(train_loader, 0):
            
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = loss_fn(outputs, labels)
            
            loss.backward()
            
            optimizer.step()
            running_loss += loss.item()
            # wandb.log({"loss": loss.item()})
            
            steps+=1
            
            # counter =0
        # if((epoch+1)%10==0):
            # print('Epoch [%d], loss: %.3f' % (epoch + 1, running_loss /(10* len(train_loader))))
            # running_loss = 0.0
        # for params in model.parameters():
        #     print(params.grad)
        with torch.no_grad():
            x_test = x_test.to(device)
            outputs = model(x_test).cpu()
            
            acc = ((outputs>0.5).squeeze().cpu() == y_test.squeeze()).sum()/(len(y_test))
            print("Test Accuracy: ",acc)
        # wandb.log({"train acc": acc})
       
        
        




In [41]:
def CI_KDE(p_x,n,h,d,alpha,device = torch.device('cuda')):
    return( stats.norm.ppf(1-alpha/2)*torch.sqrt(p_x/((2**d)*math.sqrt(torch.pi**d)*n*h**(d))).to(device) )

def CI_KDE_der(p_x_der,p_x,n,h,d,alpha,device = torch.device('cuda')):
    return( p_x_der*stats.norm.ppf(1-alpha/2)*torch.sqrt(1/(p_x.unsqueeze(dim=1)*(2**d)*math.sqrt(torch.pi**d)*n*h**(d))).to(device) )

In [43]:
trainloader = torch.utils.data.DataLoader(list(zip(x_train_tensor, y_train_tensor)), batch_size=512, shuffle=False)
X_emb_train, losses_train = create_model_embs2(net,trainloader,device= torch.device('cuda:3'),l=len(y_train_tensor),h=0.8)

In [46]:

# x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
x_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
testloader = torch.utils.data.DataLoader(list(zip(x_test_tensor, y_test_tensor)), batch_size=512, shuffle=False)
X_emb_test, losses_test = create_model_embs2(net,testloader,device= torch.device('cuda:3'),l=len(y_test_tensor),h=0.8)

In [5]:
losses_train,indices = torch.sort(losses_train)
set_eps = 5
ind = (losses_train < set_eps).sum()

    
    

X_emb_train_priv = X_emb_train[indices][:ind]
Y_train = y_train_tensor[indices][:ind]

NameError: name 'torch' is not defined

In [48]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
trainloader_priv = torch.utils.data.DataLoader(list(zip(X_emb_train_priv, Y_train)), batch_size=512, shuffle=True)

train_emb(model,trainloader_priv,X_emb_test,y_test_tensor,nn.BCELoss(),optimizer,5,device=torch.device('cuda:0'),test_total_loader = None,max_steps =10000)

Test Accuracy:  tensor(0.6566)
Test Accuracy:  tensor(0.6853)
Test Accuracy:  tensor(0.6847)
Test Accuracy:  tensor(0.6855)
Test Accuracy:  tensor(0.6878)


In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score




knn = KNeighborsClassifier(n_neighbors=10)


knn.fit(X_emb_train_priv, Y_train)

y_pred = knn.predict(X_emb_test)
(y_pred == np.asarray(y_test_tensor)).sum()/len(y_pred)





: 

In [177]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10,max_depth=10,  random_state=0)
clf.fit(x_train_tensor,y_train_tensor)
print((clf.predict(x_test_tensor) == np.asarray(y_test_tensor)).sum()/len(y_test_tensor))

0.7830472103004292


In [122]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


knn = KNeighborsClassifier(n_neighbors=20)

knn.fit(x_train_tensor, y_train_tensor)


y_pred = knn.predict(x_test_tensor)
(y_pred == np.asarray(y_test_tensor)).sum()/len(y_pred)

0.7729613733905579

In [184]:
model

LogisticRegression(
  (linear): Linear(in_features=68, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [195]:
input_size = 68
model = LogisticRegression(input_size)
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model2, optimizer2, data_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=trainloader,
    target_epsilon=1,
    target_delta =0.0001,
    epochs = 5,
    max_grad_norm=1.0,
)
train_emb(model2,data_loader,x_test_tensor,y_test_tensor,nn.BCELoss(),optimizer2,5,device=torch.device('cuda:0'),max_steps =10000)

Test Accuracy:  tensor(0.6155)
Test Accuracy:  tensor(0.7054)
Test Accuracy:  tensor(0.7109)
Test Accuracy:  tensor(0.7135)
Test Accuracy:  tensor(0.7187)
