# generate data

In [11]:
import scipy
from scipy.special import expit, logit
import numpy as np
import math
from torch import nn
from torch.nn import functional as F
import torch
import copy
import random
import pandas as pd

from sklearn.metrics import roc_auc_score

In [2]:
# simple binary classifier
class BinaryClassification(nn.Module):
    def __init__(self, input_shape):
        super(BinaryClassification, self).__init__()
        self.layer_1 = nn.Linear(input_shape, 10) 
        self.layer_out = nn.Linear(10, 1) 

        
    def forward(self, inputs):
        x = torch.relu(self.layer_1(inputs))
        x = torch.relu(self.layer_out(x))
        
        return x

In [3]:
def train(X, Y, model, loss_fn, optimizer, epochs=5000, train_prop=1.0):
    losses = []

    # split into training and validation
    n = len(Y)
    X_train = X
    Y_train = Y
    X_validation = None
    Y_validation = None
    
    # split into train and validation
    if train_prop < 1.0:
        xy = torch.cat((torch.unsqueeze(Y,1),X),axis=1)        
        train_cutoff = int(n * train_prop)
        train = xy[:train_cutoff,:]
        Y_train, X_train =  train[:,0], train[:,1:]
        
        validation = xy[train_cutoff:,:]
        Y_validation, X_validation = validation[:,0], validation[:, 1:]

    
    min_validation_loss = test = float("inf")
    min_validation_model = None
    
    for i in range(epochs):

        #calculate output
        train_output = model(X_train)
        
        if train_prop < 1.0:
            # calculate loss on validation set
            # if it is smaller than the minimum validation loss, save the parameters
            validation_output = model(X_validation)
            validation_loss = loss_fn(validation_output, Y_validation.reshape(-1,1))
            if validation_loss < min_validation_loss:
                min_validation_model = copy.deepcopy(model)
                min_validation_loss = validation_loss
        
        #calculate loss
        train_loss = loss_fn(train_output, Y_train.reshape(-1,1))

        #backprop
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        losses.append(train_loss)
    
    # if no validation, return the final model
    if train_prop == 1.0:
        min_validation_model = model
        
    return(min_validation_model)

In [4]:

# train source model, target model w/o TL, target model w/ TL
def training_loop(X_source_train, Y_source_train,
                  X_source_test, Y_source_test,
                  X_target_train, Y_target_train,
                  X_target_test, Y_target_test):
    random.seed(1978)
    torch.manual_seed(1978)

    learning_rate = 0.0001
    train_prop = 0.75
    
    
    # Model , Optimizer, Loss
    ###
    # train source
    ###
    source_model = BinaryClassification(input_shape=X_source_train.shape[1])
    optimizer = torch.optim.Adam(source_model.parameters(),lr=learning_rate)
    loss_fn = nn.BCEWithLogitsLoss()

    source_model = train(X_source_train, Y_source_train, source_model, loss_fn, optimizer, train_prop = train_prop)
    
    source_roc_auc = roc_auc_score(Y_source_test, source_model(X_source_test).detach().numpy())
    print(f"source auc:\t{source_roc_auc}")


    
    ###
    # train target w/o TL
    ###
    target_model = BinaryClassification(input_shape=X_target_train.shape[1])
    optimizer = torch.optim.Adam(target_model.parameters(),lr=learning_rate)
    loss_fn = nn.BCEWithLogitsLoss()

    target_model = train(X_target_train, Y_target_train, target_model, loss_fn, optimizer, train_prop = train_prop)

    target_roc_auc = roc_auc_score(Y_target_test, target_model(X_target_test).detach().numpy())
    print(f"target auc:\t{target_roc_auc}")


    ###
    # train target w/ TL
    ###
    
    target_model_tl = copy.deepcopy(source_model)
    # freeze layers by so the weights do not update
    for param in target_model_tl.parameters():
        param.requires_grad = False

    # reassing last layer with requires_grad=true by default
    target_model_tl.layer_out = nn.Linear(target_model_tl.layer_out.in_features, target_model_tl.layer_out.out_features)

    optimizer = torch.optim.Adam(target_model_tl.parameters(),lr=learning_rate)
    loss_fn = nn.BCEWithLogitsLoss()

    target_model_tl = train(X_target_train, Y_target_train, target_model_tl, loss_fn, optimizer, train_prop = train_prop)

    target_tl_roc_auc = roc_auc_score(Y_target_test, target_model_tl(X_target_test).detach().numpy())
    print(f"target TL auc:\t{target_tl_roc_auc}") 
    
    return(source_model,
          target_model,
          target_model_tl)

In [5]:
def generate_data_uniform(p, n, min_b, mean_b, meanX, sdX, train_prop = 1.0, seed = None):
    
    if seed:
        random.seed(seed)
        torch.manual_seed(seed)
    
    # generate uniform b's
    low = min_b
    high = (2*mean_b) - min_b
    b = np.random.uniform(low, high, size=(p,1))
    # normal X's
    X = torch.from_numpy(np.random.normal(meanX, sdX, size=(n, p))).float()
    
    # logistic Y's
    Y = torch.from_numpy(np.random.binomial(1, expit( np.matmul(X,b)))).float()
    
    # split into training and test
    X_train = X
    Y_train = Y
    X_test = None
    Y_test = None
    
    if train_prop < 1.0:
        xy = torch.cat((Y,X),axis=1)        
        train_cutoff = int(n * train_prop)
        train = xy[:train_cutoff,:]
        Y_train, X_train =  train[:,0], train[:,1:]
        
        test = xy[train_cutoff:,:]
        Y_test, X_test = test[:,0], test[:, 1:]
    
    
    return Y_train, X_train, Y_test, X_test

In [6]:
def generate_data_normal(p, n, mean_b, sd_b, meanX, sdX, train_prop = 1.0, seed=None):
    
    if seed:
        random.seed(seed)
        torch.manual_seed(seed)
    
    # generate normal b's
    b = np.random.normal(mean_b, sd_b, size=(p,1))
    # normal X's
    X = torch.from_numpy(np.random.normal(meanX, sdX, size=(n, p))).float()
    
    # logistic Y's
    Y = torch.from_numpy(np.random.binomial(1, expit( np.matmul(X,b)))).float()
    
    # split into training and test
    X_train = X
    Y_train = Y
    X_test = None
    Y_test = None


    
    if train_prop < 1.0:
        xy = torch.cat((Y,X),axis=1)        
        train_cutoff = int(n * train_prop)
        train = xy[:train_cutoff,:]
        Y_train, X_train =  train[:,0], train[:,1:]
        
        test = xy[train_cutoff:,:]
        Y_test, X_test = test[:,0], test[:, 1:]
    
    
    return Y_train, X_train, Y_test, X_test

# TESTS!

### same source and target, uniform

In [21]:
seed = 1978

p = 60

# source params
n_source = 1000
min_b_source = 0.5
mean_b_source = 10.0
meanX_source = 0.0
sdX_source = 1.0
train_prop_source = 0.80

# target params
n_target = 200
min_b_target = 0.5
mean_b_target = 10.0
meanX_target = 0.0
sdX_target = 1.0
train_prop_target = 0.50


Y_source_train, X_source_train, Y_source_test, X_source_test = generate_data_uniform(p = p,
                                                n = n_source,
                                                 min_b = min_b_source,
                                                 mean_b = mean_b_source,
                                                 meanX =meanX_source,
                                                 sdX = sdX_source,
                                                 train_prop = train_prop_source,
                                                seed = seed)

    
Y_target_train, X_target_train, Y_target_test, X_target_test = generate_data_uniform(p = p,
                                                                n = n_target,
                                                                 min_b = min_b_target,
                                                                 mean_b = mean_b_target,
                                                                 meanX =meanX_target,
                                                                 sdX = sdX_target,
                                                                 train_prop = train_prop_target,
                                                                seed = seed)


source_model, target_model, target_tl_model = training_loop(X_source_train, Y_source_train,
                  X_source_test, Y_source_test,
                  X_target_train, Y_target_train,
                  X_target_test, Y_target_test)

source auc:	0.9894212185942426
target auc:	0.6841787439613527
target TL auc:	0.818840579710145


In [18]:
X_source_train

tensor([[-0.4945, -0.5697,  1.4653,  ...,  1.2432,  0.9389, -0.4801],
        [-1.0910, -0.6389,  0.8532,  ..., -1.1853,  0.4606,  0.8669],
        [-0.8857,  1.1082, -2.2344,  ..., -0.6588,  0.1089,  1.4029],
        ...,
        [ 0.0568, -1.1793,  0.3054,  ..., -2.5236,  0.9449,  1.9006],
        [ 0.1297,  0.4667,  0.2089,  ..., -1.2033,  0.5402, -1.6060],
        [-0.1110,  0.4089,  1.5392,  ...,  0.2708,  0.7136, -0.4332]])

In [17]:
pd.DataFrame(X_source_train.numpy()).to_csv("data/X_source_train.csv", index=False, header=False)
pd.DataFrame(Y_source_train.numpy()).to_csv("data/Y_source_train.csv", index=False, header=False)
pd.DataFrame(X_source_test.numpy()).to_csv("data/X_source_test.csv", index=False, header=False)
pd.DataFrame(Y_source_test.numpy()).to_csv("data/Y_source_test.csv", index=False, header=False)
#
pd.DataFrame(X_target_train.numpy()).to_csv("data/X_target_train.csv", index=False, header=False)
pd.DataFrame(Y_target_train.numpy()).to_csv("data/Y_target_train.csv", index=False, header=False)
pd.DataFrame(X_target_test.numpy()).to_csv("data/X_target_test.csv", index=False, header=False)
pd.DataFrame(Y_target_test.numpy()).to_csv("data/Y_target_test.csv", index=False, header=False)

In [20]:
X_source_test

tensor([[ 0.5505,  1.5966, -1.7085,  ...,  0.9672,  0.2403, -0.2485],
        [ 0.0170,  0.4634, -0.2252,  ...,  0.3834, -0.2627,  1.1167],
        [-0.7401, -2.3216,  2.5885,  ...,  0.2312, -0.5645,  1.1889],
        ...,
        [ 1.3207,  1.7277, -1.1563,  ..., -0.3781, -0.2164, -0.0283],
        [-0.1596, -0.3747,  0.8926,  ...,  1.0048, -0.5351, -1.6187],
        [-2.0624,  0.5458, -1.2427,  ...,  0.0154, -0.3726, -0.6892]])

In [222]:
t_np = t.numpy() #convert to Numpy array
df = pd.DataFrame(t_np) #convert to a dataframe
df.to_csv("testfile",index=False) #save to file


In [223]:
# for param in target_tl_model.parameters():
#     print(param)

### different parameterizations, both uniform
change in mean_b_target

In [226]:
seed = 1978

p = 60

# source params
n_source = 1000
min_b_source = 0.5
mean_b_source = 5.0
meanX_source = 0.0
sdX_source = 1.0
train_prop_source = 0.80

# target params
n_target = 200
min_b_target = 0.5
mean_b_target = 5.0
meanX_target = 0.0
sdX_target = 1.0
train_prop_target = 0.50


Y_source_train, X_source_train, Y_source_test, X_source_test = generate_data_uniform(p = p,
                                                n = n_source,
                                                 min_b = min_b_source,
                                                 mean_b = mean_b_source,
                                                 meanX =meanX_source,
                                                 sdX = sdX_source,
                                                 train_prop = train_prop_source,
                                                seed = seed)
                                                         
    
Y_target_train, X_target_train, Y_target_test, X_target_test = generate_data_uniform(p = p,
                                                                n = n_target,
                                                                 min_b = min_b_target,
                                                                 mean_b = mean_b_target,
                                                                 meanX =meanX_target,
                                                                 sdX = sdX_target,
                                                                 train_prop = train_prop_target,
                                                                seed = seed)


source_model, target_model, target_tl_model = training_loop(X_source_train, Y_source_train,
                  X_source_test, Y_source_test,
                  X_target_train, Y_target_train,
                  X_target_test, Y_target_test)

source auc:	0.9897836538461539
target auc:	0.6926770708283314
target TL auc:	0.803921568627451


### source and target normal

In [228]:
seed = 1978
p = 60

# source params
n_source = 1000
mean_b_source = 0.0
sd_b_source = 3.0
meanX_source = 0.0
sdX_source = 1.0
train_prop_source = 0.80

# target params
n_target = 200
mean_b_target = 0.0
sd_b_target = 3.0
meanX_target = 0.0
sdX_target = 1.0
train_prop_target = 0.50


Y_source_train, X_source_train, Y_source_test, X_source_test = generate_data_normal(p = p,
                                                n = n_source,
                                                 mean_b = mean_b_source,
                                                 sd_b =sd_b_source,
                                                 meanX =meanX_source,
                                                 sdX = sdX_source,
                                                 train_prop = train_prop_source,
                                                seed = seed)
                                                         
    
Y_target_train, X_target_train, Y_target_test, X_target_test = generate_data_normal(p = p,
                                                                n = n_target,
                                                                 mean_b = mean_b_target,
                                                                 sd_b = sd_b_target,
                                                                 meanX =meanX_target,
                                                                 sdX = sdX_target,
                                                                 train_prop = train_prop_target,
                                                                seed = seed)


source_model, target_model, target_tl_model = training_loop(X_source_train, Y_source_train,
                  X_source_test, Y_source_test,
                  X_target_train, Y_target_train,
                  X_target_test, Y_target_test)

source auc:	0.9729770531400965
target auc:	0.5390499194847022
target TL auc:	0.6167471819645732


### source normal target uniform

In [231]:
seed = 1978
p = 60

# source params
n_source = 1000
mean_b_source = 0.0
sd_b_source = 3.0
meanX_source = 0.0
sdX_source = 1.0
train_prop_source = 0.80

# target params
n_target = 200
mean_b_target = 0.0
sd_b_target = 3.0
meanX_target = 0.0
sdX_target = 1.0
train_prop_target = 0.50


n_target = 200
min_b_target = 0.5
mean_b_target = 5.0
meanX_target = 0.0
sdX_target = 1.0
train_prop_target = 0.50

Y_source_train, X_source_train, Y_source_test, X_source_test = generate_data_normal(p = p,
                                                n = n_source,
                                                 mean_b = mean_b_source,
                                                 sd_b =sd_b_source,
                                                 meanX =meanX_source,
                                                 sdX = sdX_source,
                                                 train_prop = train_prop_source,
                                                seed = seed)

Y_target_train, X_target_train, Y_target_test, X_target_test = generate_data_uniform(p = p,
                                                                n = n_target,
                                                                 min_b = min_b_target,
                                                                 mean_b = mean_b_target,
                                                                 meanX =meanX_target,
                                                                 sdX = sdX_target,
                                                                 train_prop = train_prop_target,
                                                                seed = seed)


source_model, target_model, target_tl_model = training_loop(X_source_train, Y_source_train,
                  X_source_test, Y_source_test,
                  X_target_train, Y_target_train,
                  X_target_test, Y_target_test)

source auc:	0.962155388471178
target auc:	0.6787439613526571
target TL auc:	0.48731884057971014
