In [1]:
import os

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [2]:
# Ask PyTorch whether a CUDA device is usable; as a bare expression the
# result is shown as this cell's output.
torch.cuda.is_available()

True

In [13]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing feature rows with one-hot encoded labels.

    Parameters
    ----------
    x : indexable
        Feature rows; ``x[idx]`` is returned unchanged by ``__getitem__``.
    y : indexable
        Class labels aligned with ``x``. Each label is truncated with
        ``int()`` before being encoded.
    num_classes : int, optional
        Width of the one-hot vector. When omitted, the module-level
        ``range_y`` is looked up every time a label is encoded.
    min_label : number, optional
        Label value that maps to position 0. When omitted, the module-level
        ``min_y`` is looked up every time a label is encoded.
    """

    def __init__(self, x, y, num_classes=None, min_label=None):
        self.x = x
        self.y = y
        self.num_classes = num_classes
        self.min_label = min_label

    def int_to_onehot(self, indx):
        """Encode the label ``indx`` as a float one-hot vector.

        Raises
        ------
        IndexError
            If the label falls outside ``[min_label, min_label + num_classes)``.
            Without this check a label below the minimum would produce a
            negative index and silently mark the wrong class.
        """
        # Fall back to the notebook-level globals only when no explicit value
        # was given, so existing two-argument callers keep working.
        width = int(range_y if self.num_classes is None else self.num_classes)
        lowest = int(min_y if self.min_label is None else self.min_label)

        position = int(indx) - lowest  # shift labels so they are zero-based
        if not 0 <= position < width:
            raise IndexError(
                "label {} is outside the expected range [{}, {}]".format(
                    indx, lowest, lowest + width - 1
                )
            )

        one_hot = torch.zeros(width).float()
        one_hot[position] = 1.0
        return one_hot

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(features, one_hot_label)`` for the sample at ``idx``."""
        return self.x[idx], self.int_to_onehot(self.y[idx])

def _predict_class_indices(net, features, device):
    """Return the arg-max output index for every row of ``features``.

    The network is put in evaluation mode for the forward pass, so Dropout is
    disabled and BatchNorm uses (and does not update) its running statistics.
    Gradient tracking is turned off, and the mode the network was in
    beforehand is restored before returning.
    """
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            scores = net(torch.Tensor(features).to(device).float())
        return torch.argmax(scores.cpu(), 1)
    finally:
        net.train(was_training)


def train_model(net, X_train, Y_train, X_val, Y_val,
                epochs=100, batch_size=64, lr=1e-3, verbose=False):
    """Train ``net`` with Adam and cross-entropy loss.

    Parameters
    ----------
    net : nn.Module
        Network to train in place. Batches are moved to the device its
        parameters already live on.
    X_train, Y_train : array-like
        Training features and labels; wrapped in ``Dataset`` for batching.
    X_val, Y_val : array-like
        Validation features and labels; only used when ``verbose`` is true.
    epochs : int, default 100
        Number of passes over the training data.
    batch_size : int, default 64
        Mini-batch size used by the training loader.
    lr : float, default 1e-3
        Adam learning rate.
    verbose : bool, default False
        When true, train/validation accuracy is computed after every epoch
        and printed with the summed batch loss. When false the metrics are
        skipped entirely, since they were computed and discarded before.

    Returns
    -------
    tuple
        ``(optimizer, loss_func)`` as used during training.
    """
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net = net.float()
    # Use the network's own device instead of a notebook-level global.
    target_device = next(net.parameters()).device

    ds = Dataset(X_train, Y_train)
    # BatchNorm in training mode rejects a batch holding a single sample, so
    # drop a trailing one-sample batch for networks that contain BatchNorm.
    has_batch_norm = any(
        isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d))
        for module in net.modules()
    )
    drop_last = has_batch_norm and len(ds) > batch_size and len(ds) % batch_size == 1
    train_loader = torch.utils.data.DataLoader(
        ds, batch_size=batch_size, shuffle=True, drop_last=drop_last
    )

    for epoch in range(epochs):
        running_loss = 0.0

        for curr_x, curr_y in train_loader:
            optimizer.zero_grad()
            outputs = net(curr_x.to(target_device).float())  # forward pass
            # The dataset yields one-hot rows; CrossEntropyLoss wants indices.
            targets = curr_y.argmax(dim=1).to(target_device)
            loss = loss_func(outputs, targets)
            # Backpropagation and parameter update.
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if verbose:
            # Output index i stands for label i + min_y (the same offset the
            # dataset applies when encoding), so shift predictions back into
            # label space before comparing with the raw labels.
            label_offset = int(min_y)
            train_pred = _predict_class_indices(net, X_train, target_device).numpy() + label_offset
            validation_pred = _predict_class_indices(net, X_val, target_device).numpy() + label_offset
            train_acc = accuracy_score(Y_train, train_pred)
            validation_acc = accuracy_score(Y_val, validation_pred)
            print("Epoch : {}, train acc : {}, train loss : {}, validation acc : {}".format(
                epoch + 1, train_acc, running_loss, validation_acc))

    return optimizer, loss_func


class NetX(nn.Module):
    """Fully connected classifier whose architecture is drawn at random.

    On construction the network samples, through ``np.random``:

    * 1 to 6 hidden ``nn.Linear`` layers, each 100 to 1000 units wide in
      steps of 100;
    * before each hidden layer, one of Dropout, BatchNorm1d, or nothing
      (roughly one third each);
    * after each hidden layer, Tanh (5%), Sigmoid (5%) or SiLU (90%).

    A final ``nn.Linear`` maps to the class count and is followed by Softmax.

    Parameters
    ----------
    input_size : int, optional
        Number of input features. When omitted, ``X.shape[1]`` is read from
        the module-level ``X``.
    output_size : int, optional
        Number of output classes. When omitted, the module-level ``range_y``
        is used.
    """

    def __init__(self, input_size=None, output_size=None):
        super(NetX, self).__init__()

        # Fall back to the notebook-level globals only when no explicit size
        # is given, so the existing ``NetX()`` call keeps working.
        previous_layer_size = int(X.shape[1] if input_size is None else input_size)
        class_count = int(range_y if output_size is None else output_size)

        layers = []
        layers_count = np.random.randint(1, 7)
        layer_neuron_size = list(range(100, 1100, 100))

        for _ in range(layers_count):
            # Cast to a plain int so layers do not store numpy scalars.
            layer_size = int(np.random.choice(layer_neuron_size, 1)[0])

            # Optional regularisation placed in front of the linear layer.
            norm = np.random.random()
            if norm <= 0.33:
                layers.append(nn.Dropout())
            elif norm <= 0.66:
                layers.append(nn.BatchNorm1d(previous_layer_size))

            layers.append(nn.Linear(previous_layer_size, layer_size))

            activation = np.random.random()
            if activation < 0.05:
                layers.append(nn.Tanh())
            elif activation < 0.1:
                layers.append(nn.Sigmoid())
            else:
                # TODO: Original ReLU usage causes NaNs in net_ac, layer_ac.
                # Compare Leaky ReLU to ELU! ELU works fine; currently
                # evaluating SiLU [aka Swish = x * sigmoid(x)].
                layers.append(nn.SiLU())

            previous_layer_size = layer_size

        layers.append(nn.Linear(previous_layer_size, class_count))
        # NOTE(review): train_model passes these outputs to nn.CrossEntropyLoss,
        # which is documented to expect unnormalised logits. The Softmax is
        # kept because it defines what the saved models emit; confirm with the
        # consumers of the checkpoints before removing it.
        layers.append(nn.Softmax(dim=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (batch, input_size) through the sampled layer stack."""
        return self.model(x)

In [21]:
# Names of the datasets to process, ordered by size (increasing).
# The processing loop appends '.csv' to each name to locate its input file.
# Commented-out entries are excluded from the run.
# NOTE(review): the '##' entries look like datasets that were skipped for
# having a non-numeric column (the recorded output shows this for kropt and
# jungle_chess_2pcs_raw_endgame_complete) - TODO confirm.
dataset_names=[
    # 'rmftsa_sleepdata',
    # 'diggle_table_a2',
    # 'no2',
    # 'pm10',
    # 'disclosure_z',
    # 'diabetes',
    # 'mfeat-morphological',
    # 'cardiotocography',
    # 'nursery',
    # 'phoneme',
    ## 'bank-marketing',
    #'kr-vs-k',
    ##'kropt',
    ##'jungle_chess_2pcs_raw_endgame_complete',
    '2dplanes',
    'tamilnadu-electricityarff',
    'house_8L',
    'mfeat-karhunen',
    'amazon_employee_access',
    'satimage',
    'ailerons',
    'fried',
    'first-order-theorem-proving',
    'electricity-normalized',
    'CreditCardSubset',    
    'airlines',    
    'nomao',               
    # 'mnist',
    # 'cifar-100',
    # 'cifar-10',        
    # 'svhn',    
]





In [22]:
# Folder holding one '<dataset>.csv' per tabular dataset. A sub-folder per
# dataset is created underneath it for the train/validation splits and the
# trained network checkpoints.
DATASETS_DIR = 'C:/Users/idopa/Documents/BGU/MSc/SPECTRA-CompressionAgent/datasets/NEON-csv/'
# Image datasets are not stored in DATASETS_DIR and are skipped by this loop.
IMAGE_DATASETS = ['cifar-10', 'cifar-100', 'mnist', 'svhn']
# Columns whose 'P'/'N' values are mapped to 1/0 before the float conversion.
PN_ENCODED_COLUMNS = ['binaryClass', 'unemployed']
NETS_PER_DATASET = 5

# The device does not depend on the dataset, so pick it once.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for dataset_name in dataset_names:
    print("==================================================================================")
    print("Starting dataset ", dataset_name)
    if dataset_name in IMAGE_DATASETS:
        continue
        #PATH = 'C:/Users/idopa/Documents/BGU/MSc/SPECTRA-CompressionAgent/datasets/SPECTRA-csv/'

    PATH = DATASETS_DIR
    df = pd.read_csv(PATH + dataset_name + '.csv')

    try:
        for col in df.columns:
            if col in PN_ENCODED_COLUMNS:
                df[col] = df[col].replace({'P': 1, 'N': 0})
            df[col] = df[col].astype(float)
    except ValueError:  # non-numeric column, skip for now
        print(f"dataset {dataset_name} was skipped due to non-numeric column")
        continue

    PATH += f'{dataset_name}/'
    # exist_ok lets the cell be re-run; os.mkdir raised FileExistsError as
    # soon as the folder from a previous run was still present.
    os.makedirs(PATH, exist_ok=True)

    # The last column is the label, everything before it is a feature.
    # X, min_y and range_y are read as module-level globals by the class
    # definitions earlier in the notebook, so they keep these exact names.
    dataset = df.values
    X = dataset[:, :-1]
    Y = dataset[:, -1]

    min_y = Y.min()
    max_y = Y.max()

    # Assumes labels are consecutive integers between min_y and max_y.
    range_y = int(max_y - min_y + 1)

    min_max_scaler = preprocessing.MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X)

    X_train, X_val, Y_train, Y_val = train_test_split(X_scale, Y, test_size=0.3, random_state=0)

    # Persist the exact split next to the checkpoints.
    for split_name, split_values in (("X_train", X_train), ("X_val", X_val),
                                     ("Y_train", Y_train), ("Y_val", Y_val)):
        pd.DataFrame(split_values).to_csv(os.path.join(PATH, split_name + ".csv"), index=False)

    for i in range(NETS_PER_DATASET):
        print("Starting net ", i+1)
        netX = NetX()
        netX.to(device)
        opt, loss_func = train_model(netX, X_train, Y_train, X_val, Y_val)

        # The whole module is stored alongside its state_dict, so loading the
        # checkpoint requires the NetX class to be importable at load time.
        checkpoint = {'model': netX,
                      'state_dict': netX.state_dict(),
                      'optimizer': opt.state_dict(),
                      'mission_type': "Classification",
                      'loss': loss_func
                      }

        torch.save(checkpoint, PATH + 'netX{}model.pt'.format(i+1))
        print("Saved net ", i+1)



Starting dataset  kropt
dataset kropt was skipped due to non-numeric column
Starting dataset  jungle_chess_2pcs_raw_endgame_complete
dataset jungle_chess_2pcs_raw_endgame_complete was skipped due to non-numeric column
Starting dataset  2dplanes
Starting net  1
Saved net  1
Starting net  2
Saved net  2
Starting net  3
Saved net  3
Starting net  4
Saved net  4
Starting net  5
Saved net  5
Starting dataset  tamilnadu-electricityarff
Starting net  1
Saved net  1
Starting net  2
Saved net  2
Starting net  3
Saved net  3
Starting net  4
Saved net  4
Starting net  5
Saved net  5
Starting dataset  house_8L
Starting net  1
Saved net  1
Starting net  2
Saved net  2
Starting net  3
Saved net  3
Starting net  4
Saved net  4
Starting net  5
Saved net  5
Starting dataset  mfeat-karhunen
Starting net  1
Saved net  1
Starting net  2
Saved net  2
Starting net  3
Saved net  3
Starting net  4
Saved net  4
Starting net  5
Saved net  5
Starting dataset  amazon_employee_access
Starting net  1
Saved net  1
S

In [18]:
# import os
# import zipfile

# def zipdir(path, ziph):
#     # ziph is zipfile handle
#     for root, dirs, files in os.walk(path):
#         for file in files:
#             ziph.write(os.path.join(root, file))

# zipf = zipfile.ZipFile('/content/drive/My Drive/Fully Connected Training/Classification/Classification.zip', 'w', zipfile.ZIP_DEFLATED)
# zipdir('/content/drive/My Drive/Fully Connected Training/Classification/', zipf)
# zipf.close()