In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [14]:
torch.cuda.is_available()

True

In [15]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs a feature row with a one-hot encoded label.

    Args:
        x: Indexable collection of feature rows.
        y: Indexable collection of integer-valued class labels, aligned with ``x``.
        num_classes: Length of the one-hot vector. When omitted, the notebook
            global ``range_y`` is read at encoding time (original behaviour).
        min_label: Smallest label value; it is subtracted from every label so
            that classes become zero-based. When omitted, the notebook global
            ``min_y`` is read at encoding time (original behaviour).
    """

    def __init__(self, x, y, num_classes=None, min_label=None):
        self.x = x
        self.y = y
        self.num_classes = num_classes
        self.min_label = min_label

    def int_to_onehot(self, indx):
        """Return a float32 one-hot vector for the class label ``indx``.

        Raises:
            IndexError: If the shifted label falls outside ``[0, num_classes)``.
        """
        # Prefer the explicit configuration; otherwise fall back to the
        # module-level values set by the data-loading cell.
        num_classes = int(range_y if self.num_classes is None else self.num_classes)
        min_label = min_y if self.min_label is None else self.min_label

        position = int(indx) - int(min_label)  # shift labels to be zero-based
        # Without this check a label below ``min_label`` would wrap around as a
        # negative index and silently mark the wrong class.
        if not 0 <= position < num_classes:
            raise IndexError(
                "label {} is outside the {} classes starting at {}".format(
                    indx, num_classes, min_label
                )
            )

        one_hot = torch.zeros(num_classes, dtype=torch.float32)
        one_hot[position] = 1.0
        return one_hot

    def __len__(self):
        """Number of samples, taken from the feature collection."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(features, one_hot_label)`` for sample ``idx``."""
        return self.x[idx], self.int_to_onehot(self.y[idx])

def train_model(net, X_train, Y_train, X_val, Y_val,
                epochs=100, lr=1e-3, batch_size=64, verbose=False):
    """Train ``net`` with Adam and cross-entropy loss.

    Args:
        net: The ``nn.Module`` to train. It must already live on the device it
            should be trained on; that device is read from its parameters.
        X_train: Training features (array-like, one row per sample).
        Y_train: Training labels, aligned with ``X_train``.
        X_val: Validation features; only used when ``verbose`` is true.
        Y_val: Validation labels; only used when ``verbose`` is true.
        epochs: Number of passes over the training data (default 100).
        lr: Adam learning rate (default 1e-3).
        batch_size: Mini-batch size of the training loader (default 64).
        verbose: When true, print train/validation accuracy and the summed
            batch loss after every epoch.

    Returns:
        Tuple ``(optimizer, loss_func)`` used for training.

    NOTE(review): ``nn.CrossEntropyLoss`` expects unnormalised logits, while
    ``NetX`` ends with ``nn.Softmax``. The combination is kept as-is here
    because changing it would alter the checkpoints this notebook produces;
    confirm whether the double normalisation is intended.
    """
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    ds = Dataset(X_train, Y_train)
    train_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)
    net = net.float()

    # Use the device the network actually lives on instead of a notebook global.
    net_device = next(net.parameters()).device

    if verbose:
        # Convert the evaluation data once instead of once per epoch.
        train_features = torch.as_tensor(X_train, dtype=torch.float32, device=net_device)
        val_features = torch.as_tensor(X_val, dtype=torch.float32, device=net_device)
        # ``Dataset`` shifts labels by ``min_y`` to make them zero-based, so
        # predictions are shifted back before being compared to the raw labels.
        label_offset = int(min_y)

    for epoch in range(epochs):
        running_loss = 0.0

        for curr_x, curr_y in train_loader:
            optimizer.zero_grad()
            outputs = net(curr_x.to(net_device).float())  # forward pass
            targets = torch.argmax(curr_y, dim=1)  # one-hot -> 1D class indices
            loss = loss_func(outputs, targets.to(net_device))
            # Backpropagation and parameter update.
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if not verbose:
            continue

        # Evaluate the network that was passed in (not a global), in eval mode
        # and without autograd: Dropout is disabled and BatchNorm running
        # statistics are not modified by the evaluation pass.
        was_training = net.training
        net.eval()
        try:
            with torch.no_grad():
                train_pred = torch.argmax(net(train_features), dim=1).cpu().numpy() + label_offset
                val_pred = torch.argmax(net(val_features), dim=1).cpu().numpy() + label_offset
        finally:
            net.train(was_training)  # restore the caller's mode

        train_acc = accuracy_score(Y_train, train_pred)
        validation_acc = accuracy_score(Y_val, val_pred)
        print("Epoch : {}, train acc : {}, train loss : {}, validation acc : {}".format(
            epoch + 1, train_acc, running_loss, validation_acc))

    return optimizer, loss_func


class NetX(nn.Module):
    """Randomly structured fully connected classifier.

    The architecture is sampled with ``np.random`` at construction time:
    1-6 hidden ``nn.Linear`` layers, each with a width drawn from
    100, 200, ..., 1000. Each hidden layer is optionally preceded by
    ``nn.Dropout`` or ``nn.BatchNorm1d`` and followed by Tanh, Sigmoid or
    (most of the time) ELU. The network ends with a ``nn.Linear`` to the
    number of classes and ``nn.Softmax(dim=1)``.

    Args:
        input_size: Number of input features. When omitted, the notebook
            global ``X`` is used (``X.shape[1]``), as in the original code.
        num_classes: Number of output classes. When omitted, the notebook
            global ``range_y`` is used, as in the original code.

    NOTE(review): ``train_model`` feeds the Softmax output of this network to
    ``nn.CrossEntropyLoss``, which expects unnormalised logits. The Softmax is
    kept so that saved models remain unchanged; confirm that this is intended.
    """

    def __init__(self, input_size=None, num_classes=None):
        super(NetX, self).__init__()
        layers = []
        layers_count = np.random.randint(1, 7)
        layer_neuron_size = list(range(100, 1100, 100))

        # Explicit arguments win; otherwise fall back to the notebook globals.
        previous_layer_size = X.shape[1] if input_size is None else int(input_size)
        output_size = int(range_y) if num_classes is None else int(num_classes)

        # The order of the random draws below is kept identical to the original
        # so that a seeded run still produces the same architectures.
        for _ in range(layers_count):
            # Plain int keeps numpy scalar types out of the module attributes.
            layer_size = int(np.random.choice(layer_neuron_size, 1)[0])

            norm = np.random.random()
            if norm <= 0.33:
                layers.append(nn.Dropout())
            elif norm <= 0.66:
                layers.append(nn.BatchNorm1d(previous_layer_size))
            # otherwise: no regularisation/normalisation layer

            layers.append(nn.Linear(previous_layer_size, layer_size))

            activation = np.random.random()
            if activation < 0.05:
                layers.append(nn.Tanh())
            elif activation < 0.1:
                layers.append(nn.Sigmoid())
            else:
                # TODO: Original ReLU usage causes NaNs in net_ac, layer_ac. Compare Leaky ReLU to ELU!
                # layers.append(nn.ReLU())
                layers.append(nn.ELU())

            previous_layer_size = layer_size

        layers.append(nn.Linear(previous_layer_size, output_size))
        layers.append(nn.Softmax(dim=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sampled layer stack and return its output."""
        return self.model(x)

In [16]:
# Names of the datasets to process. The training cell uses each name both as
# the dataset sub-directory and as the base name of the CSV file inside it.
# Commented-out entries are skipped; uncomment a line to include that dataset.
dataset_names=[
  # 'analcatdata_lawsuit',
  # 'ar4',
  # 'baseball',
  #'bodyfat',
  # 'chatfield_4',
  # 'chscase_vine1',
  # 'diabetes',
  # 'diggle_table_a2',
  # 'disclosure_z',
  # 'elusage',
  # 'fri_c0_250_5',
  # 'kc3',
  # 'lowbwt',
  # 'lupus',
  # # Important - with many classes!!
  #'mfeat-karhunen',
  #'mfeat-morphological',
  #'no2',
  #'pm10',
  #'prnn_synth',
  #'rabe_131',
  #'rmftsa_sleepdata',
  #'schlvote',
  #'tae',
  'teachingAssistant',
  #'transplant',
  #'triazines',
  #'veteran'
]





In [17]:
# For every configured dataset: load and scale it, persist the train/validation
# split, then train and checkpoint 30 randomly generated networks.
# The names X, min_y, range_y, device and netX are read as globals by
# Dataset, NetX and train_model, so they must keep these exact names.
for dataset_name in dataset_names:
  print("==================================================================================")
  print("Starting dataset ", dataset_name)
  # NOTE(review): absolute, machine-specific path; consider a configurable data directory.
  PATH = 'C:/Users/idopa/Documents/BGU/MSc/CNN-CompressionAgent/datasets/' + dataset_name + '/'

  df = pd.read_csv(PATH+dataset_name+'.csv')

  # 'bodyfat' stores its target as P/N letters; map them to 1/0 before the float cast.
  if dataset_name == 'bodyfat':
    df['binaryClass'] = df['binaryClass'].replace({'P': 1, 'N': 0})

  df = df.astype(float)

  # Last column is the class label, everything before it is a feature.
  dataset = df.values
  X = dataset[:, :-1]
  Y = dataset[:, -1]

  min_y, max_y = min(Y), max(Y)

  # Number of classes, assuming labels are consecutive integers in [min_y, max_y].
  range_y = int(max_y - min_y + 1)

  # NOTE(review): the scaler is fitted on all rows before the split, so the
  # validation rows influence how the training features are scaled.
  min_max_scaler = preprocessing.MinMaxScaler()
  X_scale = min_max_scaler.fit_transform(X)

  X_train, X_val, Y_train, Y_val = train_test_split(X_scale, Y, test_size=0.3, random_state=0)

  # Persist the exact split next to the dataset.
  for split_file, split_values in (
      ("./X_train.csv", X_train),
      ("./X_val.csv", X_val),
      ("./Y_train.csv", Y_train),
      ("./Y_val.csv", Y_val),
  ):
    pd.DataFrame(split_values).to_csv(PATH + split_file, index = False)

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  for i in range(30):
    print("Starting net ", i)
    netX = NetX()
    netX.to(device)
    opt, l = train_model(netX, X_train, Y_train, X_val, Y_val)

    # Store the full module alongside the state dicts and the loss object.
    checkpoint = {
        'model': netX,
        'state_dict': netX.state_dict(),
        'optimizer' : opt.state_dict(),
        'mission_type': "Classification",
        'loss': l,
    }

    torch.save(checkpoint, PATH + 'netX{}model.pt'.format(i))
    print("Saved net ", i)



Starting dataset  teachingAssistant
Starting net  0
Saved net  0
Starting net  1
Saved net  1
Starting net  2
Saved net  2
Starting net  3
Saved net  3
Starting net  4
Saved net  4
Starting net  5
Saved net  5
Starting net  6
Saved net  6
Starting net  7
Saved net  7
Starting net  8
Saved net  8
Starting net  9
Saved net  9
Starting net  10
Saved net  10
Starting net  11
Saved net  11
Starting net  12
Saved net  12
Starting net  13
Saved net  13
Starting net  14
Saved net  14
Starting net  15
Saved net  15
Starting net  16
Saved net  16
Starting net  17
Saved net  17
Starting net  18
Saved net  18
Starting net  19
Saved net  19
Starting net  20
Saved net  20
Starting net  21
Saved net  21
Starting net  22
Saved net  22
Starting net  23
Saved net  23
Starting net  24
Saved net  24
Starting net  25
Saved net  25
Starting net  26
Saved net  26
Starting net  27
Saved net  27
Starting net  28
Saved net  28
Starting net  29
Saved net  29


In [18]:
# import os
# import zipfile

# def zipdir(path, ziph):
#     # ziph is zipfile handle
#     for root, dirs, files in os.walk(path):
#         for file in files:
#             ziph.write(os.path.join(root, file))

# zipf = zipfile.ZipFile('/content/drive/My Drive/Fully Connected Training/Classification/Classification.zip', 'w', zipfile.ZIP_DEFLATED)
# zipdir('/content/drive/My Drive/Fully Connected Training/Classification/', zipf)
# zipf.close()