In [13]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils import data

from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler, Sampler

# Functions for dataset preparation
Data is joined by 1 hour intervals

In [14]:
def get_data(ndays):
    path_in = '../../ionosphere_dataset/NOAA/NOAA_datasets_for_ML/dataset_NN_model/sondes_in_ml.csv'
    df_inside = pd.read_csv(path_in, sep=',')

    path_out = '../../ionosphere_dataset/NOAA/NOAA_datasets_for_ML/dataset_NN_model/sondes_year_before_after_ml.csv'
    df_outside = pd.read_csv(path_out, sep=',')
    print(df_inside.id.unique().shape[0],df_outside.id.unique().shape[0])

    df_inside.nday = df_inside.nday.astype('int')
    df_inside = df_inside[(df_inside.nday < ndays)]

    df_outside.nday = df_outside.nday.astype('int')
    df_outside = df_outside[(df_outside.nday < ndays)]

    # join data per hour
    means = df_inside.groupby(['id', 'date', 'nday', 'h']).mean()
    df_inside = means.reset_index()

    means = df_outside.groupby(['id', 'date', 'nday', 'h']).mean()
    df_outside = means.reset_index()
    
    df_inside = df_inside.groupby('id').filter(lambda x: x['D'].count() == 24*ndays)
    df_inside.reset_index(drop=True, inplace=True)
    # dfnew.id.unique().shape

    df_outside = df_outside.groupby('id').filter(lambda x: x['D'].count() == 24*ndays)
    df_outside.reset_index(drop=True, inplace=True)
    
    df_inside['res'] = 1
    df_outside['res'] = 0
    df_inside.reset_index(drop=True, inplace=True)
    df_outside.reset_index(drop=True, inplace=True)
    ds = pd.concat([df_inside, df_outside], sort=True)
    ds = ds.drop(columns=['m', 'date'])
    
    print('dataset size:', ds.id.unique().shape, ds.shape)
    print('negative samples: %f, positive samples: %f'% (df_inside.id.unique().shape[0], df_outside.id.unique().shape[0]))
    return ds
    

def remove_feature(ds, feature):
    return ds.drop(columns = [feature])

def ds_to_tensor(ds):
    grouped_by_id = ds.groupby(['id', 'nday',])
    X = []
    y = []
    for name, g in grouped_by_id:
        X.append(g.drop(columns=['id', 'nday', 'h']).to_numpy())
        y.append(g.res.iloc[0])
#     print('len(X), len(y)', len(X), len(y))
    X = torch.Tensor(X)
    y = np.asarray(y, dtype=np.float32)
    return X, y

class Dataset(data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        # Select sample
        X = self.X[index]
        y = self.y[index]
        return X, y


## Building model and splitting data

In [15]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, 3)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        sample = []
        for eq in x:
            inner_sample = []
            for h in eq:
                inner_sample.append(self.linear1(h.type(torch.FloatTensor)))
            sample.append(torch.cat(inner_sample, dim = 0))
        y_pred = self.linear2(torch.stack(sample, dim = 0))
        return y_pred
    

def split_data(dataset, batch_size):
    data_size = dataset.__len__()
    validation_split = .2
    split = int(np.floor(validation_split * data_size))
    indices = list(range(data_size))
    np.random.shuffle(indices)

    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             sampler=val_sampler)
    return train_loader, val_loader

In [16]:
def train_model(model, train_loader, val_loader, loss, optimizer, num_epochs, lr_scheduler = None):    
    loss_history = []
    train_history = []
    val_history = []
    for epoch in range(num_epochs):
        model.train() # Enter train mode
        
        loss_accum = 0
        correct_samples = 0
        total_samples = 0
        for i_step, (x, y) in enumerate(train_loader):
            prediction = model(x) 
            loss_value = loss(prediction, y.long())
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            
            _, indices = torch.max(prediction, 1)
            correct_samples += torch.sum(indices == y)
            total_samples += y.shape[0]
            
        if lr_scheduler: 
            lr_scheduler.step()
        train_accuracy = float(correct_samples) / total_samples
        print('correct_samples=', correct_samples, 'total_samples =', total_samples)
        val_accuracy = compute_accuracy(model, val_loader)
        
        loss_history.append(float(loss_accum))
        train_history.append(train_accuracy)
        val_history.append(val_accuracy)
        
        print("loss: %f, Train accuracy: %f, Val accuracy: %f" % (loss_value, train_accuracy, val_accuracy))
        
    return loss_history, train_history, val_history
        
def compute_accuracy(model, loader):
    """
    Computes accuracy on the dataset wrapped in a loader
    
    Returns: accuracy as a float value between 0 and 1
    """
    model.eval() 
    correct = 0
    total = 0
    
    for x, y in loader:
        batch_pred = model(x).argmax(1)
        correct += (batch_pred == y).nonzero().size(0)
        total += y.size(0)     
    acc = correct / total
    return acc


## Example

In [17]:
ds = get_data(ndays = 1)
X, y = ds_to_tensor(ds)
dataset = Dataset(X, y)

train_loader, val_loader = split_data(dataset, batch_size = 64)

D_in = 15
H, D_out = 72, 2
my_model = TwoLayerNet(D_in, H, D_out)
my_model.type(torch.FloatTensor)

loss = nn.CrossEntropyLoss().type(torch.FloatTensor)
optimizer = optim.Adam(my_model.parameters(), lr=0.001)
# lr_schededuler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

loss_history, train_history, val_history = train_model(my_model, train_loader, val_loader, loss, optimizer, num_epochs = 10)

732 1676
dataset size: (1868,) (44832, 18)
negative samples: 547.000000, positive samples: 1321.000000
correct_samples= tensor(879) total_samples = 1495
loss: 2.602733, Train accuracy: 0.587960, Val accuracy: 0.313673
correct_samples= tensor(835) total_samples = 1495
loss: 3.242531, Train accuracy: 0.558528, Val accuracy: 0.710456
correct_samples= tensor(883) total_samples = 1495
loss: 2.034647, Train accuracy: 0.590635, Val accuracy: 0.436997
correct_samples= tensor(867) total_samples = 1495
loss: 1.268242, Train accuracy: 0.579933, Val accuracy: 0.670241
correct_samples= tensor(894) total_samples = 1495
loss: 1.568455, Train accuracy: 0.597993, Val accuracy: 0.710456
correct_samples= tensor(912) total_samples = 1495
loss: 1.161656, Train accuracy: 0.610033, Val accuracy: 0.471850
correct_samples= tensor(879) total_samples = 1495
loss: 2.565173, Train accuracy: 0.587960, Val accuracy: 0.568365
correct_samples= tensor(909) total_samples = 1495
loss: 1.186947, Train accuracy: 0.608027, 