In [2]:
################
# Dependencies #
################
import copy
import math

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Use the first CUDA GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# split the training dataset into smaller chunks
def _write_chunk(out_path, chunk_no, header, customers, ids_file, series_length):
    """Write one chunk file and record each of its customers in the id index file."""
    with open(f"{out_path}/data{chunk_no}.csv", "w") as data_file:
        data_file.write(header)
        for customer_ID, rows in customers.items():
            # copy first element of series until series length is `series_length`
            # (a non-positive repeat count yields no padding, so longer series pass through unchanged)
            padded = [rows[0]] * (series_length - len(rows)) + rows
            ids_file.write(customer_ID + "," + str(chunk_no) + "\n")
            data_file.writelines(padded)


def split_file(file_path, out_path, customers_per_file, series_length=13):
    """Split a large per-customer CSV into chunk files holding a bounded number of customers.

    The input is streamed line by line. Rows are grouped by their first
    comma-separated field (the customer ID). Whenever a new customer ID shows up
    while `customers_per_file` customers are already buffered, the buffered
    customers are written to `<out_path>/data<N>.csv` and a new chunk is started.
    The remaining customers are written to one final chunk after the input ends
    (this final file is written even when no customers remain).

    `<out_path>/ids.csv` (columns `id,chunk`) records which chunk holds each customer.

    Args:
        file_path: path of the CSV to split; its first line is treated as the header.
        out_path: existing directory that receives `ids.csv` and the `data<N>.csv` files.
        customers_per_file: number of distinct customers buffered before a chunk is flushed.
        series_length: series shorter than this are left-padded with copies of
            their first row until they reach this many rows.

    Notes:
        The header of every chunk is copied from the input file, so the column
        names always match the data that is written.
        Rows of one customer are expected to be contiguous; a customer whose rows
        reappear after its chunk was flushed is registered again in a later chunk.
    """
    ids_path = '{}/{}.csv'.format(out_path, "ids")
    # Context managers guarantee both files are closed even if a read/write fails midway.
    with open(file_path) as bigfile, open(ids_path, "w") as ids_file:
        ids_file.write("id,chunk\n")

        header = bigfile.readline()
        if header and not header.endswith("\n"):
            header += "\n"

        pending = {}    # customer ID -> list of raw CSV lines, in first-seen order
        chunk_no = 0
        for line in bigfile:
            if not line.strip():
                continue    # ignore blank lines (e.g. a trailing empty line)
            if not line.endswith("\n"):
                line += "\n"    # last line of a file may lack a newline; padding copies need one

            customer_ID = line.split(",")[0]    # get customer ID of current line

            # a new customer arrives while the chunk is full => flush the chunk first
            if customer_ID not in pending and len(pending) == customers_per_file:
                _write_chunk(out_path, chunk_no, header, pending, ids_file, series_length)
                pending = {}
                chunk_no += 1

            pending.setdefault(customer_ID, []).append(line)

        # write whatever is left after the last full chunk
        _write_chunk(out_path, chunk_no, header, pending, ids_file, series_length)

In [5]:
# Number of distinct customers that split_file buffers before it starts a new chunk file.
customers_per_file = 10_000
# The split itself is disabled here; uncomment to (re)generate the chunk files.
#split_file("./train_data/train_data.csv","./train_data/train_data_chunks/", customers_per_file)

In [6]:
class AmericanExpressProfileTimeSeriesDatasetLowMemory(Dataset):
    """Dataset that serves one customer's rows at a time from chunked CSV files.

    Expects `data_folder` to contain `ids.csv` (columns `id`, `chunk`) and one
    `data<chunk>.csv` per chunk with a `customer_ID` column, i.e. the layout
    written by `split_file`. Only one chunk is held in memory at a time, so
    sequential (unshuffled) access is what keeps this dataset fast.

    Args:
        labels_file: CSV with `customer_ID` and `target` columns; only read when
            `labels_available` is true.
        data_folder: directory holding `ids.csv` and the chunk files.
        train: select the leading part of `ids.csv` (True) or the trailing part (False).
        train_percentage: fraction of the entries in `ids.csv` assigned to the leading part.
        labels_available: when False, `__getitem__` returns the customer ID instead of a label.
    """

    def __init__(self, labels_file, data_folder, train=True, train_percentage=0.7, labels_available=True):
        self.labels_available = labels_available
        self.data_folder = data_folder
        self.train = train

        if self.labels_available:
            self.labels = pd.read_csv(labels_file)
            # One-time lookup table so __getitem__ does not scan the whole label table per sample.
            # keep="first" mirrors taking the first matching row of the label table.
            unique_labels = self.labels.drop_duplicates(subset="customer_ID", keep="first")
            self._label_by_id = dict(zip(unique_labels["customer_ID"], unique_labels["target"]))

        # maps from customer_ID to the number of the chunk which stores the data for the customer_ID
        all_entries = pd.read_csv(data_folder+"/ids.csv")

        # A single split point is shared by both parts so that they are disjoint AND
        # cover every entry (using floor for one side and ceil for the other drops a
        # sample whenever len * train_percentage is not an integer).
        split_at = math.floor(len(all_entries) * train_percentage)
        self.start_index = 0 if train else split_at
        self.end_index = split_at if train else len(all_entries)

        self.chunk_indices = all_entries.iloc[self.start_index:self.end_index]

        # Preload the chunk that the first sample of this split lives in
        # (chunk 0 when the split is empty).
        first_chunk = int(self.chunk_indices["chunk"].iloc[0]) if len(self.chunk_indices) else 0
        self.current_chunk_index = None
        self.current_chunk_data = None
        self._load_chunk(first_chunk)

    def _load_chunk(self, chunk_index):
        """Read chunk `chunk_index` from disk and make it the cached chunk."""
        self.current_chunk_index = chunk_index
        self.current_chunk_data = pd.read_csv(self.data_folder + "/data" + str(chunk_index) + ".csv")

    def __len__(self):
        return len(self.chunk_indices)  # use self.chunk_indices instead of self.labels as some customer data with series length != 13 was deleted

    def __getitem__(self, idx):
        """Return `(data, label)` or, without labels, `(data, customer_ID)`.

        `data` is a float32 tensor of shape (n_feature_columns, n_rows_of_customer)
        placed on the notebook's `device`; `label` is a float32 scalar tensor on
        the same device.

        Raises:
            KeyError: if labels are enabled and the customer has no entry in the labels file.
        """
        entry = self.chunk_indices.iloc[idx]
        customer_ID = entry["id"]
        chunk_index = int(entry["chunk"])

        if chunk_index != self.current_chunk_index:
            self._load_chunk(chunk_index)

        rows = self.current_chunk_data[self.current_chunk_data["customer_ID"] == customer_ID]   # is slow for large files => use small chunks
        rows = rows.drop(['D_63', 'D_64', 'S_2', 'customer_ID'], axis=1)

        # TODO: can the time value be used? maybe as unix time number
        # TODO: create one hot encoding for missing 2 features

        rows = rows.fillna(0)   # TODO: Is it valid to fill all missing values with 0?
        # Inputs are plain data: no requires_grad, so autograd does not track
        # gradients with respect to the features.
        data = torch.tensor(rows.values, dtype=torch.float32).T
        data = data.to(device)

        if not self.labels_available:
            return data, customer_ID

        try:
            label = self._label_by_id[customer_ID]
        except KeyError:
            raise KeyError(f"no label found for customer_ID {customer_ID!r}") from None
        label = torch.tensor(label, dtype=torch.float32)
        label = label.to(device)
        return data, label

In [7]:
# Both splits are built from the same labels file and chunk folder; the third
# argument selects the training part (True) or the held-out part (False).
# do not shuffle as this would make the data loader very slow
train_dataset = AmericanExpressProfileTimeSeriesDatasetLowMemory(
    "./train_data/train_labels.csv",
    "./train_data/train_data_chunks/",
    train=True,
)
train_dataloader = DataLoader(train_dataset, batch_size=64)

test_dataset = AmericanExpressProfileTimeSeriesDatasetLowMemory(
    "./train_data/train_labels.csv",
    "./train_data/train_data_chunks/",
    train=False,
)
test_dataloader = DataLoader(test_dataset, batch_size=64)

In [8]:
class AmericanExpressProfileTimeSeriesDataset(Dataset):
    """In-memory dataset backed by a pickled pandas object with one sample per row.

    Each row must provide a `target` value plus the columns `customer_ID`,
    `D_63`, `D_64` and `S_2`, which are removed from the features; every
    remaining column must be convertible to float32.
    """

    # Columns that are not fed to the model as features.
    _NON_FEATURE_COLUMNS = ['customer_ID', 'D_63', 'D_64', 'S_2', 'target']

    def __init__(self, dataset_file):
        # NOTE(security): unpickling can execute arbitrary code; only load files from a trusted source.
        self.dataset = pd.read_pickle(dataset_file)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return `(features, label)` as float32 tensors for the row(s) selected by `idx`."""
        row = self.dataset.iloc[idx]
        # float32 tensor, matching what the chunked dataset in this notebook returns
        label = torch.tensor(np.asarray(row["target"], dtype=np.float32))

        if isinstance(row, pd.Series):
            # An integer index yields a Series whose *index* holds the column names;
            # a Series has no axis 1, so the entries are dropped by label instead.
            features = row.drop(labels=self._NON_FEATURE_COLUMNS)
        else:
            # A slice / list index yields a DataFrame; drop real columns.
            features = row.drop(columns=self._NON_FEATURE_COLUMNS)

        # Inputs are plain data, so no gradient tracking is requested for them.
        data = torch.tensor(np.array(features, dtype=np.float32), dtype=torch.float32)

        return data, label

In [9]:
###############################
# Neural Network Architecture #
###############################
class NeuralNetwork(nn.Module):
    """1-D convolutional binary classifier that outputs one probability per sample.

    Input is a float tensor of shape (batch, 186, length). The linear head
    takes exactly 100 features, so the convolution stack has to reduce the
    length axis to 1; for length 13 the sizes are 13 -> 11 -> 4 -> 2 -> 1.
    """

    def __init__(self):
        super().__init__()

        # Three strided convolutions followed by an average pool over the remaining positions.
        conv_layers = [
            nn.Conv1d(186, 200, 3, stride=1),
            nn.LeakyReLU(),
            nn.Conv1d(200, 150, 4, stride=2),
            nn.LeakyReLU(),
            nn.Conv1d(150, 100, 2, stride=2),
            nn.LeakyReLU(),
            nn.AvgPool1d(2, stride=1),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # Single logit squashed into (0, 1) by the sigmoid.
        head_layers = [nn.Linear(100, 1), nn.Sigmoid()]
        self.lin = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return a tensor of shape (batch,) holding the sigmoid output for each sample."""
        features = self.conv(x)
        flat = features.view(features.size(0), -1)      # (batch, channels * length)
        probs = self.lin(flat)                          # (batch, 1)
        return probs.view(probs.size(0))                # (batch,)

In [10]:
#################################
# Neural Network Model Creation #
#################################
# Build the network and place its parameters on the selected device in one step.
model = NeuralNetwork().to(device)

In [11]:
#################################
# Neural Network Testing Method #
#################################
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [12]:
##################################
# Neural Network Training Method #
##################################
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`, printing the loss every 100 batches.

    The model is put into training mode first, so the pass behaves the same
    regardless of whether an earlier step left the model in evaluation mode.

    Args:
        dataloader: iterable of `(X, y)` batches exposing a `.dataset` with `len()`.
        model: `nn.Module` being trained.
        loss_fn: callable `loss_fn(pred, y)` returning a scalar tensor.
        optimizer: optimizer constructed over `model`'s parameters.
    """
    size = len(dataloader.dataset)

    # Batches follow the model: they are moved to the device of its parameters
    # (a no-op when they are already there).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        if model_device is not None:
            X = X.to(model_device)
            y = y.to(model_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # .item() is only called on reporting batches
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [13]:
# Registry of trained models, keyed by epoch index.
models = dict()

In [14]:
#####################################
# Neural Network Training / Testing #
#####################################
#learning_rate = 1e-2
batch_size = 64     # NOTE: not read in this cell; the DataLoaders are created with a literal 64
epochs = 5

# BCELoss expects probabilities, which the network's final Sigmoid provides.
loss_fn = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters())


for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    # Store an independent copy: assigning `model` itself would make every entry
    # alias the one live object, so all epochs would show the final weights.
    models[t] = copy.deepcopy(model)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.705754  [    0/321239]
loss: 0.394563  [ 6400/321239]
loss: 0.273504  [12800/321239]
loss: 0.297438  [19200/321239]
loss: 0.242142  [25600/321239]
loss: 0.453587  [32000/321239]
loss: 0.257765  [38400/321239]
loss: 0.212439  [44800/321239]
loss: 0.287470  [51200/321239]
loss: 0.307891  [57600/321239]
loss: 0.230182  [64000/321239]
loss: 0.340282  [70400/321239]
loss: 0.262436  [76800/321239]
loss: 0.199133  [83200/321239]
loss: 0.174176  [89600/321239]
loss: 0.245485  [96000/321239]
loss: 0.177652  [102400/321239]
loss: 0.239323  [108800/321239]
loss: 0.312253  [115200/321239]
loss: 0.276236  [121600/321239]
loss: 0.392529  [128000/321239]
loss: 0.186904  [134400/321239]
loss: 0.192381  [140800/321239]
loss: 0.314477  [147200/321239]
loss: 0.228240  [153600/321239]
loss: 0.138480  [160000/321239]
loss: 0.206323  [166400/321239]
loss: 0.384298  [172800/321239]
loss: 0.181248  [179200/321239]
loss: 0.230546  [185600/321239]
loss: 0.148219  

In [None]:
#torch.save(model, "./model")
#model = torch.load("./model")
#model.eval()

In [None]:
#split_file("./test_data/test_data.csv","./test_data/test_data_chunks/", customers_per_file)

In [None]:
#validation_dataset = AmericanExpressProfileTimeSeriesDatasetLowMemory("", "./test_data/test_data_chunks/", True, 1, False)
#validation_dataloader = DataLoader(validation_dataset, batch_size=64) # do not shuffle as this would make the data loader very slow

In [None]:
def validation_loop(dataloader, model, out_file):
    """Write the model's prediction for every customer to `<out_file>.csv`.

    The output has the header `customer_ID,prediction` followed by one line per
    sample. The model is run in evaluation mode without gradient tracking and
    its previous training/eval mode is restored afterwards.

    Args:
        dataloader: iterable of `(X, customer_ID)` batches, where `customer_ID`
            is a sequence of strings aligned with the rows of `X`.
        model: `nn.Module` returning one value per sample.
        out_file: output path without the `.csv` extension.
    """
    was_training = model.training
    model.eval()
    try:
        # The context manager closes the file even if a batch fails midway.
        with torch.no_grad(), open(f"{out_file}.csv", "w") as data_file:
            data_file.write("customer_ID,prediction\n")
            for X, customer_ID in dataloader:
                pred = model(X).tolist()
                for current_id, current_pred in zip(customer_ID, pred):
                    data_file.write(current_id + "," + str(current_pred) + "\n")
    finally:
        model.train(was_training)

In [None]:
#validation_loop(validation_dataloader, model, "./test_data/test_labels")