In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import time

# Loading and Organizing Data

In [2]:
# Labelled node pairs used for training/validation, and the unlabelled pairs
# that are scored for the submission.
data = pd.read_csv('train.csv')
data_eval = pd.read_csv('test.csv')

# Size of the embedding table. The value was originally derived from the node
# list as `pd.read_csv('nodes/nodes.tsv', delimiter='\t', usecols=['id']).max()['id'] + 1`.
total_nodes = 836625

# Every id is used as a row index into an embedding table with `total_nodes`
# rows, so fail early (with a readable message) if any id falls outside it.
for frame_name, frame in (('train.csv', data), ('test.csv', data_eval)):
    pair_ids = frame[['id1', 'id2']].to_numpy()
    assert pair_ids.min() >= 0 and pair_ids.max() < total_nodes, (
        f"{frame_name}: node ids must lie in [0, {total_nodes}), "
        f"found range [{pair_ids.min()}, {pair_ids.max()}]"
    )

# Kept as the last expression so the notebook actually displays the preview.
data.head()

In [3]:
# Hold out 20% of the labelled pairs for validation; the fixed random_state
# makes the split repeatable between runs.
split_frames = train_test_split(data, test_size=0.2, random_state=42)
train_data, test_data = split_frames

# Model

In [4]:
class MyNetwork(nn.Module):
    """Binary link predictor over pairs of node ids.

    Both ids are looked up in one shared embedding table, each embedding is
    passed through its own two-layer ReLU branch, the two branch outputs are
    concatenated, and a three-layer head with a final sigmoid produces one
    probability per pair.

    Args:
        vocabulary_size: number of rows in the embedding table (ids must be
            smaller than this).
        embedding_dim: width of each embedding vector. Defaults to 100, the
            value previously hard-coded (the original author noted that the
            best model used 80).
    """

    def __init__(self, vocabulary_size, embedding_dim=100):
        super(MyNetwork, self).__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and saved state dicts stay compatible.
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.relu = nn.ReLU()

        # Branch 1 (first id) and branch 2 (second id): embedding_dim -> 50 -> 25.
        self.lin1_1 = nn.Linear(embedding_dim, 50)
        self.lin2_1 = nn.Linear(embedding_dim, 50)
        self.lin1_2 = nn.Linear(50, 25)
        self.lin2_2 = nn.Linear(50, 25)

        # Head over the concatenated branches: 25 + 25 -> 25 -> 10 -> 1.
        self.lin_out1 = nn.Linear(50, 25)
        self.lin_out2 = nn.Linear(25, 10)
        self.lin_out3 = nn.Linear(10, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x1, x2):
        """Score a batch of node pairs.

        Args:
            x1: integer tensor of first-node ids; expected to be 1-D, since
                the branch outputs are concatenated along dim 1.
            x2: integer tensor of second-node ids, same length as ``x1``.

        Returns:
            Tensor of shape ``(batch,)`` holding the sigmoid output per pair.
        """
        x1 = self.embedding(x1)
        x1 = self.lin1_1(x1)
        x1 = self.relu(x1)
        x1 = self.lin1_2(x1)
        x1 = self.relu(x1)

        x2 = self.embedding(x2)
        x2 = self.lin2_1(x2)
        x2 = self.relu(x2)
        x2 = self.lin2_2(x2)
        x2 = self.relu(x2)

        x = torch.cat((x1, x2), dim=1)
        x = self.lin_out1(x)
        x = self.relu(x)
        x = self.lin_out2(x)
        x = self.relu(x)
        x = self.lin_out3(x)
        x = self.sig(x)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when the batch holds a single pair,
        # producing a 0-d tensor that no longer matches a (1,)-shaped target.
        x = x.squeeze(-1)
        return x

In [5]:
class MyDataset(Dataset):
    """Map-style dataset over a DataFrame of node pairs.

    Args:
        data: DataFrame with ``id1``, ``id2`` and ``label`` columns. Rows are
            addressed by position, so the frame's index values are irrelevant.

    The three columns are extracted to NumPy arrays once at construction.
    Looking each sample up through ``data.iloc[i]`` builds a fresh row Series
    per access (three times per sample), and it upcasts the ids to float when
    the frame also contains a float column. Note that changes made to ``data``
    after construction are not guaranteed to be reflected in the samples.
    """

    def __init__(self, data):
        self.data = data
        self._id1 = data['id1'].to_numpy()
        self._id2 = data['id2'].to_numpy()
        self._label = data['label'].to_numpy()

    def __len__(self):
        """Number of node pairs."""
        return len(self.data)

    def __getitem__(self, i):
        """Return ``(id1, id2, label)`` for the pair at position ``i``."""
        return self._id1[i], self._id2[i], self._label[i]
# Mini-batches of 1024 pairs drawn from the training split, reshuffled each epoch.
train_dataset = MyDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

In [6]:
# Seed torch's global RNG so weight initialisation (and the DataLoader's
# shuffling, which draws from the same generator) is repeatable.
torch.manual_seed(42)

# Pick the device first and move the model onto it *before* building the
# optimizer: the training loop sends every batch to `device`, so the model's
# parameters have to live there as well.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = MyNetwork(total_nodes).to(device)
criterion = nn.BCELoss()  # the model already applies a sigmoid to its output
optimizer = optim.Adam(model.parameters(), lr=0.0075)

In [7]:
# Two passes over the training split. A progress line with the current
# batch's loss is printed every 100 mini-batches.
model.train()
for epoch in range(2):
    for batch_idx, (id1, id2, link) in enumerate(train_loader):
        id1 = id1.to(device)
        id2 = id2.to(device)
        link = link.to(device)

        optimizer.zero_grad()
        output = model(id1.int(), id2.int())

        # BCELoss needs a floating-point target.
        loss = criterion(output, link.float())
        loss.backward()
        optimizer.step()

        if batch_idx % 100 != 0:
            continue
        print(f'Epoch: {epoch} {batch_idx * len(id1)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%) - Loss: {loss.item()}')
 

Epoch: 0 0/758585 (0%) - Loss: 0.6970497965812683
Epoch: 0 102400/758585 (13%) - Loss: 1.9896033336408436e-05
Epoch: 0 204800/758585 (27%) - Loss: 1.9273200450697914e-05
Epoch: 0 307200/758585 (40%) - Loss: 1.7723590644891374e-05
Epoch: 0 409600/758585 (54%) - Loss: 3.4816077914001653e-06
Epoch: 0 512000/758585 (67%) - Loss: 3.523344503264525e-06
Epoch: 0 614400/758585 (81%) - Loss: 2.45422938860429e-06
Epoch: 0 716800/758585 (94%) - Loss: 3.322026168461889e-05
Epoch: 1 0/758585 (0%) - Loss: 2.0927167497575283e-05
Epoch: 1 102400/758585 (13%) - Loss: 1.703457655821694e-06
Epoch: 1 204800/758585 (27%) - Loss: 1.3549104096455267e-06
Epoch: 1 307200/758585 (40%) - Loss: 5.7183926401194185e-05
Epoch: 1 409600/758585 (54%) - Loss: 7.025753347988939e-06
Epoch: 1 512000/758585 (67%) - Loss: 3.281152658018982e-06
Epoch: 1 614400/758585 (81%) - Loss: 3.615905006881803e-05
Epoch: 1 716800/758585 (94%) - Loss: 2.8207950890646316e-05


# Validation

In [8]:
# Score the held-out split in a single forward pass.
# - eval() puts the model in inference mode (training cells switch it back
#   with model.train()).
# - The id tensors are created on whatever device the model's parameters
#   live on, so the call works whether the model is on CPU or GPU.
# - The result is brought back to the CPU for the metric computation.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    val_id1 = torch.tensor(test_data['id1'].to_numpy(), dtype=torch.int, device=model_device)
    val_id2 = torch.tensor(test_data['id2'].to_numpy(), dtype=torch.int, device=model_device)
    y_pred = model(val_id1, val_id2).cpu()
    

In [9]:
# Turn probabilities into hard 0/1 labels and hand scikit-learn a plain NumPy
# integer array: passing the torch tensor directly relies on implicit
# conversion and fails outright if the tensor lives on a GPU.
y_pred_labels = torch.round(y_pred).detach().cpu().numpy().astype(int).reshape(-1)
test_score = f1_score(test_data['label'], y_pred_labels)
print("F1 Score:", test_score)

F1 Score: 0.9999770210028034


# Training on the entire training dataset

In [10]:
# Rebuild the loader over the complete labelled set (training + held-out rows).
full_dataset = MyDataset(data)
train_loader = DataLoader(full_dataset, batch_size=1024, shuffle=True)

# The model and the loss from the earlier cells are reused as they are
# (the original cell had their re-creation commented out); only the optimizer
# is created anew, with a smaller learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
# Two more epochs, now over the full labelled set, with wall-clock timing.
start = time.time()
model.train()
for epoch in range(2):
    for batch_idx, (id1, id2, link) in enumerate(train_loader):
        id1 = id1.to(device)
        id2 = id2.to(device)
        link = link.to(device)

        optimizer.zero_grad()
        output = model(id1.int(), id2.int())

        # BCELoss needs a floating-point target.
        loss = criterion(output, link.float())
        loss.backward()
        optimizer.step()

        # Report progress on every 100th mini-batch only.
        if batch_idx % 100 != 0:
            continue
        print(f'Epoch: {epoch} {batch_idx * len(id1)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%) - Loss: {loss.item()}')


print("Elapsed time:")
end = time.time()
print(end - start)


Epoch: 0 0/948232 (0%) - Loss: 1.967428886473499e-07
Epoch: 0 102400/948232 (11%) - Loss: 3.530887795477611e-07
Epoch: 0 204800/948232 (22%) - Loss: 4.79039954370819e-07
Epoch: 0 307200/948232 (32%) - Loss: 7.262039503075357e-07
Epoch: 0 409600/948232 (43%) - Loss: 9.249337722394557e-07
Epoch: 0 512000/948232 (54%) - Loss: 3.131585515347979e-07
Epoch: 0 614400/948232 (65%) - Loss: 1.063759214048332e-06
Epoch: 0 716800/948232 (76%) - Loss: 1.7671978014277556e-07
Epoch: 0 819200/948232 (86%) - Loss: 1.2997817577797832e-07
Epoch: 0 921600/948232 (97%) - Loss: 2.5373151402163785e-07
Epoch: 1 0/948232 (0%) - Loss: 2.082680481407806e-07
Epoch: 1 102400/948232 (11%) - Loss: 1.1746323025363381e-07
Epoch: 1 204800/948232 (22%) - Loss: 2.452962917232071e-07
Epoch: 1 307200/948232 (32%) - Loss: 3.387694746948e-08
Epoch: 1 409600/948232 (43%) - Loss: 3.521573432863079e-08
Epoch: 1 512000/948232 (54%) - Loss: 1.3469446003000485e-07
Epoch: 1 614400/948232 (65%) - Loss: 3.15485912949498e-08
Epoch: 1 

In [12]:
# Score the unlabelled pairs in a single forward pass.
# - eval() puts the model in inference mode (later training cells switch it
#   back with model.train()).
# - The id tensors are created on the device the model's parameters live on,
#   so the call works whether the model is on CPU or GPU.
# - The result is brought back to the CPU for writing the submission.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    eval_id1 = torch.tensor(data_eval['id1'].to_numpy(), dtype=torch.int, device=model_device)
    eval_id2 = torch.tensor(data_eval['id2'].to_numpy(), dtype=torch.int, device=model_device)
    predictions = model(eval_id1, eval_id2).cpu()
    

In [13]:
# Fill the sample submission with hard 0/1 labels and write it out.
submission = pd.read_csv('sample_submission.csv', delimiter=',')

# Convert explicitly to a 1-D NumPy integer array before assigning: handing
# pandas a torch tensor relies on implicit element-wise conversion and does
# not work for tensors that live on a GPU.
predicted_labels = torch.round(predictions).detach().cpu().numpy().astype('int').reshape(-1)
submission['label'] = predicted_labels
submission.to_csv('out_retrained.csv',index=False)

In [14]:
# Persist only the learned parameters (the state dict), not the whole module.
retrained_weights = model.state_dict()
torch.save(retrained_weights, "model_retrained.pt")

# Refined Training

In [15]:
# Fine-tuning pass: a fresh Adam optimizer with a 10x smaller learning rate,
# then one more epoch over the same loader.
optimizer = optim.Adam(model.parameters(), lr=0.0001)

model.train()
for epoch in range(1):
    for batch_idx, (id1, id2, link) in enumerate(train_loader):
        id1 = id1.to(device)
        id2 = id2.to(device)
        link = link.to(device)

        optimizer.zero_grad()
        output = model(id1.int(), id2.int())

        # BCELoss needs a floating-point target.
        loss = criterion(output, link.float())
        loss.backward()
        optimizer.step()

        # Report progress on every 100th mini-batch only.
        if batch_idx % 100 != 0:
            continue
        print(f'Epoch: {epoch} {batch_idx * len(id1)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%) - Loss: {loss.item()}')
            
            
            

Epoch: 0 0/948232 (0%) - Loss: 4.970949873950303e-08
Epoch: 0 102400/948232 (11%) - Loss: 2.3283064365386963e-10
Epoch: 0 204800/948232 (22%) - Loss: 9.313227966600834e-10
Epoch: 0 307200/948232 (32%) - Loss: 2.3283064365386963e-10
Epoch: 0 409600/948232 (43%) - Loss: 1.0477382295093207e-09
Epoch: 0 512000/948232 (54%) - Loss: 1.688022943646672e-09
Epoch: 0 614400/948232 (65%) - Loss: 2.3283064365386963e-10
Epoch: 0 716800/948232 (76%) - Loss: 5.820766091346741e-11
Epoch: 0 819200/948232 (86%) - Loss: 1.4551917448812901e-09
Epoch: 0 921600/948232 (97%) - Loss: 4.88945106624783e-09


In [16]:
# Score the unlabelled pairs again with the fine-tuned model.
# - eval() puts the model in inference mode.
# - The id tensors are created on the device the model's parameters live on,
#   so the call works whether the model is on CPU or GPU.
# - The result is brought back to the CPU for writing the submission.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    eval_id1 = torch.tensor(data_eval['id1'].to_numpy(), dtype=torch.int, device=model_device)
    eval_id2 = torch.tensor(data_eval['id2'].to_numpy(), dtype=torch.int, device=model_device)
    predictions = model(eval_id1, eval_id2).cpu()
    

In [17]:
# Fill the sample submission with the fine-tuned model's hard 0/1 labels.
submission = pd.read_csv('sample_submission.csv', delimiter=',')

# Convert explicitly to a 1-D NumPy integer array before assigning: handing
# pandas a torch tensor relies on implicit element-wise conversion and does
# not work for tensors that live on a GPU.
predicted_labels = torch.round(predictions).detach().cpu().numpy().astype('int').reshape(-1)
submission['label'] = predicted_labels
submission.to_csv('out_retrained_refined.csv',index=False)

In [18]:
# Persist the fine-tuned parameters (state dict only).
# NOTE(review): the file name reads "retraine" rather than "retrained"; it is
# left unchanged here — confirm whether anything loads the file by this name.
refined_weights = model.state_dict()
torch.save(refined_weights, "model_retraine_refined.pt")