<a href="https://colab.research.google.com/github/HowardQian201/2022RiceDatathon/blob/main/BILL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import libraries
import pandas as pd
import numpy
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
import csv
import torchaudio
import json
import random
from collections import Counter

In [None]:
# create graph dataset
#
# Builds `train_data`, a list of (features, label) pairs where
#   features = float tensor [same_page_type (0/1), number of shared feature tokens]
#   label    = numpy.float32(1) if the two nodes are linked in the training graph, else 0.
DATA_DIR = "/content"       # directory holding the input files; change here to relocate them
NUM_SAMPLED_NODES = 360     # nodes whose ordered pairs supply the sampled examples
NUM_STOP_WORDS = 50         # most frequent feature tokens dropped before counting overlap
NODE_SAMPLE_SEED = 0        # fixed seed so the sampled node subset is reproducible


def make_pair_example(id1, id2, label):
    """Build one (features, label) example for the node pair (id1, id2).

    Args:
        id1, id2: string node ids, used as keys of `page_types` and `node_features`.
        label: 1 if the pair is an edge, 0 otherwise.

    Returns:
        (float tensor [same_page_type, shared_feature_count], numpy.float32(label)).
    """
    same_type = 1 if page_types[id1] == page_types[id2] else 0
    shared_feature_count = len(node_features[id1].intersection(node_features[id2]))
    return (torch.tensor([same_type, shared_feature_count]).float(), numpy.float32(label))


training_graph = pd.read_csv(f"{DATA_DIR}/training_graph.csv").values
training_set = set(map(tuple, training_graph))

isolated_nodes = pd.read_csv(f"{DATA_DIR}/isolated_nodes.csv").values
isolated_nodes = set(isolated_nodes.flatten())

# random.sample() requires a sequence (passing a set raises TypeError on Python >= 3.11).
# Sorting also makes the draw independent of set iteration order, and a private Random
# instance keeps the sample reproducible without touching the global RNG state.
candidate_nodes = sorted(set(training_graph.flatten()).difference(isolated_nodes))
node_set = random.Random(NODE_SAMPLE_SEED).sample(
    candidate_nodes, min(NUM_SAMPLED_NODES, len(candidate_nodes))
)

test_graph = torch.tensor(pd.read_csv(f"{DATA_DIR}/test_edges.csv").values)
# read_csv(squeeze=True) was removed in pandas 2.0; squeezing the frame afterwards is equivalent.
page_types = (
    pd.read_csv(f"{DATA_DIR}/node_classification.csv", header=None, index_col=0)
    .squeeze("columns")
    .to_dict()
)
# presumably the file's header row, which header=None reads as a data row -- confirm
page_types.pop("id")

with open(f"{DATA_DIR}/node_features_text.json") as f:
    node_features = json.load(f)

# Count how often each feature token occurs across all nodes.
word_counts = Counter()
for node in node_features.values():
    word_counts.update(node)

# Tokens to discard. This must be a real set of tokens: the previous
# `zip(*most_common(50))` was a one-shot iterator of two tuples, so the membership
# test never matched and no token was ever removed.
delete = {word for word, _ in word_counts.most_common(NUM_STOP_WORDS)}

# Rebuild each node's features as a set without the frequent tokens (building a new
# set avoids removing from a list while iterating over it, which skips elements).
for key in node_features:
    node_features[key] = {num for num in node_features[key] if num not in delete}

# [ (features, label), (features, label), ... ]
train_data = []

# Every ordered pair of distinct sampled nodes, labelled by whether the pair is a
# training edge in either direction.
for node1 in node_set:
    for node2 in node_set:
        if node1 == node2:
            continue
        linked = (node1, node2) in training_set or (node2, node1) in training_set
        train_data.append(make_pair_example(str(node1), str(node2), 1 if linked else 0))

# Every training edge as a positive example.
# NOTE(review): edges between two sampled nodes are therefore added here as well as in
# the pair loop above -- confirm the duplication is intended.
for edge in training_graph:
    train_data.append(make_pair_example(str(edge[0].item()), str(edge[1].item()), 1))

# finished creating training data

# Number of positive examples (handy for checking class balance).
counter = sum(1 for _, label in train_data if label == 1)
#print(counter)


FileNotFoundError: ignored

In [None]:
# Build the evaluation examples in the same (features, label) layout as train_data:
# features = [same page type (0/1), shared feature count], label taken from test_labels.
test_labels = pd.read_csv("/content/test_labels.csv").values
test_edges = pd.read_csv("/content/test_edges.csv").values

test_data = []
for row_idx, edge_row in enumerate(test_edges):
    # node_features / page_types are keyed by the string form of the node id
    src_id, dst_id = str(edge_row[0]), str(edge_row[1])
    overlap = len(node_features[src_id].intersection(node_features[dst_id]))
    same_type = 1 if page_types[src_id] == page_types[dst_id] else 0
    pair_features = torch.tensor([same_type, overlap]).float()
    test_data.append((pair_features, numpy.float32(test_labels[row_idx][0])))

In [None]:
# reserving 1000 as validation data
print(len(train_data))
train_ds, val_ds = random_split(train_data, [len(train_data) - 1000, 1000])
print(type(train_data[0][1]))
batch_size = 32

train_loader = DataLoader(train_ds, batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size*2, num_workers=2, pin_memory=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size*2, num_workers=2, pin_memory=True, drop_last=True)

261278
<class 'numpy.float32'>


In [None]:
class TeacherModel(nn.Module):
    """Single linear layer followed by ReLU, mapping 2 input features to 1 score.

    Trained with mean-squared error against the batch labels.

    NOTE(review): with ReLU on the output, an example whose pre-activation is negative
    contributes no gradient -- confirm this activation is intended.
    """

    def __init__(self):
        super().__init__()

        # TODO: revisit weight initialization (default nn.Linear init is used for now).
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        """Return ReLU(linear(x)) with shape (batch, 1) for `x` of shape (batch, 2)."""
        out = x.view(x.size(0), x.size(1))

        out = self.linear(out)
        out = F.relu(out)

        return out

    def _batch_loss(self, inputs, labels):
        """MSE between the flattened predictions for `inputs` and `labels`."""
        out = torch.flatten(self(inputs))  # (batch, 1) -> (batch,) to match labels
        return F.mse_loss(out, labels)

    def training_step(self, batch):
        """Return the differentiable MSE loss for one `(inputs, labels)` batch."""
        images, labels = batch
        return self._batch_loss(images, labels)

    def validation_step(self, batch):
        """Return the detached MSE loss for one `(inputs, labels)` batch.

        Returns:
            dict with 'val_loss' (0-dim tensor, no grad) and 'batch_size' (int), the
            latter letting the epoch average weight batches of different sizes.
        """
        images, labels = batch
        # No graph is needed for validation; skip autograd bookkeeping.
        with torch.no_grad():
            loss = self._batch_loss(images, labels)
        return {'val_loss': loss.detach(), 'batch_size': len(labels)}

    def validation_epoch_end(self, outputs):
        """Combine per-batch results into one epoch loss.

        Each batch loss is weighted by its 'batch_size' (1 when the key is absent), so
        the result is the per-sample mean even if the last batch is smaller.

        Returns:
            dict with 'val_loss' as a Python float.
        """
        batch_losses = torch.stack([x['val_loss'] for x in outputs])
        weights = torch.tensor(
            [x.get('batch_size', 1) for x in outputs],
            dtype=batch_losses.dtype,
            device=batch_losses.device,
        )
        epoch_loss = (batch_losses * weights).sum() / weights.sum()
        return {'val_loss': epoch_loss.item()}

    def calc_acc(self, predictions):
        """Placeholder: accuracy is not implemented; only prints a marker."""
        print('here')

    def epoch_end(self, epoch, result):
        """Print the validation loss recorded for `epoch`."""
        print("Epoch [{}], val_loss: {:.4f}".format(epoch, result['val_loss']))

In [None]:
def evaluate(model, val_loader):
    """Run `model.validation_step` on every batch of `val_loader` and aggregate.

    Gradient tracking is disabled for the whole pass since evaluation never
    backpropagates.

    Returns:
        Whatever `model.validation_epoch_end` returns for the list of step outputs.
    """
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.Adam):
    """Train `model` for `epochs` passes over `train_loader`, validating after each.

    Args:
        epochs: number of training passes.
        lr: learning rate handed to the optimizer.
        model: object providing `parameters`, `training_step` and `epoch_end`.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches, passed to `evaluate`.
        opt_func: optimizer factory called as `opt_func(model.parameters(), lr)`.

    Returns:
        List with one `evaluate` result per epoch, in order.
    """
    optimizer = opt_func(model.parameters(), lr)

    def run_training_pass():
        # One optimizer update per batch; gradients are cleared after each update.
        for batch in train_loader:
            batch_loss = model.training_step(batch)
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    history = []
    for epoch in range(epochs):
        run_training_pass()
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result)
        history.append(epoch_result)
    return history

In [None]:
# Training configuration.
learning_rate = 0.1
epochs = 10

model = TeacherModel()

# The untrained model's validation loss is the first history entry (the baseline).
baseline_result = evaluate(model, val_loader)
history = [baseline_result]
print(history)
history.extend(fit(epochs, learning_rate, model, train_loader, val_loader))

# Loss of the trained model on the test loader.
test_result = evaluate(model, test_loader)
print(test_result)

[{'val_loss': 0.36310136318206787}]
Epoch [0], val_loss: 0.1359
Epoch [1], val_loss: 0.1367
Epoch [2], val_loss: 0.1414
Epoch [3], val_loss: 0.1345
Epoch [4], val_loss: 0.1384
Epoch [5], val_loss: 0.1482
Epoch [6], val_loss: 0.1331
Epoch [7], val_loss: 0.1490
Epoch [8], val_loss: 0.1641
Epoch [9], val_loss: 0.1451
{'val_loss': 0.14541548490524292}
