## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) (the Databricks File System) lets you store data for querying inside of Databricks. This notebook assumes that the file you would like to read is already in DBFS.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# Where the uploaded file lives in DBFS, and which reader format to use
file_location = "/FileStore/tables/test_gnn.csv"
file_type = "csv"

# Reader settings; they only have an effect when the format is CSV
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Configure the reader and load the file into a Spark DataFrame.
# With inferSchema disabled, the columns are not type-inferred at load time.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

display(df)

author1,author2,common_neighbors,total_neighbors,preferential_attachment,jaccard,adamic_adar,resource_allocation,leicht_holme_nerman,sorensen_index,salton_cosine_similarity,hub_promoted,hub_depressed,target
2137410377,248131350,0,52,576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137434199,563504378,0,51,230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137503162,2166681051,0,61,774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137806763,2810326837,0,11,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137812506,2229290118,0,6,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137937141,2218871432,0,14,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2137994469,2156486240,0,79,1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2138130113,2414494727,0,12,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2138265084,2692611460,0,10,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2138645338,2665943088,0,31,84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [0]:
# Let the edge become the node: each row (an author1/author2 pair) is handled as one sample.
# df.cache()  # caching is left disabled
# Convert the Spark DataFrame to a pandas DataFrame and display it as the cell output.
df.toPandas()

Unnamed: 0,author1,author2,common_neighbors,total_neighbors,preferential_attachment,jaccard,adamic_adar,resource_allocation,leicht_holme_nerman,sorensen_index,salton_cosine_similarity,hub_promoted,hub_depressed,target
0,2137410377,248131350,0,52,576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2137434199,563504378,0,51,230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2137503162,2166681051,0,61,774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2137806763,2810326837,0,11,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2137812506,2229290118,0,6,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,2137937141,2218871432,0,14,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,2137994469,2156486240,0,79,1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,2138130113,2414494727,0,12,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,2138265084,2692611460,0,10,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,2138645338,2665943088,0,31,84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [0]:
# Spark DataFrame -> pandas DataFrame -> NumPy array.
# The CSV was read with inferSchema="false", so the values arrive as strings;
# astype(float) parses every column into floating-point numbers.
train_dataset = df.toPandas().to_numpy().astype(float)

In [0]:
# Sanity check: element type of the first entry, followed by the array dimensions
print(type(train_dataset[0, 0]), train_dataset.shape, sep="\n")

In [0]:
# split the train dataset to train and dev
# mark the edge feature with label "0" or "1"
import numpy as np
def split_train(data, test_ratio, seed=None):
    """Randomly partition the rows of ``data`` into a training part and a held-out part.

    Args:
        data: Sequence that supports ``len()`` and indexing with an integer
            index array (for example a NumPy array).
        test_ratio: Fraction of rows to hold out, between 0 and 1 inclusive.
            ``int(len(data) * test_ratio)`` rows end up in the held-out part.
        seed: Optional integer seed. When given, the shuffle is reproducible
            and NumPy's global random state is left untouched. When ``None``
            the global NumPy random state is used.

    Returns:
        A ``(train_rows, test_rows)`` tuple; together they contain every row
        of ``data`` exactly once.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would silently invert the two slices below, and a
    # ratio above 1 would leave the training part empty; reject both.
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))

    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data[train_indices], data[test_indices]
# Hold out 25% of the rows (rounded down) for evaluation; the rest is used for training.
train_data, test_data = split_train(train_dataset, 0.25)

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Expose a notebook widget that lets the user opt in to GPU training.
dbutils.widgets.removeAll()
dbutils.widgets.dropdown('USE_GPU', 'no', ['no', 'yes'])
# The flag is True only when the user selected 'yes'. The previous comparison
# against 'no' inverted the meaning of the widget.
USE_GPU = dbutils.widgets.get('USE_GPU') == 'yes'

# Convert both splits to float32 tensors. torch.as_tensor accepts a NumPy array
# as well as an existing tensor, so re-running this cell does not fail the way
# `tensor.astype(...)` would on a second execution.
# Note: float32 cannot represent the large author-id values in the first two
# columns exactly; the loaders further down slice those columns off.
train_data = torch.as_tensor(train_data, dtype=torch.float32)
test_data = torch.as_tensor(test_data, dtype=torch.float32)

In [0]:
# Inspect a single entry (row 0, column 3) of the training tensor: its type, then its value
print(type(train_data[0][3]))
train_data[0][3]

In [0]:
from torch.utils.data import Dataset, TensorDataset
# Kept for backward compatibility; the loaders below do not read it.
MNIST_DIR = '/tmp/data/mnist'
# CUDA is used only when the user asked for it AND a device is actually available.
use_cuda = USE_GPU and torch.cuda.is_available()

# Hyper-parameters and runtime switches shared by the training/evaluation code.
Params = namedtuple('Params', ['batch_size', 'test_batch_size', 'epochs', 'lr', 'momentum', 'seed', 'cuda', 'log_interval'])
args = Params(batch_size=200, test_batch_size=10000, epochs=100, lr=0.01, momentum=0.5, seed=1, cuda=use_cuda, log_interval=200)
torch.manual_seed(args.seed)

# Column layout of each row: [author1, author2, <features...>, target].
# Columns 2..-2 are the model inputs, the last column is the class label.
train_loader = torch.utils.data.DataLoader(
    TensorDataset(train_data[:, 2:-1], train_data[:, -1].long()),
    batch_size=args.batch_size, shuffle=True, num_workers=1)
# Evaluation uses the dedicated test batch size and a fixed order; shuffling
# has no benefit when only aggregate metrics are computed.
test_loader = torch.utils.data.DataLoader(
    TensorDataset(test_data[:, 2:-1], test_data[:, -1].long()),
    batch_size=args.test_batch_size, shuffle=False, num_workers=1)

In [0]:
class Net(nn.Module):
    """Small fully connected binary classifier.

    The network maps 11 input features to 50 hidden units (ReLU) and then to
    2 class scores, which ``forward`` returns as log-probabilities.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(11, 50)
        self.fc2 = nn.Linear(50, 2)

    def forward(self, x):
        """Return per-class log-probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has 11 features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 2 holding log-probabilities.
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated and emits a warning.
        return F.log_softmax(x, dim=-1)

    def predict(self, x):
        """Return the predicted class index (0 or 1) for every row of ``x``.

        Class 0 is chosen only when its score is strictly greater than the
        score of class 1; otherwise (including ties) class 1 is returned.

        Args:
            x: Float tensor of shape ``(batch, 11)``.

        Returns:
            CPU ``int64`` tensor of shape ``(batch,)``.
        """
        log_probs = self.forward(x)
        # Comparing log-probabilities gives the same decision as comparing
        # probabilities, so no extra softmax is needed.
        class0_wins = (log_probs[..., 0] > log_probs[..., 1]).long()
        return (1 - class0_wins).cpu()
      
# Instantiate the classifier that the training loop below optimises.
model = Net()
# share_memory() is called on the module's existing tensors only.
# NOTE(review): nothing in this notebook spawns worker processes that train the
# model, so this call looks like a leftover from a multi-process example - confirm.
model.share_memory() # gradients are allocated lazily, so they are not shared here

In [0]:

def train_epoch(epoch, args, model, data_loader, optimizer):
    """Run one optimisation pass over ``data_loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
        args: Object with a boolean ``cuda`` attribute (move batches to the
            GPU when true) and an integer ``log_interval`` attribute (print
            progress every that many batches).
        model: Module whose forward pass returns log-probabilities, as
            expected by ``F.nll_loss``.
        data_loader: Iterable of ``(data, target)`` batches that also exposes
            ``len()`` and a ``dataset`` attribute.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        None. Progress is reported on stdout.
    """
    model.train()
    num_batches = len(data_loader)
    num_samples = len(data_loader.dataset)
    for batch_idx, (data, target) in enumerate(data_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        # Tensors take part in autograd directly; the Variable wrapper is not needed.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), num_samples,
                100. * batch_idx / num_batches, loss.item()))


def test_epoch(model, data_loader):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in data_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()      
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target, size_average=False).data.item() # sum up batch loss
        pred = output.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss /= len(data_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(data_loader.dataset),
        100. * correct / len(data_loader.dataset)))

def eval(net, testloader):
    """Print per-class recall, precision and F1 plus overall accuracy (in percent).

    Note: the name shadows Python's built-in ``eval``; it is kept because
    callers in this notebook use it.

    Args:
        net: Module whose forward pass returns scores of shape ``(batch, 2)``.
        testloader: Iterable of ``(inputs, targets)`` batches with integer
            class targets in ``{0, 1}``.

    Returns:
        None. Four lines (recall, precision, F1, accuracy) are written to
        stdout. A metric whose denominator is zero (for example the precision
        of a class that was never predicted) is reported as 0 instead of NaN.
    """
    net.eval()
    # Follow the model's device instead of reading a notebook-level flag.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else None

    classnum = 2
    target_num = torch.zeros((1, classnum))   # samples per true class
    predict_num = torch.zeros((1, classnum))  # samples per predicted class
    acc_num = torch.zeros((1, classnum))      # correctly predicted samples per class

    # Disable autograd for evaluation; `volatile=True` no longer has any effect.
    with torch.no_grad():
        for inputs, targets in testloader:
            if device is not None:
                inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)

            # One-hot encode predictions and targets; their element-wise
            # product marks the correctly classified samples of each class.
            pre_mask = torch.zeros(outputs.size()).scatter_(1, predicted.cpu().view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(outputs.size()).scatter_(1, targets.cpu().view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)

    # Counts are whole numbers, so clamping a zero denominator to 1 only
    # affects the 0/0 case (numerator is 0 there as well) and yields 0.
    recall = acc_num / target_num.clamp(min=1)
    precision = acc_num / predict_num.clamp(min=1)
    # recall + precision is either 0 or far above the clamp threshold.
    F1 = 2 * recall * precision / (recall + precision).clamp(min=1e-12)
    accuracy = acc_num.sum(1) / target_num.sum(1).clamp(min=1)

    recall = (recall.numpy()[0]*100).round(3)
    precision = (precision.numpy()[0]*100).round(3)
    F1 = (F1.numpy()[0]*100).round(3)
    accuracy = (accuracy.numpy()[0]*100).round(3)

    print('recall'," ".join('%s' % id for id in recall))
    print('precision'," ".join('%s' % id for id in precision))
    print('F1'," ".join('%s' % id for id in F1))
    print('accuracy',accuracy)
    
# Training driver: optimise for the configured number of epochs and report
# held-out metrics after every epoch.
if args.cuda:
    # Move the parameters to the GPU before the optimizer is created from them
    model = model.cuda()

optimizer = optim.SGD(
    model.parameters(),
    momentum=args.momentum,
    lr=args.lr,
)
for epoch in range(1, args.epochs + 1):
    train_epoch(epoch=epoch, args=args, model=model,
                data_loader=train_loader, optimizer=optimizer)
    test_epoch(model=model, data_loader=test_loader)
    eval(net=model, testloader=test_loader)