<a href="https://colab.research.google.com/github/Lamiaka/GNN_amld2022/blob/main/Solution_to_Single_Graph_%7C_AMLD_GNNs_for_Structured_Data_Workshop_by_Volodya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# README

## A solution to the Exercise: Single Graph

This is an example Colab for the *Graph Neural Networks for Structured Data* workshop @ AMLD 2021, by volodya@google.com.

Link to this colab: https://bit.ly/gnn-str-ex-3-sol



## Loading, splitting and preparing the data

This is the same as in https://bit.ly/gnn-str-ex-2-solution

In [None]:
# Install spektral (imported by the cells below) together with pinned
# folium / imgaug versions.
# NOTE(review): the pins presumably keep the Colab image's preinstalled
# packages compatible - confirm they are still needed.
!pip install folium==0.8.3 "imgaug<0.2.7,>=0.2.5" spektral

# Download the stocks dataset and store it under the file name that the
# data-loading cell reads with `pd.read_parquet`.
!wget 'https://drive.google.com/uc?id=1m3ACccA3GLg5GFg0D9khUNnt_c10gBAK&export=download' -O stocks-with-label_3days.prq

# NOTE: On the first run you will get an error asking you to restart the runtime - click the `Restart Runtime` button and then rerun the cells.

Collecting imgaug<0.2.7,>=0.2.5
  Downloading imgaug-0.2.6.tar.gz (631 kB)
[K     |████████████████████████████████| 631 kB 5.1 MB/s 
[?25hCollecting spektral
  Downloading spektral-1.0.8-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 47.2 MB/s 
Collecting numpy
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 41.2 MB/s 
Collecting tensorflow>=2.1.0
  Downloading tensorflow-2.7.1-cp37-cp37m-manylinux2010_x86_64.whl (495.0 MB)
[K     |████████████████████████████████| 495.0 MB 27 kB/s 
Collecting keras<2.8,>=2.7.0rc0
  Downloading keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 39.4 MB/s 
Collecting tensorflow-estimator<2.8,~=2.7.0rc0
  Downloading tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
[K     |████████████████████████████████| 463 kB 52.7 MB/s 
Collecting gast<0.5.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.

--2022-03-26 23:08:06--  https://drive.google.com/uc?id=1m3ACccA3GLg5GFg0D9khUNnt_c10gBAK&export=download
Resolving drive.google.com (drive.google.com)... 142.250.98.138, 142.250.98.102, 142.250.98.139, ...
Connecting to drive.google.com (drive.google.com)|142.250.98.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-5c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/a02qsotlvbehi99ngqcggq41u749ef69/1648336050000/01333786639642341364/*/1m3ACccA3GLg5GFg0D9khUNnt_c10gBAK?e=download [following]
--2022-03-26 23:08:07--  https://doc-08-5c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/a02qsotlvbehi99ngqcggq41u749ef69/1648336050000/01333786639642341364/*/1m3ACccA3GLg5GFg0D9khUNnt_c10gBAK?e=download
Resolving doc-08-5c-docs.googleusercontent.com (doc-08-5c-docs.googleusercontent.com)... 173.194.214.132, 2607:f8b0:400c:c32::84
Connecting to doc-08-5c-docs.googleusercontent.com (doc-08-5c-doc

In [None]:
import tensorflow as tf
from datetime import date
import pandas as pd
import numpy as np
import os

from typing import Dict, List, Text

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler

from scipy.sparse import csr_matrix

import spektral

import tqdm

# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [None]:
# Load the dataset file fetched by the download cell.
stocks_df = pd.read_parquet('stocks-with-label_3days.prq')

# Replace the ticker values by their integer category codes. `ticker_column`
# keeps the categorical so codes can be mapped back to the original values.
ticker_column = pd.Categorical(stocks_df['ticker'])
stocks_df['ticker'] = ticker_column.codes

# Chronological split boundaries, referenced as @valid_dt / @test_dt below.
split_config = {
    'valid_dt': pd.Timestamp('2015-01-01'),
    'test_dt': pd.Timestamp('2020-01-01'), # 2020 onwards goes into test.
}

# train: before valid_dt | valid: [valid_dt, test_dt) | test: test_dt onwards.
# Each split is copied so later in-place edits do not touch `stocks_df`.
train_df = stocks_df.query('Date < @valid_dt', local_dict=split_config).copy()
valid_df = stocks_df.query('Date >= @valid_dt and Date < @test_dt', local_dict=split_config).copy()
test_df = stocks_df.query('Date >= @test_dt', local_dict=split_config).copy()

print(f'Train: {len(train_df)} Valid: {len(valid_df)} Test: {len(test_df)}')

# Every column except the date, the ticker code and the label gets standardized.
columns_to_scale = list(set(train_df.columns).difference({'Date', 'ticker', 'LABEL'}))

std_scaler = StandardScaler()

def convert_df(df, fit_scaler=False):
  """Fill gaps, standardize the feature columns and downcast them to float32.

  The frame is modified in place and also returned.

  Args:
    df: Frame containing at least the columns listed in the module-level
      `columns_to_scale`.
    fit_scaler: If True, fit the module-level `std_scaler` on `df` before
      transforming; otherwise reuse the statistics it was already fitted with.

  Returns:
    The same `df` object, with missing values replaced by -1 and the
    `columns_to_scale` columns standardized and stored as float32.
  """
  # Missing values become the sentinel -1 before scaling, so they take part in
  # the scaler statistics like any other value.
  df.fillna(-1, inplace=True)

  scaler_func = std_scaler.fit_transform if fit_scaler else std_scaler.transform
  scaled = np.asarray(scaler_func(df[columns_to_scale]), dtype='float32')

  # The previous `df[c].astype('float32', copy=False)` discarded its result, so
  # nothing was ever downcast. Assigning column by column replaces each column
  # together with its dtype. Only the scaled feature columns are cast; every
  # other column (e.g. LABEL) keeps its dtype and exact values.
  for position, column in enumerate(columns_to_scale):
    df[column] = scaled[:, position]

  return df


# Only the training split fits the scaler; validation and test are transformed
# with the statistics learned from the training split.
train_df = convert_df(train_df, fit_scaler=True)
valid_df, test_df = (
    convert_df(split, fit_scaler=False) for split in (valid_df, test_df)
)

train_df.head(10)

Train: 139260 Valid: 28934 Test: 8970


Unnamed: 0,ticker,Date,LABEL,High,Low,Close,Open,High_1,Low_1,Close_1,Open_1
147348,5,1991-04-01,1.03,-0.36,-0.35,-0.35,-0.35,-0.35,-0.35,-0.35,-0.35
23857,13,2011-08-24,1.05,0.12,0.1,0.11,0.12,0.12,0.09,0.13,0.09
100623,12,2006-12-05,1.0,-0.07,-0.07,-0.07,-0.07,-0.07,-0.07,-0.07,-0.07
99527,12,2002-07-31,0.94,-0.2,-0.2,-0.2,-0.2,-0.19,-0.2,-0.19,-0.2
6641,16,1999-06-08,1.06,0.38,0.39,0.39,0.39,0.38,0.39,0.38,0.38
150940,5,2005-06-28,0.97,-0.08,-0.08,-0.08,-0.08,-0.08,-0.08,-0.08,-0.08
149439,5,1999-07-09,1.02,-0.08,-0.08,-0.08,-0.07,-0.08,-0.07,-0.07,-0.07
16626,19,2002-02-06,1.01,-0.31,-0.31,-0.31,-0.31,-0.31,-0.31,-0.31,-0.31
155209,10,1981-02-03,1.02,-0.46,-0.46,-0.46,-0.49,-0.47,-0.46,-0.47,-0.49
88989,9,2005-04-21,1.01,0.35,0.36,0.35,0.35,0.36,0.36,0.35,0.36


##  Preparing a Single Graph

In [None]:
# Node features: the ticker code followed by the price columns, as float32.
feature_dtype = np.float32
feature_names = [
    'ticker',
    'High', 'Low', 'Close', 'Open',
    'High_1', 'Low_1', 'Close_1', 'Open_1',
]

def get_features(example: Dict):
  """Return the feature vector of one record, ordered as `feature_names`.

  Args:
    example: Mapping from column name to value; it must contain every name in
      `feature_names` (extra keys are ignored).

  Returns:
    A 1-D array of dtype `feature_dtype` with one entry per feature name.
  """
  values = []
  for name in feature_names:
    values.append(example[name])
  return np.array(values, dtype=feature_dtype)

def get_a(df):
  """Build the adjacency matrix that links all rows sharing a date.

  Node i is the i-th row of `df` (by position). There is an edge (i, j)
  whenever the index label of row i equals the `Date` value of row j. With
  `Date` as the index, the rows of each date are fully connected to each
  other, self-loops included.

  Args:
    df: Frame with a `Date` column and an index holding the same dates, e.g.
      the result of `df.set_index('Date', drop=False)`.

  Returns:
    A `scipy.sparse.csr_matrix` of shape (len(df), len(df)) holding a 1 for
    every edge.
  """
  n_nodes = len(df)
  positions = np.arange(n_nodes)

  # Plain arrays side-step the ambiguity of `Date` being both the index name
  # and a column name.
  sources = pd.DataFrame({'key': df.index.to_numpy(), 'row': positions})
  targets = pd.DataFrame({'key': df['Date'].to_numpy(), 'col': positions})

  # One inner join on the date enumerates every (row, col) pair. This replaces
  # scanning the whole frame once per row, which was quadratic in the number
  # of rows.
  pairs = sources.merge(targets, on='key', how='inner')

  data = np.repeat(1, len(pairs))
  edges = (pairs['row'].to_numpy(), pairs['col'].to_numpy())
  # The explicit shape keeps the matrix square even if trailing rows had no match.
  return csr_matrix((data, edges), shape=(n_nodes, n_nodes))

class StocksSingleGraphDataset(spektral.data.Dataset):
  """Dataset holding one graph whose nodes are the rows of a stocks frame.

  Rows become nodes (ordered by `Date`), rows sharing a date are connected
  (see `get_a`), and every node carries a boolean label.
  """

  def __init__(self, df: pd.DataFrame, **kwargs):
    """Stores the frame; it must be set before the base class initializer runs.

    Args:
      df: Frame with a `Date` column, a `LABEL` column and every column named
        in `feature_names`.
      **kwargs: Forwarded to `spektral.data.Dataset`.
    """
    self.df = df
    super().__init__(**kwargs)

  def read(self):
    """Builds the single graph.

    Returns:
      A one-element list with a `spektral.data.Graph` whose `x` has shape
      (n_nodes, len(feature_names)), whose `a` is the date adjacency and whose
      `y` has shape (1, n_nodes).
    """
    # Sort once and derive x, a and y from this same frame. Previously x was
    # built from `self.df` in its original row order while a and y used the
    # date-sorted order, so node features did not line up with their labels
    # and edges.
    df = self.df.sort_values('Date').set_index('Date', drop=False)

    nodes = [get_features(example) for example in tqdm.tqdm(df.to_dict('records'))]
    x = np.array(nodes, dtype=feature_dtype).reshape(len(nodes), len(feature_names))

    a = get_a(df)
    e = None # No edge features.
    # No longer a graph-level label but one label per node: True where LABEL
    # exceeds 1.01. Kept as a single row of n_nodes entries.
    y = (df['LABEL'] > 1.01).values.reshape( (1, len(nodes)) )
    return [spektral.data.Graph(x=x, a=a, y=y, e=e)]

# All three splits live in one graph, concatenated in train / valid / test order.
all_splits = [train_df, valid_df, test_df]
one_ds = StocksSingleGraphDataset(pd.concat(all_splits, axis='rows'))

# Sanity check: every row of every split became a node.
expected_n_nodes = sum(len(split) for split in all_splits)
assert one_ds[0].n_nodes == expected_n_nodes

100%|██████████| 177164/177164 [00:01<00:00, 149961.99it/s]


## Model training

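Note the `sample_weights` argument passed to the loaders: every loader serves the same single graph, and the 0/1 mask returned by `get_mask` marks which split's nodes are weighted.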

In [None]:
from spektral.models.general_gnn import GeneralGNN
from spektral.models.gcn import GCN
from spektral.data.loaders import DisjointLoader, SingleLoader, MixedLoader
from tensorflow.keras.losses import BinaryCrossentropy

# The first argument (the dataset's `n_labels`) sets the model's output size.
model = GeneralGNN(
    one_ds.n_labels,
    activation='sigmoid',
    hidden=32,
    hidden_activation='relu',
    aggregate='mean',
    pool='avg',
    dropout=.5)

# The sigmoid output is already a probability, hence from_logits=False.
model.compile(
    loss=BinaryCrossentropy(from_logits=False),
    optimizer='adam')

def get_mask(df):
  """Return a 0/1 vector over train + valid + test rows selecting one split.

  Args:
    df: One of the module-level `train_df`, `valid_df` or `test_df`. The split
      is matched by object identity, so a copy of a split matches nothing and
      yields an all-zero mask.

  Returns:
    A 1-D integer array of length len(train_df) + len(valid_df) + len(test_df)
    that is 1 on the segment belonging to `df` and 0 elsewhere.
  """
  segments = []
  for split in (train_df, valid_df, test_df):
    flag = 1 if split is df else 0
    segments.append(np.repeat(flag, len(split)))
  return np.concatenate(segments)


# Both loaders serve the same graph; only the sample weights differ, so each
# one weights the nodes of its own split.
train_loader, valid_loader = (
    SingleLoader(one_ds, sample_weights=get_mask(split))
    for split in (train_df, valid_df)
)

model.fit(
    x=train_loader.load(),
    validation_data=valid_loader.load(),
    epochs=16,
    steps_per_epoch=train_loader.steps_per_epoch,
    validation_steps=valid_loader.steps_per_epoch)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.History at 0x7f82b3ee8710>

In [None]:
# Evaluate on the same graph, this time weighting only the test split's nodes.
test_loader = SingleLoader(one_ds, sample_weights=get_mask(test_df))

model.evaluate(
    x=test_loader.load(),
    steps=test_loader.steps_per_epoch)



0.03489769995212555