# How to work with categorical features

##### This notebook shows how to work with categorical features in CoolGraph, using the `Penn94` dataset as an example of a dataset with categorical features

In [2]:
from cool_graph.runners import Runner
from torch_geometric.data import Data
from torch_geometric import datasets
import numpy as np
import scipy
import pandas as pd
import torch

In [3]:
# helper that prints a short summary of a graph dataset
def dataset_info(data):
    """Print node, feature and edge counts plus the number of classes or tasks.

    `data` must expose `x` (at least two dimensions: nodes, features),
    `edge_index` (its second dimension is reported as the edge count) and `y`.
    A one-dimensional `y` is reported as a class count (number of distinct
    values in `y`); otherwise the size of the second dimension of `y` is
    reported as the task count.
    """
    n_features, n_nodes = data.x.shape[1], data.x.shape[0]
    n_edges = data.edge_index.shape[1]
    multi_task = len(data.y.shape) != 1
    if multi_task:
        # one column of y per task
        print(f'# nodes    {n_nodes} \n# features {n_features} \n# edges    {n_edges} \n# tasks    {data.y.shape[1]}')
        return
    # single label vector: count its distinct values
    print(f'# nodes    {n_nodes} \n# features {n_features} \n# edges    {n_edges} \n# classes  {len(data.y.unique())}')

In [9]:
# Import the Penn94 dataset loader and load the dataset.
# The log printed by this cell shows the raw Penn94.mat file being downloaded
# and the processed data being saved under ./data/penn94/.
from cool_graph.datasets.nonhomophilous import NonHomophilous
dataset = NonHomophilous(root="./data", name="Penn94")
# graph object used by the following cells
data = dataset.data

Downloading https://github.com/CUAI/Non-Homophily-Large-Scale/raw/master/data/facebook100/Penn94.mat
Preprocessing 
Processing 
dataset saved as ./data/penn94/Penn94_data.pt


In [10]:
# print the node / feature / edge / class counts of the loaded graph
dataset_info(data)

# nodes    41554 
# features 6 
# edges    2724458 
# classes  3


##### In this dataset, all the features are categorical. They encode, for example, a student's graduation year, school number, and specialty


In [11]:
# Inspect the features: count the distinct values in every column of data.x,
# then show the first five rows as integers followed by the per-column counts.
unique_cnt = [
    len(np.unique(data.x[:, col_idx]))
    for col_idx in range(data.x.shape[1])
]
print(data.x[0:5].long())
print(unique_cnt)

tensor([[    1,   403,   517,     0,  2006, 51881],
        [    1,   461,   378,     0,     0, 21512],
        [    1,   337,   349,     0,     0,  9092],
        [    1,   294,   378,     0,  2007, 20895],
        [    1,   273,   362,     0,     0,     0]])
[7, 274, 288, 167, 39, 4039]


##### As we can see, the features are categorical

## Processing categorical features

##### In CoolGraph, categorical features are automatically transformed into embeddings. To enable this, store these features in `data.x_cat`

In [12]:
# Define the categorical features in data.x_cat so they will be transformed into embeddings.
# The guard keeps this cell idempotent: without it, a second run would slice the
# already-reduced data.x and silently shrink data.x_cat to a single column.
if getattr(data, "x_cat", None) is None:
    # the first six columns of data.x go to x_cat
    data.x_cat = data.x[:, 0:6]
    # NOTE(review): column 0 stays in data.x and is therefore also present in
    # data.x_cat; presumably Runner still needs a non-empty data.x — confirm.
    data.x = data.x[:, 0:1]

In [7]:
# Initialize the runner on the prepared graph (data.x plus data.x_cat),
# passing no other arguments.
runner = Runner(data)

In [8]:
# Training: the log below reports test and train metrics after each evaluation.
# The returned object is kept so the best metrics can be inspected afterwards.
result = runner.run()

Sample data: 100%|██████████| 117/117 [00:23<00:00,  4.97it/s]
Sample data: 100%|██████████| 39/39 [00:07<00:00,  4.89it/s]
2024-06-13 07:25:28.445 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 0.527, 'cross_entropy': 0.69, 'f1_weighted': 0.364, 'calc_time': 0.012, 'main_metric': 0.527}
2024-06-13 07:25:30.291 | INFO     | cool_graph.train.helpers:eval_epoch:209 - train:
 {'accuracy': 0.526, 'cross_entropy': 0.688, 'f1_weighted': 0.363, 'calc_time': 0.031, 'main_metric': 0.526}
2024-06-13 07:26:01.357 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 0.731, 'cross_entropy': 0.547, 'f1_weighted': 0.728, 'calc_time': 0.012, 'main_metric': 0.731}
2024-06-13 07:26:03.614 | INFO     | cool_graph.train.helpers:eval_epoch:209 - train:
 {'accuracy': 0.737, 'cross_entropy': 0.539, 'f1_weighted': 0.735, 'calc_time': 0.038, 'main_metric': 0.737}
2024-06-13 07:26:39.307 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 

In [9]:
# Check the result: the 'best_loss' entry holds the metrics
# (accuracy, cross_entropy, f1_weighted, ...) together with the epoch they come from.
result['best_loss']

{'accuracy': 0.771,
 'cross_entropy': 0.462,
 'f1_weighted': 0.771,
 'calc_time': 0.01,
 'main_metric': 0.771,
 'tasks': {'y': {'accuracy': 0.7705070074196207,
   'cross_entropy': 0.4619738757610321,
   'f1_weighted': 0.770662604144089}},
 'epoch': 20}

## Index as a feature
If you need to use an index as a feature, CoolGraph provides the `use_index_as_feature` flag, which is passed to `Runner`.

In [10]:
# a second runner on the same data, this time with the use_index_as_feature flag enabled
runner_with_id = Runner(data, use_index_as_feature=True)

In [11]:
# training with the index used as a feature
result2 = runner_with_id.run()

Sample data: 100%|██████████| 117/117 [00:22<00:00,  5.19it/s]
Sample data: 100%|██████████| 39/39 [00:07<00:00,  5.20it/s]
2024-06-13 07:28:25.794 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 0.527, 'cross_entropy': 0.69, 'f1_weighted': 0.364, 'calc_time': 0.012, 'main_metric': 0.527}
2024-06-13 07:28:27.631 | INFO     | cool_graph.train.helpers:eval_epoch:209 - train:
 {'accuracy': 0.526, 'cross_entropy': 0.69, 'f1_weighted': 0.363, 'calc_time': 0.031, 'main_metric': 0.526}
2024-06-13 07:28:54.319 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 0.728, 'cross_entropy': 0.552, 'f1_weighted': 0.727, 'calc_time': 0.01, 'main_metric': 0.728}
2024-06-13 07:28:56.085 | INFO     | cool_graph.train.helpers:eval_epoch:209 - train:
 {'accuracy': 0.737, 'cross_entropy': 0.54, 'f1_weighted': 0.736, 'calc_time': 0.029, 'main_metric': 0.737}
2024-06-13 07:29:22.209 | INFO     | cool_graph.train.helpers:eval_epoch:209 - test:
 {'accuracy': 0.7

In [12]:
# metrics stored under 'best_loss' for the run with use_index_as_feature=True
result2['best_loss']

{'accuracy': 0.768,
 'cross_entropy': 0.47,
 'f1_weighted': 0.767,
 'calc_time': 0.011,
 'main_metric': 0.768,
 'tasks': {'y': {'accuracy': 0.7679307502061006,
   'cross_entropy': 0.47012466192245483,
   'f1_weighted': 0.7674146249402316}},
 'epoch': 10}

In [None]:
%%time
# Let's try HypeRunner: optimize_run is called with n_trials=10 and verbose output is
# switched off; %%time reports how long the whole cell takes.
# NOTE(review): presumably this performs a hyperparameter search over 10 trials — confirm.
from cool_graph.runners import HypeRunner
hyperunner = HypeRunner(data, verbose=False)
result3 = hyperunner.optimize_run(n_trials=10)

In [None]:
# display what hyperunner.optimize_run returned
result3