In [1]:
# import necessary packages
import sys, os, glob
import torch 
import numpy as np
import networkx as nx
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU
import utils.prepr`ocessing as pp

device = GetLowestGPU()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

Device set to cuda:0


## Load Model and Data

In [2]:
# options
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# load tokenizer and model
pipeline = pipeline('text-generation', 
                    model=model_path,
                    model_kwargs={'torch_dtype': torch.bfloat16},
                    device_map = 'auto'
                    )

pipeline.model = get_peft_model(pipeline.model, peft_config)
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.generation_config.pad_token_id = pipeline.tokenizer.eos_token_id

pipeline.model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [13]:
dataset_path = '../data/networks/DREAM4/DREAM4_in-silico_challenge/Size_10/DREAM4_gold_standards/'
graph_names = glob.glob(dataset_path + "*.tsv")
graph_names = [os.path.basename(x) for x in graph_names]

In [14]:
graph_names

['insilico_size10_4_goldstandard.tsv',
 'insilico_size10_5_goldstandard.tsv',
 'insilico_size10_1_goldstandard.tsv',
 'insilico_size10_3_goldstandard.tsv',
 'insilico_size10_2_goldstandard.tsv']

## Convert .txt Files to NetworkX Format

In [15]:
def txt_to_nx(filename, graph_type = nx.Graph):
    graph = nx.read_edgelist(filename,
                             create_using=graph_type,
                             nodetype=str,
                             data=(('weight', float),))
    return graph

In [16]:
# make empty list for graphs
graphs = []

for name in tqdm(graph_names):
    # get filename
    filename = dataset_path + name

    # load graph
    graph = txt_to_nx(filename)
    
    # append to list
    graphs.append(graph)

  0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
G = graphs[0]

<networkx.classes.graph.Graph at 0x7f64bc90ef70>

## Preprocess Data

In [18]:
# define functions
def network_to_text(G):

    nodes = []
    for x in G.nodes():
        nodes.append(x)

    # collect edges and weights
    edges = []
    for u,v in G.edges():
        edges.append("("+str(u)+","+str(v)+") with weight " + str(G.get_edge_data(u,v)['weight']))

    return(nodes, edges)

def format_chat(nodes, edges, question):
    row_json = [{'role': 'user',
                 'content': 'In and undirected weighted graph, (i,j) means that node i and node j are connected with an undirected, weighted edge. The nodes are: {} and the edges are: {}\n Is there a cycle in this graph?'},
                {'role': 'assistant', 'content': 'yes'}]
    row["text"] = pipeline.tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

def preprocess_data(examples):
    tokenized_data = pipeline.tokenizer(text=examples['text'],
                               padding='max_length', 
                               truncation=True, 
                               max_length=1024)
    
    labels = tokenized_data['input_ids'].copy()
    
    for i in range(len(labels)):
        if labels[i][-1] != pipeline.tokenizer.pad_token_id:
            labels[i] = labels[i][1:] + [pipeline.tokenizer.pad_token_id]
        else:
            labels[i] = labels[i][1:] + [-100]

    labels = [[-100 if x == pipeline.tokenizer.pad_token_id else x for x in y] for y in labels]
    tokenized_data['labels'] = labels
    
    return tokenized_data

## Create Dataloaders

In [19]:
# instantiate data collator
data_collator = DataCollatorWithPadding(tokenizer=pipeline.tokenizer)

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=8, 
                              collate_fn=data_collator,
                              num_workers=20)

val_dataloader = DataLoader(tokenized_dataset['validation'],
                            batch_size=8,
                            collate_fn=data_collator,
                            num_workers=2)

NameError: name 'tokenized_dataset' is not defined

In [None]:
# inspect sample batch
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

In [None]:
outputs = pipeline.model(**batch)
print(outputs.loss, outputs.logits.shape)

## Train Model

In [None]:
# options
num_batches = 10_000
num_epochs = 1
best_val_loss = np.inf
checkpoint_path = '../checkpoints/checkpoint_{0}.pt'
log_path = '../logs/log.csv'

# init optimizer
optimizer = AdamW(pipeline.model.parameters(), lr=1e-5)

# init scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_epochs * num_batches,
)

with open(log_path, 'w') as f: 
    f.write(f'epoch,iter_num,train_loss,val_loss\n')

# loop
for epoch in range(num_epochs):

    clear_output(wait=True)

    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # initialize train loss, val loss
    running_train_loss = 0.0
    running_val_loss = 0.0

    # loop through train data
    print("Training...")
    i = 0
    with tqdm(total=num_batches) as pbar:
        for train_batch, val_batch in zip(train_dataloader, val_dataloader):
            
            ## training
            # set model to train mode
            pipeline.model.train()

            # grab batch and map to device
            train_batch = {k: v.to(device) for k, v in train_batch.items()}

            # forward pass
            outputs = pipeline.model(**batch)
            train_loss = outputs.loss

            running_train_loss += train_loss.item()

            # backward pass
            train_loss.backward()
            # accelerator.backward(loss)

            # clip gradients
            torch.nn.utils.clip_grad_norm_(pipeline.model.parameters(), 1.0)

            # update optimizer, scheduler
            optimizer.step()
            lr_scheduler.step()

            # zero gradients
            optimizer.zero_grad()
            
            ## validation
            # set model to eval mode
            pipeline.model.eval()
            # loop through val data
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            with torch.no_grad():
                outputs = pipeline.model(**batch)
                val_loss = outputs.loss
                running_val_loss += val_loss.item()
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
            
            print(f"Train Batch Loss: {train_loss:.4f} | Val Batch Loss: {val_loss:.4f} | Best Val. Loss: {best_val_loss:.4f}\r", end="")

            i += 1
            pbar.update(1)
            if i % 1000 == 0:

                # # save model checkpoint
                # checkpoint = {
                #     'model': pipeline.model.state_dict(),
                #     'optimizer': optimizer.state_dict(),
                #     'epoch': epoch,
                #     'iter_num': i,``
                #     'best_val_loss': best_val_loss,
                # }
                # torch.save(checkpoint, checkpoint_path.format(i))
                
            # write to log
            with open(log_path, 'a') as f: 
                f.write(f'{epoch},{i},{train_loss},{val_loss}\n')
            
            if i == num_batches:
                print(f"Reached {num_batches} batches; breaking...")
                break
    
    train_loss = running_train_loss / num_batches
    val_loss = running_val_loss / num_batches

    print("Epoch Complete; Printing example response...")
    print(pipeline(text, max_length=100, truncation=True))

    train_loss = running_train_loss / len(train_dataloader)
    print(f"Avg. Train Loss: {train_loss:.4f}, Avg. Val Loss: {val_loss:.4f}")
    # print("Evaluation metrics:", metric.compute())

print("Training Complete!")

## Evaluate Predictions on Benchmarks

In [21]:
# define questions
connectivity = "Is there a path between node {node1} and node {node2} in this undirected graph?"

cycle = "Is there a cycle in this undirected graph?"

top_sort = "Perform a topological sort of this undirected graph."

shortest_path = "What path between {node1} and {node2} minimizes the weights of their constituent edges?"

max_flow = "What is the maximum amount of flow that can travel through this undirected graph?"

bipartite_graph_matching = "Find the maximum bipartite matching of this graph."

hamilton_path = "Find all possible paths in this undirected graph that visit each vertex exactly once."

In [22]:
# get example graph for asking questions
ex = graphs[0]
nodes, edges = network_to_text(ex)
node_str = ", ".join(nodes[1:20])
edges_str = ", ".join(edges[1:20])

# get question
question = top_sort

message = [{'role': 'system', 'content': 'You are an expert on topology.'},
           {'role': 'user',
            'content': 'In and undirected weighted graph, (i,j) means that node i '\
            +'and node j are connected with an undirected, weighted edge. '\
            +f'The nodes are: {node_str} '\
            +f'and the edges are: {edges_str}. '\
            +f'{question}'}]

In [23]:
print(pipeline(message, max_new_tokens=2048, truncation=False)[0]['generated_text'])

[{'role': 'system', 'content': 'You are an expert on topology.'}, {'role': 'user', 'content': 'In and undirected weighted graph, (i,j) means that node i and node j are connected with an undirected, weighted edge. The nodes are: G2, G3, G9, G4, G5, G6, G7, G8, G10 and the edges are: (G1,G4) with weight 0.0, (G1,G5) with weight 0.0, (G1,G7) with weight 0.0, (G1,G3) with weight 0.0, (G1,G6) with weight 0.0, (G1,G8) with weight 0.0, (G1,G9) with weight 0.0, (G1,G10) with weight 0.0, (G2,G3) with weight 0.0, (G2,G9) with weight 0.0, (G2,G4) with weight 0.0, (G2,G5) with weight 0.0, (G2,G6) with weight 0.0, (G2,G7) with weight 0.0, (G2,G8) with weight 0.0, (G2,G10) with weight 0.0, (G3,G4) with weight 0.0, (G3,G6) with weight 0.0, (G3,G8) with weight 0.0. Perform a topological sort of this undirected graph.'}, {'role': 'assistant', 'content': 'A topological sort of an undirected graph is not possible because it is not a directed acyclic graph (DAG). In a topological sort, we can\'t have an e