In [1]:
import torch
from dataloader import GraphDataset, GraphTextDataset, TextDataset, GraphTextInMDataset
import networkx as nx

from transformers import AutoTokenizer

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the pretrained tokenizer for this BiomedBERT model name. The same name is
# passed to SentenceTransformer and to the tokenizer re-created further down.
tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext')

In [None]:
import os
import shutil
from tqdm import tqdm

# Machine-specific locations used by move_files. The original absolute paths are
# kept as defaults; set ALTEGRAD_SOURCE_DIR / ALTEGRAD_DESTINATION_DIR in the
# environment to run this notebook on another machine without editing the cell.
source_directory = os.environ.get(
    "ALTEGRAD_SOURCE_DIR",
    r"C:\Antoine\Study\Master 2 - MVA\ALTEGRAD\Challenge\Public\Public\data",
)
destination_directory = os.environ.get(
    "ALTEGRAD_DESTINATION_DIR",
    r"C:\Antoine\Study\Master 2 - MVA\ALTEGRAD\Altegrad-MVA-2023-2024\data",
)

def move_files(source_directory, destination_directory):
    """Move every file under ``source_directory`` into ``destination_directory``.

    The sub-directory layout found below ``source_directory`` is recreated below
    ``destination_directory``. Only files are moved; the source directories
    themselves are left in place. Progress is reported with a tqdm bar.

    Args:
        source_directory: Root directory whose files are moved.
        destination_directory: Root directory receiving the files; it is
            created if it does not exist yet.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(destination_directory, exist_ok=True)

    # Snapshot the file list before moving anything: this gives the progress
    # total and avoids walking a tree that is being modified by the moves.
    source_paths = [
        os.path.join(root, file_name)
        for root, _, files in os.walk(source_directory)
        for file_name in files
    ]

    # The context manager closes the bar even if one of the moves raises.
    with tqdm(total=len(source_paths), desc="Moving files", unit="file") as progress_bar:
        for source_path in source_paths:
            # Recreate the file's sub-directory structure under the destination.
            relative_path = os.path.relpath(source_path, source_directory)
            destination_path = os.path.join(destination_directory, relative_path)
            os.makedirs(os.path.dirname(destination_path), exist_ok=True)

            shutil.move(source_path, destination_path)
            progress_bar.update(1)

In [None]:
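# One-off setup step: uncomment to move the files from source_directory to
# destination_directory. Files are moved (not copied), so they no longer exist
# under source_directory afterwards.
# move_files(source_directory, destination_directory)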

In [None]:
# Directories of the processed train/test splits, relative to the working
# directory; the trailing slash lets a file name be appended directly.
train_dir, test_dir = (
    f"data/processed/{split_name}/" for split_name in ("train", "test")
)

In [None]:
# Load the object saved at train_dir + "data.pt" (train_dir is set in the cell above).
# NOTE(review): `sample` is rebound to train_dataset[120] a few cells below, so
# this value is only available for ad-hoc inspection in between.
# NOTE(review): depending on the torch version / weights_only setting, torch.load
# unpickles arbitrary objects — only load files from a trusted source.
sample = torch.load(train_dir + "data.pt")

In [None]:
def load_datasets(tokenizer: AutoTokenizer):
    """Build the validation and training ``GraphTextInMDataset`` objects.

    Both datasets are rooted at ``./data/`` and share the object read from
    ``./data/token_embedding_dict.npy`` as well as the given tokenizer.

    Args:
        tokenizer: Tokenizer forwarded unchanged to each dataset.

    Returns:
        A ``(val_dataset, train_dataset)`` tuple — validation first, then train.
    """
    # Indexing with an empty tuple unwraps the object stored in the loaded array
    # (presumably a dict wrapped in a 0-d array, going by the file name — confirm).
    # NOTE(review): allow_pickle=True unpickles the file; only use a trusted file.
    gt = np.load("./data/token_embedding_dict.npy", allow_pickle=True)[()]

    # Everything except the split name is identical for the two datasets.
    shared_kwargs = dict(root="./data/", gt=gt, tokenizer=tokenizer)
    val_dataset, train_dataset = (
        GraphTextInMDataset(split=split_name, **shared_kwargs)
        for split_name in ("val", "train")
    )
    return val_dataset, train_dataset

# load_datasets returns the pair in the order (validation, train).
val_dataset, train_dataset = load_datasets(tokenizer)

In [None]:
# Pick one training example for inspection. This rebinds `sample`, which earlier
# held the result of torch.load.
sample = train_dataset[120]

In [None]:
import textwrap

In [None]:
def show_sample(sample):
    """Draw one sample's graph with its decoded text as a caption.

    Reads ``sample.edge_index`` (row 0 holds edge sources, row 1 edge targets,
    as used below) and ``sample.input_ids``. The text is decoded with the
    notebook-level ``tokenizer``. Drawing goes through ``nx.draw`` and
    ``plt.text``; nothing is returned.
    """
    edge_index = sample.edge_index
    graph = nx.DiGraph()

    # One directed edge per column of edge_index.
    for column in range(len(edge_index[0])):
        source_node = int(edge_index[0][column])
        target_node = int(edge_index[1][column])
        graph.add_edge(source_node, target_node)

    # Lay the graph out and draw it (edges rendered without arrow heads).
    layout = nx.kamada_kawai_layout(graph)
    draw_options = dict(
        with_labels=True,
        font_size=8,
        node_color='skyblue',
        node_size=200,
        edge_color='gray',
        linewidths=0.3,
        arrows=False,
    )
    nx.draw(graph, layout, **draw_options)

    # Decode the token ids back to text; only the first decoded entry is shown.
    decoded_batch = tokenizer.batch_decode(sample.input_ids, skip_special_tokens=True)
    caption = textwrap.fill(decoded_batch[0], width=70)

    # Caption anchored at (0, -1) on a semi-transparent white box.
    caption_box = dict(facecolor='white', alpha=0.4)
    plt.text(0, -1, caption, ha='center', va='center', fontsize=8, bbox=caption_box)

In [None]:
# 2x2 grid of example graphs: each listed training-set index gets its own panel,
# filled in the order given.
plt.figure(figsize=(15, 15))

for panel, sample_index in enumerate((41, 121, 452, 71), start=1):
    plt.subplot(2, 2, panel)
    show_sample(train_dataset[sample_index])

In [1]:
from sentence_transformers import SentenceTransformer
# Wrap the same BiomedBERT model name used for the tokenizer as a SentenceTransformer.
# Per the message printed below, no sentence-transformers model exists under this
# name, so a new one is created with MEAN pooling.
model = SentenceTransformer('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext')

No sentence-transformers model found with name C:\Users\Antoine/.cache\torch\sentence_transformers\microsoft_BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext. Creating a new one with MEAN pooling.


In [None]:
# Round-trip one training sample: decode its token ids back to text, print it,
# then embed that text with the SentenceTransformer and print the embedding shape.
decoded_batch = tokenizer.batch_decode(train_dataset[41].input_ids, skip_special_tokens=True)
decoded_input = decoded_batch[0]
print(decoded_input)

embedding = model.encode(decoded_input)
print(embedding.shape)

In [4]:
import pandas as pd

In [13]:
# Load the train/validation tables (tab-separated).
# NOTE(review): read_csv treats the first row as the header by default; if these
# TSVs have no header row, pass header=None so the first example is not
# consumed as column names — confirm against the files.
train_ds = pd.read_csv("./data/train.tsv", sep="\t")
val_ds = pd.read_csv("./data/val.tsv", sep="\t")

# Read data/test_text.txt into a list of strings, one per line, with surrounding
# whitespace (including the trailing newline) removed. The encoding is explicit
# because open() otherwise falls back to the platform default (e.g. cp1252 on
# Windows), whereas read_csv above decodes UTF-8.
with open("./data/test_text.txt", "r", encoding="utf-8") as f:
    test_text = [line.strip() for line in f]

In [14]:
# Combine all texts into one list in the order train, val, test. Column index 1
# (the second column) of each TSV frame is taken as the text.
ds = train_ds.iloc[:, 1].tolist() + val_ds.iloc[:, 1].tolist() + test_text

In [2]:
# Save to pandas dataframe
# Put the combined texts into a single-column ("text") DataFrame and write it to
# ./data/text.csv without the index column.
df = pd.DataFrame({"text": ds})
df.to_csv("./data/text.csv", index=False)

NameError: name 'pd' is not defined

In [10]:
# Reload the combined text corpus. The preprocess function defined in this cell
# truncates each text to max_length=256 tokens (not 512) and, with
# return_overflowing_tokens=True, adds the overflow as new rows.
df = pd.read_csv("./data/text.csv")

from transformers import AutoTokenizer
# Re-create the tokenizer with the same model name as at the top of the notebook,
# presumably so this cell can run without the earlier tokenizer cell.
tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext')

def preprocess(samples, batch_size=64, max_length=256):
    """Tokenize ``samples["text"]`` in batches with the notebook-level ``tokenizer``.

    Args:
        samples: Table with a ``"text"`` column (a DataFrame in this notebook).
        batch_size: Number of rows handed to the tokenizer per call.
        max_length: ``max_length`` forwarded to the tokenizer. Truncation is
            enabled and ``return_overflowing_tokens=True`` is passed, so the
            part beyond this length is returned by the tokenizer rather than
            discarded.

    Returns:
        A list with one tokenizer output per batch, in row order. The list is
        empty when ``samples`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would otherwise silently produce no batches at all.
        raise ValueError("batch_size must be a positive integer")

    texts = samples["text"]
    return [
        tokenizer(
            texts[start:start + batch_size].tolist(),
            truncation=True,
            max_length=max_length,
            return_overflowing_tokens=True,
        )
        for start in range(0, len(samples), batch_size)
    ]

# `tokenized` is a list holding one tokenizer output per batch of rows from df.
tokenized = preprocess(df)

In [18]:
# Flatten the per-batch tokenizer outputs into one DataFrame with one row per
# encoded sequence. The per-batch frames are built first and concatenated once;
# calling pd.concat inside the loop re-copies the accumulated frame every time.
# As before, the index is not reset, so it restarts at 0 for every batch.
# NOTE(review): the "text" column is filled with input_ids (identical to the
# "input_ids" column) — presumably decoded text was intended; confirm.
token_columns = ["text", "input_ids", "attention_mask"]
batch_frames = [
    pd.DataFrame({
        "text": batch["input_ids"],
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
    })
    for batch in tokenized
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when there is nothing to concatenate.
df = pd.concat(batch_frames) if batch_frames else pd.DataFrame(columns=token_columns)
df

Unnamed: 0,text,input_ids,attention_mask
0,"[2, 22, 17, 4118, 25238, 1037, 17, 22, 17, 163...","[2, 22, 17, 4118, 25238, 1037, 17, 22, 17, 163...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[2, 19406, 1935, 2687, 1977, 1925, 4595, 16, 1...","[2, 19406, 1935, 2687, 1977, 1925, 4595, 16, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[2, 2136, 17, 2127, 17, 24, 17, 18588, 28038, ...","[2, 2136, 17, 2127, 17, 24, 17, 18588, 28038, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[2, 7843, 22, 17, 20990, 2928, 9286, 2010, 197...","[2, 7843, 22, 17, 20990, 2928, 9286, 2010, 197...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[2, 10534, 5405, 2053, 12, 21, 15, 13, 1977, 1...","[2, 10534, 5405, 2053, 12, 21, 15, 13, 1977, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
43,"[2, 12, 17659, 13, 17, 24321, 11800, 11123, 19...","[2, 12, 17659, 13, 17, 24321, 11800, 11123, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
44,"[2, 12, 24, 17, 20990, 22467, 2199, 13, 29141,...","[2, 12, 24, 17, 20990, 22467, 2199, 13, 29141,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
45,"[2, 8298, 4151, 2057, 1977, 43, 3421, 5482, 19...","[2, 8298, 4151, 2057, 1977, 43, 3421, 5482, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
46,"[2, 54, 17, 13489, 5482, 12, 21, 15, 13, 1977,...","[2, 54, 17, 13489, 5482, 12, 21, 15, 13, 1977,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [16]:
# Persist the tokenized frame without the index column.
# NOTE(review): CSV stores the list-valued columns as their string representation,
# so they have to be parsed back into lists after reading this file — confirm
# that downstream readers do so.
df.to_csv("./data/text_tokenized.csv", index=False)

Unnamed: 0,text,input_ids,attention_mask
0,"[2, 22, 17, 4118, 25238, 1037, 17, 22, 17, 163...","[2, 22, 17, 4118, 25238, 1037, 17, 22, 17, 163...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[2, 19406, 1935, 2687, 1977, 1925, 4595, 16, 1...","[2, 19406, 1935, 2687, 1977, 1925, 4595, 16, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[2, 2136, 17, 2127, 17, 24, 17, 18588, 28038, ...","[2, 2136, 17, 2127, 17, 24, 17, 18588, 28038, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[2, 7843, 22, 17, 20990, 2928, 9286, 2010, 197...","[2, 7843, 22, 17, 20990, 2928, 9286, 2010, 197...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[2, 10534, 5405, 2053, 12, 21, 15, 13, 1977, 1...","[2, 10534, 5405, 2053, 12, 21, 15, 13, 1977, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
43,"[2, 12, 17659, 13, 17, 24321, 11800, 11123, 19...","[2, 12, 17659, 13, 17, 24321, 11800, 11123, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
44,"[2, 12, 24, 17, 20990, 22467, 2199, 13, 29141,...","[2, 12, 24, 17, 20990, 22467, 2199, 13, 29141,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
45,"[2, 8298, 4151, 2057, 1977, 43, 3421, 5482, 19...","[2, 8298, 4151, 2057, 1977, 43, 3421, 5482, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
46,"[2, 54, 17, 13489, 5482, 12, 21, 15, 13, 1977,...","[2, 54, 17, 13489, 5482, 12, 21, 15, 13, 1977,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
