In [None]:
import json
import os
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Seed passed as `random_state` to both train_test_split calls in this notebook.
RANDOM_STATE = 42

In [None]:
def tokenizer(x):
    """Split the text `x` on runs of whitespace and return the list of tokens.

    Used to count tokens when filtering out short abstracts.
    """
    return x.split()


# Alternative: count sub-word tokens instead of whitespace-delimited words.
# tokenizer = AutoTokenizer.from_pretrained("albert-base-v2").tokenize

In [None]:
# `input_dir` holds the scraped bioRxiv pickle files that are read;
# `output_dir` receives the train/valid/test text files that are written.
# The defaults are the original local paths; set BIORXIV_INPUT_DIR and/or
# BIORXIV_OUTPUT_DIR to run the notebook elsewhere without editing this cell.
input_dir = os.environ.get("BIORXIV_INPUT_DIR", "/Users/johngiorgi/Desktop/biorxiv_dump")
output_dir = os.environ.get(
    "BIORXIV_OUTPUT_DIR", "/Users/johngiorgi/Documents/dev/t2t/datasets/biorxiv"
)

In [None]:
input_dir = Path(input_dir)
scraped_content = {}
# Abstracts with fewer than this many tokens are dropped by the filtering cell.
min_word_count = 25
# This notebook later writes its own output into `input_dir`; that file must
# not be read back as input when the notebook is re-run.
cleaned_pickle_name = "scraped_content_clean.pickle"

# Sort the directory listing: `iterdir()` yields entries in arbitrary order, and
# the insertion order of `scraped_content` determines the order of the examples
# that are split and later paired positionally with the embeddings.
for path in sorted(input_dir.iterdir()):
    if not path.name.endswith(".pickle") or path.name == cleaned_pickle_name:
        continue
    # NOTE: unpickling can execute arbitrary code; only load trusted dumps.
    with open(path, "rb") as f:
        scraped_content.update(pickle.load(f))

In [None]:
x, y = [], []
scraped_content_cleaned = {}

for doi, content in scraped_content.items():
    # Collapse every run of whitespace (spaces, newlines, tabs) into a single
    # space so each abstract fits on one line. A None abstract is treated as
    # an empty string instead of raising.
    abstract = ' '.join((content['abstract'] or '').split())

    # Skip abstracts that are too short. Tokens are counted on the normalized
    # text, i.e. on exactly what is appended to `x`.
    if len(tokenizer(abstract)) < min_word_count:
        continue

    scraped_content_cleaned[doi] = content
    x.append(abstract)
    y.append(content["subject_area"])

# Guard the ratio so an empty input reports 0% instead of dividing by zero.
retained_fraction = len(x) / len(scraped_content) if scraped_content else 0.0
print(f"Retained {len(x)}/{len(scraped_content)} ({retained_fraction:.2%}) of articles after filtering for a length of {min_word_count} tokens.")

Retained 59178/59598 (99.30%) of articles after filtering for a length of 25 tokens.


In [None]:
# Target fraction of the filtered data for each partition.
train_size = 0.80
valid_size = 0.10
test_size = 0.10

# Options shared by both splitting steps.
split_options = {"random_state": RANDOM_STATE, "shuffle": True}

# Step 1: carve off the training set; everything else is held out.
holdout_fraction = 1-train_size
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=holdout_fraction, stratify=y, **split_options
)

# Step 2: divide the held-out remainder into validation and test sets.
test_share_of_holdout = test_size/(test_size + valid_size)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=test_share_of_holdout,
    stratify=y_holdout,
    **split_options,
)

In [None]:
# Every partition must carry exactly one label per abstract.
for features, labels in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    assert len(features) == len(labels)

In [None]:
output_dir = Path(output_dir)
# Create the destination if it does not exist yet instead of failing on open().
output_dir.mkdir(parents=True, exist_ok=True)

# File name -> sequence of strings written one per line.
partitions = {
    "train.txt": x_train,
    "valid.txt": x_valid,
    "test.txt": x_test,
    "train_labels.txt": y_train,
    "valid_labels.txt": y_valid,
    "test_labels.txt": y_test,
}

for filename, lines in partitions.items():
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding. Items are newline-separated with no trailing newline.
    with open(output_dir / filename, "w", encoding="utf-8") as f:
        f.write('\n'.join(lines))

[]

In [None]:
# Persist the articles that passed the length filter into the input directory.
cleaned_pickle_path = Path(input_dir) / "scraped_content_clean.pickle"
with cleaned_pickle_path.open("wb") as pickle_file:
    pickle.dump(scraped_content_cleaned, pickle_file)

In [None]:
# Number of abstracts retained after filtering.
len(x)

59178

In [None]:
# Sanity check: list every retained abstract that is empty (falsy).
[abstract_text for abstract_text in x if not abstract_text]

[]

In [None]:
# Read the "doc_embeddings" field from each line of the JSON-lines file that
# sits in the same directory as the train/valid/test text files.
embeddings = []
with open(Path(output_dir) / "embeddings.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        embeddings.append(json.loads(line)["doc_embeddings"])
# One embedding per retained article is required, because the next step pairs
# them by position.
assert len(scraped_content_cleaned) == len(embeddings)

In [None]:
# Store each embedding on its article under the "w2v" key. Pairing is purely
# positional: the i-th article (in dict insertion order) gets the i-th embedding.
for record, embedding in zip(scraped_content_cleaned.values(), embeddings):
    record["w2v"] = embedding

In [None]:
# Overwrite the cleaned pickle so it also contains the "w2v" entries.
enriched_pickle_path = Path(input_dir) / "scraped_content_clean.pickle"
with enriched_pickle_path.open("wb") as out_file:
    pickle.dump(scraped_content_cleaned, out_file)