In [153]:
import pickle
import time as timer
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

In [154]:
# Root directory of the preprocessed model files that the cells below read from
# (corpus.txt, vocabulary.txt). The commented-out line is the alternative CORDIS
# location; uncomment it (and comment the Cancer one) to switch datasets.
#model_path = "/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_preproc/iter_0"
model_path = "/export/usuarios_ml4ds/lbartolome/Datasets/Cancer/models_preproc/iter_0"

## Load corpus

In [155]:
def mallet_corpus_to_df(corpusFile: pathlib.Path):
    """Converts a Mallet corpus file (i.e., file required for the Mallet import command) to a pandas DataFrame

    Every non-blank line is expected to look like ``<id> 0 <text>``. The line
    is split on the FIRST ``' 0 '`` only, so a document whose text itself
    contains a ``0`` token is kept whole instead of being truncated.
    Blank lines are skipped.

    Parameters
    ----------
    corpusFile: pathlib.Path
        Path to the Mallet corpus file

    Returns
    -------
    :   pandas.DataFrame
        DataFrame with the corpus: column 'id' holds the document identifiers
        and column 'text' the document texts (both as stripped strings)

    Raises
    ------
    ValueError
        If a non-blank line does not contain the ' 0 ' separator
    """

    indexes = []
    corpus = []
    # Single pass over the file; the context manager guarantees the handle is
    # closed (the file can be large, so avoid reading it twice).
    with open(corpusFile, encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            if not line.strip():
                continue
            # partition() splits at the first separator only, leaving any
            # later ' 0 ' occurrences inside the text untouched.
            doc_id, separator, text = line.partition(' 0 ')
            if not separator:
                raise ValueError(
                    f"Line {line_no} of {corpusFile} is not in '<id> 0 <text>' format")
            indexes.append(doc_id.strip())
            corpus.append(text.strip())

    corpus_dict = {
        'id': indexes,
        'text': corpus
    }
    return pd.DataFrame(corpus_dict)

In [156]:
# Parse the Mallet-formatted corpus stored under model_path into a DataFrame
# with one row per document, then display it.
path_corpus = pathlib.Path(model_path).joinpath("corpus.txt")
df = mallet_corpus_to_df(path_corpus)
df

Unnamed: 0,id,text
0,44901896,ribonucleic_acid meet train jena participant j...
1,46156343,prostate degarelix gonadotropin_release hormon...
2,29070965,intestinal_microflora necessary spontaneous ad...
3,22277284,identification co_repressor inhibitor transcri...
4,218582103,immune modulation properties zoledronic_acid t...
...,...,...
1505173,18071839,messenger_ribonucleic acid electroporation eff...
1505174,33987835,frequency mismatch_repair deficiency pediatric...
1505175,23158902,silence enhances vitro vivo osteogenic_differe...
1505176,201042517,phase_ib durvalumab combination trastuzumab me...


In [157]:
# Size of a 1/100 sample of df (integer part of len(df) / 100).
print(f"Kept {len(df) // 100} docus")

Kept 15051 docus


In [137]:
# Keep only the first 1/100 of the rows.
# NOTE: df is overwritten from itself, so running this cell again shrinks the
# frame by another factor of 100; reload the corpus before re-running.
df = df.head(len(df) // 100)

In [138]:
# Show the current contents of df.
df

Unnamed: 0,id,text
0,234639473,remark speech_recognition tea sources sensor s...
1,61407153,signal improvement holographic storage adaptiv...
2,255080817,data_drive condition monitoring industrial equ...
3,229290883,gaze self identification play computer_vision ...
4,51877396,voices social communities case kaohsiung advan...
...,...,...
15693,251946880,molecular automated speech_recognition optical...
15694,219860778,probabilistic sequential optimization cnn netw...
15695,60313186,motion coordination robots study cooperative b...
15696,54449907,genetic_algorithm fusion grammar level multimo...


### Prepare corpus for topmost

In [139]:
# Sanity check: the train and test partitions together should contain as many
# rows as df.
# NOTE(review): train_df and test_df are only assigned in a later cell, so on
# a fresh kernel this line raises NameError; the recorded False presumably
# comes from a stale split of a different df — confirm.
len(df) == (len(train_df) + len(test_df))

False

In [140]:
# Show the training partition.
# NOTE(review): train_df is assigned in a later cell, so this display relies
# on out-of-order execution.
train_df

Unnamed: 0,id,text
5446,28962572,phase multicenter blind sequential dose_escala...
6843,11770120,evolution vitamin vitamin_d second decade cent...
12957,1198705,lung non_hodgkin_lymphoma lymphoma elderly non...
8759,16837459,tolfenamic_acid pancreatic radiotherapy inhibi...
5168,25687389,maspin pleomorphic_adenoma aim presence distri...
...,...,...
5191,207574910,preserve diagnosis dual source computed_tomogr...
13418,23415724,sex_cord stromal investigation retrospective s...
5390,38957431,hypereosinophilic paraneoplastic_syndrome comp...
860,212640133,update implication cyclin melanoma cyclin prot...


In [128]:
import json
from sklearn.model_selection import train_test_split
import random

# Hold out 10% of the documents for testing (90% train / 10% test); the fixed
# random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
assert len(train_df) + len(test_df) == len(df)

possible_labels = ["A", "B", "C"]

# Each document gets a label drawn at random from possible_labels (presumably
# a placeholder so the files match the expected schema — confirm). A dedicated
# seeded generator makes the exported files identical across runs.
label_rng = random.Random(42)

# One JSON-serialisable record per document: {"label": ..., "text": ...}
train_json = [
    {"label": label_rng.choice(possible_labels), "text": text}
    for text in train_df["text"]
]
test_json = [
    {"label": label_rng.choice(possible_labels), "text": text}
    for text in test_df["text"]
]

# Write next to the model files so that switching model_path also switches the
# export location; create the folder if it does not exist yet.
output_dir = pathlib.Path(model_path) / "topmost"
output_dir.mkdir(parents=True, exist_ok=True)
# Kept as strings with a trailing slash, as in the original cell.
output_path = f"{output_dir}/"
train_output_path = output_path + 'train.jsonlist'
test_output_path = output_path + 'test.jsonlist'

# One JSON object per line in each output file.
for jsonlist_path, records in (
    (train_output_path, train_json),
    (test_output_path, test_json),
):
    with open(jsonlist_path, 'w', encoding='utf-8') as jsonlist_file:
        jsonlist_file.writelines(json.dumps(item) + '\n' for item in records)

print(f"Training data saved to {train_output_path}")
print(f"Testing data saved to {test_output_path}")

Training data saved to /export/usuarios_ml4ds/lbartolome/Datasets/Cancer/models_preproc/iter_0/topmost/train.jsonlist
Testing data saved to /export/usuarios_ml4ds/lbartolome/Datasets/Cancer/models_preproc/iter_0/topmost/test.jsonlist


## Load vocabulary and BoW

In [129]:
# Map every vocabulary entry (one stripped line of vocabulary.txt) to its
# zero-based line number; if an entry repeats, the last occurrence wins.
vocab_file = pathlib.Path(model_path) / 'vocabulary.txt'
with vocab_file.open('r', encoding='utf8') as fin:
    vocab_w2id = {entry.strip(): position for position, entry in enumerate(fin)}

In [130]:
# Number of entries in the word -> id vocabulary mapping.
len(vocab_w2id)

100000

In [131]:
# Documents are tokenised by splitting on whitespace. The vectorizer learns
# its own vocabulary from df (it is not given vocab_w2id), and the sparse
# count matrix is expanded into a dense array, which can be memory-hungry.
vectorizer = CountVectorizer(tokenizer=str.split)
documents = df["text"].tolist()
bow = vectorizer.fit_transform(documents).toarray()



In [132]:
# Dimensions of the bag-of-words array as (rows, columns).
bow.shape

(15051, 50711)