In [1]:
import os
import pyreadr
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

# 0. Read data

In [2]:
data_dir = os.path.join("..", "..", "data", "raw")

result_pubmed = pyreadr.read_r(os.path.join(data_dir, "pubmed.rds"))
df_pubmed = list(result_pubmed.values())[0]

result_autoreg = pyreadr.read_r(os.path.join(data_dir, "autoregulatoryDB.rds"))
df_autoreg = list(result_autoreg.values())[0]

print("df_pubmed shape:", df_pubmed.shape)
print("df_autoreg shape:", df_autoreg.shape)

df_pubmed shape: (262819, 5)
df_autoreg shape: (1323976, 13)


In [3]:
df_pubmed.head()

Unnamed: 0,PMID,Title,Abstract,Journal,Authors
0,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,Virology,"Wendy G H Tan, Todd J Barkman, V Gregory Chinc..."
1,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,Journal of virology,"Gustavo Delhon, Edan R Tulman, Claudio L Afons..."
2,11448171,Analysis of the first complete DNA sequence of...,"Chilo iridescent virus (CIV), the type species...",Virology,"N J Jakob, K Müller, U Bahr, G Darai"
3,17239238,Comparative genomic analysis of the family Iri...,Members of the family Iridoviridae can cause s...,Virology journal,"Heather E Eaton, Julie Metcalf, Emily Penny, V..."
4,20633916,"Frog virus 3 ORF 53R, a putative myristoylated...",Although previous work identified 12 complemen...,Virology,"Dexter S Whitley, Kwang Yu, Robert C Sample, A..."


In [4]:

missing_abstracts = df_pubmed['Abstract'].isna() | (df_pubmed['Abstract'].str.strip() == '')

df_pubmed[missing_abstracts]


Unnamed: 0,PMID,Title,Abstract,Journal,Authors
12,8310077,Nucleotide sequence of a stamen- and tapetum-s...,,Plant physiology,"R Chen, A G Smith"
37,17556061,Identification of 2 new sesame seed allergens:...,,The Journal of allergy and clinical immunology,"Kirsten Beyer, Galina Grishina, Ludmilla Bardi..."
70,7972511,Five cDNAs encoding Arabidopsis GF14 proteins.,,Plant physiology,"G Lu, M F Rooney, K Wu, R J Ferl"
71,7870824,Sequences of three Arabidopsis general regulat...,,Plant physiology,"M F Rooney, R J Ferl"
94,8577340,Stage-specific expression of the mRNA encoding...,,Molecular and biochemical parasitology,"D Schechtman, D Ram, R Tarrab-Hazdai, R Arnon,..."
...,...,...,...,...,...
262515,10612253,Physical and linkage mapping of the bovine zin...,,Animal genetics,"I Tammen, W C Warren, H W Raadsma"
262598,21387411,Solution structure of the second PDZ domain of...,,Proteins,"Peng Ji, Guang Yang, Jiahai Zhang, Jiawen Wu, ..."
262667,2541416,Nucleotide sequence of the gene encoding zona ...,,Nucleic acids research,"R A Kinloch, P M Wassarman"
262678,8518738,A trefoil domain in the major rabbit zona pell...,,Protein science : a publication of the Protein...,P Bork


In [5]:
df_autoreg.head()

Unnamed: 0,AC,OS,RN,RP,RC,RX,RG,RA,RT,RL,Term_in_RP,Term_in_RT,Term_in_RC
0,Q6GZX4,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,
1,Q6GZX3,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,
2,Q197F8,Invertebrate iridescent virus 3 (IIV-3) (Mosqu...,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=16912294; DOI=10.1128/jvi.00464-06;,,"Delhon G., Tulman E.R., Afonso C.L., Lu Z., Be...","""Genome of invertebrate iridescent virus type ...",J. Virol. 80:8439-8449(2006).,,,
3,Q197F7,Invertebrate iridescent virus 3 (IIV-3) (Mosqu...,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=16912294; DOI=10.1128/jvi.00464-06;,,"Delhon G., Tulman E.R., Afonso C.L., Lu Z., Be...","""Genome of invertebrate iridescent virus type ...",J. Virol. 80:8439-8449(2006).,,,
4,Q6GZX2,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,


# 1. Data Cleaning

Join these two dataframes

In [6]:
# extract PMID from RX column and join with pubmed data
df_autoreg['PMID'] = df_autoreg['RX'].str.extract(r'PubMed=(\d+)')

df_autoreg['PMID'] = df_autoreg['PMID'].astype(str)
df_pubmed['PMID'] = df_pubmed['PMID'].astype(str)

df_merged = pd.merge(df_autoreg, df_pubmed, on='PMID', how='left')

print(df_merged.shape)
df_merged.head()

(1323976, 18)


Unnamed: 0,AC,OS,RN,RP,RC,RX,RG,RA,RT,RL,Term_in_RP,Term_in_RT,Term_in_RC,PMID,Title,Abstract,Journal,Authors
0,Q6GZX4,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,Virology,"Wendy G H Tan, Todd J Barkman, V Gregory Chinc..."
1,Q6GZX3,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,Virology,"Wendy G H Tan, Todd J Barkman, V Gregory Chinc..."
2,Q197F8,Invertebrate iridescent virus 3 (IIV-3) (Mosqu...,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=16912294; DOI=10.1128/jvi.00464-06;,,"Delhon G., Tulman E.R., Afonso C.L., Lu Z., Be...","""Genome of invertebrate iridescent virus type ...",J. Virol. 80:8439-8449(2006).,,,,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,Journal of virology,"Gustavo Delhon, Edan R Tulman, Claudio L Afons..."
3,Q197F7,Invertebrate iridescent virus 3 (IIV-3) (Mosqu...,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=16912294; DOI=10.1128/jvi.00464-06;,,"Delhon G., Tulman E.R., Afonso C.L., Lu Z., Be...","""Genome of invertebrate iridescent virus type ...",J. Virol. 80:8439-8449(2006).,,,,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,Journal of virology,"Gustavo Delhon, Edan R Tulman, Claudio L Afons..."
4,Q6GZX2,Frog virus 3 (isolate Goorha) (FV-3),[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,"Tan W.G., Barkman T.J., Gregory Chinchar V., E...","""Comparative genomic analyses of frog virus 3,...",Virology 323:70-84(2004).,,,,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,Virology,"Wendy G H Tan, Todd J Barkman, V Gregory Chinc..."


Since we are not going to use all these columns, we droped unwanted ones

In [7]:
columns_to_keep = ['AC', 'PMID', 'Title', 'Abstract', 'Term_in_RP', 'Term_in_RT', 'Term_in_RC']

df_selected = df_merged[columns_to_keep].copy()

These three 'Term_xxx' columns can be combined into one column

In [8]:
def merge_terms(row):
    cols = ['Term_in_RP', 'Term_in_RT', 'Term_in_RC']
    terms = []

    for col in cols:
        val = row[col]
        if pd.notna(val):
            split_terms = [t.strip() for t in str(val).split(',') if t.strip()]
            terms.extend(split_terms)

    return ', '.join(sorted(set(terms))) if terms else ''


df_selected['Terms'] = df_selected.apply(merge_terms, axis=1)

print(df_selected.shape)

(1323976, 8)


After combining all these three 'Terms_' column, we got our final 'Terms' column so that can be used later when we build model. And we can now drop these three 'Terms_' columns and only keep the final one

In [9]:
# number of rows that has multiple 'Terms'
df_selected[df_selected['Terms'].str.contains(',')]['Terms'].count()

np.int64(37)

In [10]:
df_cleaned = df_selected.drop(columns=['Term_in_RP', 'Term_in_RT', 'Term_in_RC'])
print(df_cleaned.shape)
df_cleaned[df_cleaned['Terms'] != ''].head()

(1323976, 5)


Unnamed: 0,AC,PMID,Title,Abstract,Terms
1085,P63104,29357390,Herpesvirus deconjugases inhibit the IFN respo...,The N-terminal domains of the herpesvirus larg...,autoubiquitination
5416,Q64264,18599790,Sporadic autonomic dysregulation and death ass...,Sudden infant death syndrome is the leading ca...,autoinhibition
5891,Q9Y6E2,29470543,Translational autoregulation of BZW1 and BZW2 ...,The efficiency of start codon selection during...,autoregulation
5926,Q7L1Q6,29470543,Translational autoregulation of BZW1 and BZW2 ...,The efficiency of start codon selection during...,autoregulation
9655,Q13131,17088252,Conserved alpha-helix acts as autoinhibitory s...,AMP-activated protein kinase (AMPK) acts as an...,autoinhibitory


In [11]:
df_cleaned

Unnamed: 0,AC,PMID,Title,Abstract,Terms
0,Q6GZX4,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,
1,Q6GZX3,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,
2,Q197F8,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,
3,Q197F7,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,
4,Q6GZX2,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,
...,...,...,...,...,...
1323971,Q88470,12970423,Tacaribe virus Z protein interacts with the L ...,Tacaribe virus (TV) is the prototype of the Ne...,
1323972,A9JR22,17624390,Principal host relationships and evolutionary ...,A previous study suggested that the genomes of...,
1323973,A9JR22,,,,
1323974,A9JR22,18602020,Phylogeny of the genus Arenavirus.,The family Arenaviridae consists of a unique g...,


#### The class distribution is very imbalanced, autophosphorylation: 850 samples (~46.6%). Many classes have < 5 samples, some only 1.

## Preprocessing Text for Modeling

In [12]:
import re
import pandas as pd

# Step 1: Drop rows with missing or empty Title or Abstract
df_cleaned = df_cleaned.dropna(subset=['Title', 'Abstract'])
df_cleaned= df_cleaned[
    df_cleaned['Title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0) &
    df_cleaned['Abstract'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)
].copy()

# Step 2: Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)             # remove extra whitespace/newlines
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)   # remove non-printable characters
    return text

# Step 3: Apply preprocessing to Title and Abstract
df_cleaned['Title_clean'] = df_cleaned['Title'].apply(preprocess_text)
df_cleaned['Abstract_clean'] = df_cleaned['Abstract'].apply(preprocess_text)

# Step 4: Concatenate cleaned Title and Abstract
df_cleaned['Text_combined'] = df_cleaned['Title_clean'] + " " + df_cleaned['Abstract_clean']

# Step 5: Sanity check
print("Shape after cleaning:", df_cleaned.shape)
print(df_cleaned[['Title_clean', 'Abstract_clean', 'Text_combined']].head(3))


Shape after cleaning: (1118080, 8)
                                         Title_clean  \
0  Comparative genomic analyses of frog virus 3, ...   
1  Comparative genomic analyses of frog virus 3, ...   
2  Genome of invertebrate iridescent virus type 3...   

                                      Abstract_clean  \
0  Frog virus 3 (FV3) is the type species member ...   
1  Frog virus 3 (FV3) is the type species member ...   
2  Iridoviruses (IVs) are classified into five ge...   

                                       Text_combined  
0  Comparative genomic analyses of frog virus 3, ...  
1  Comparative genomic analyses of frog virus 3, ...  
2  Genome of invertebrate iridescent virus type 3...  


In [13]:
# Check IF Abstract_clean is preprocessed and clean to used
df_cleaned['Abstract_clean'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0).all()


np.True_

In [14]:
# Check IF Title is preprocessed and clean to used
df_cleaned['Title'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0).all()

np.True_

In [15]:
# Check IF Terms is preprocessed and clean to used
df_cleaned['Terms'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0).all()


np.False_

In [16]:
df_cleaned.shape[0]

1118080

### Binarize the Multi-Label Terms

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split terms into lists
df_cleaned['Term_list'] = df_cleaned['Terms'].apply(
    lambda x: [t.strip() for t in x.split(',')] if isinstance(x, str) else []
)

# Initialize and fit binarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_cleaned['Term_list'])

# Save label classes for later use
label_classes = mlb.classes_


In [18]:
label_classes 

array(['', 'autoactivation', 'autocatalysis', 'autocatalytic',
       'autofeedback', 'autoinducer', 'autoinduction', 'autoinhibition',
       'autoinhibitory', 'autokinase', 'autolysis', 'autophosphatase',
       'autophosphorylation', 'autoregulation', 'autoregulatory',
       'autoubiquitination'], dtype=object)

In [19]:
print(Y.shape)  # (1781, num_classes)



(1118080, 16)


In [20]:
label_counts = pd.Series(Y.sum(axis=0), index=label_classes).sort_values(ascending=False)

print(label_counts)

                       1116299
autophosphorylation        849
autocatalytic              177
autoregulation             154
autoubiquitination         146
autoinhibition             137
autoregulatory              85
autoinducer                 73
autolysis                   70
autoinhibitory              60
autoactivation              22
autocatalysis               15
autofeedback                13
autoinduction               11
autokinase                   8
autophosphatase              1
dtype: int64


In [21]:
# If a term appears only once it’s unlikely the model will learn meaningful patterns. It may bias the loss function if class weights are applied

term_to_drop = "autophosphatase"  

# Find index 
drop_idx = list(label_classes).index(term_to_drop)

# Drop the corresponding column from Y
Y = np.delete(Y, drop_idx, axis=1)

# Remove the term from label_classes
label_classes = [label for i, label in enumerate(label_classes) if i != drop_idx]

print(Y.shape)  # (1781, num_classes)


(1118080, 15)


In [22]:
df = df_cleaned

In [23]:
df

Unnamed: 0,AC,PMID,Title,Abstract,Terms,Title_clean,Abstract_clean,Text_combined,Term_list
0,Q6GZX4,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,"Comparative genomic analyses of frog virus 3, ...",[]
1,Q6GZX3,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,"Comparative genomic analyses of frog virus 3, ...",[]
2,Q197F8,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,Genome of invertebrate iridescent virus type 3...,[]
3,Q197F7,16912294,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,,Genome of invertebrate iridescent virus type 3...,Iridoviruses (IVs) are classified into five ge...,Genome of invertebrate iridescent virus type 3...,[]
4,Q6GZX2,15165820,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,,"Comparative genomic analyses of frog virus 3, ...",Frog virus 3 (FV3) is the type species member ...,"Comparative genomic analyses of frog virus 3, ...",[]
...,...,...,...,...,...,...,...,...,...
1323970,Q88470,2510403,The 5' region of Tacaribe virus L RNA encodes ...,We have just completed the Tacaribe arenavirus...,,The 5' region of Tacaribe virus L RNA encodes ...,We have just completed the Tacaribe arenavirus...,The 5' region of Tacaribe virus L RNA encodes ...,[]
1323971,Q88470,12970423,Tacaribe virus Z protein interacts with the L ...,Tacaribe virus (TV) is the prototype of the Ne...,,Tacaribe virus Z protein interacts with the L ...,Tacaribe virus (TV) is the prototype of the Ne...,Tacaribe virus Z protein interacts with the L ...,[]
1323972,A9JR22,17624390,Principal host relationships and evolutionary ...,A previous study suggested that the genomes of...,,Principal host relationships and evolutionary ...,A previous study suggested that the genomes of...,Principal host relationships and evolutionary ...,[]
1323974,A9JR22,18602020,Phylogeny of the genus Arenavirus.,The family Arenaviridae consists of a unique g...,,Phylogeny of the genus Arenavirus.,The family Arenaviridae consists of a unique g...,Phylogeny of the genus Arenavirus. The family ...,[]


In [24]:
def create_balanced_dataset(df, ratio=2):
    """
    Process a dataset to create a balanced dataset with a specified ratio of unlabeled to labeled data.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing labeled and unlabeled data
    ratio : int, default=2
        The ratio of unlabeled to labeled samples in the final dataset
        
    Returns:
    --------
    pandas.DataFrame
        The final balanced dataset
    """
    print("\nStep 2: Separate labeled and unlabeled data")
    # Consider both NaN and empty strings as unlabeled
    df_labeled = df[(df['Terms'].notna()) & (df['Terms'] != '')].reset_index(drop=True)
    df_unlabeled = df[(df['Terms'].isna()) | (df['Terms'] == '')].reset_index(drop=True)
    print(f"Labeled data shape: {df_labeled.shape}")
    print(f"Unlabeled data shape: {df_unlabeled.shape}")
    
    print("\nStep 3: Shuffle the labeled and unlabeled datasets")
    df_labeled = df_labeled.sample(frac=1, random_state=42).reset_index(drop=True)
    df_unlabeled = df_unlabeled.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print("\nStep 4: Create a 2:1 ratio of unlabeled to labeled data")
    # Calculate how many unlabeled samples we need
    num_labeled = len(df_labeled)
    num_unlabeled_needed = num_labeled * ratio
    
    # Check if we have enough unlabeled samples
    if num_unlabeled_needed > len(df_unlabeled):
        print(f"Warning: Not enough unlabeled samples. Need {num_unlabeled_needed}, but only have {len(df_unlabeled)}.")
        print(f"Using all available unlabeled samples ({len(df_unlabeled)}).")
        df_unlabeled_selected = df_unlabeled
    else:
        print(f"Selecting {num_unlabeled_needed} unlabeled samples out of {len(df_unlabeled)} available.")
        df_unlabeled_selected = df_unlabeled.iloc[:num_unlabeled_needed]
    
    print("\nStep 5: Concatenate labeled and unlabeled data")
    final_data = pd.concat([df_labeled, df_unlabeled_selected], ignore_index=True)
    
    print("\nStep 6: Final shuffle")
    final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print("\nStep 7: Final dataset summary")
    labeled_count = sum((final_data['Terms'].notna()) & (final_data['Terms'] != ''))
    unlabeled_count = len(final_data) - labeled_count
    print(f"Final dataset shape: {final_data.shape}")
    print(f"Number of labeled samples: {labeled_count}")
    print(f"Number of unlabeled samples: {unlabeled_count}")
    print(f"Ratio of unlabeled to labeled: {unlabeled_count / labeled_count:.2f}:1")
    
    return final_data


In [25]:
    # Process the dataset
final_data = create_balanced_dataset(df, ratio=2)
    
    # Display sample of the final dataset
print("\nSample of final dataset:")
print(final_data.head())
    
    # Optional: Save the final dataset
    # final_data.to_csv('balanced_dataset.csv', index=False)


Step 2: Separate labeled and unlabeled data
Labeled data shape: (1781, 9)
Unlabeled data shape: (1116299, 9)

Step 3: Shuffle the labeled and unlabeled datasets

Step 4: Create a 2:1 ratio of unlabeled to labeled data
Selecting 3562 unlabeled samples out of 1116299 available.

Step 5: Concatenate labeled and unlabeled data

Step 6: Final shuffle

Step 7: Final dataset summary
Final dataset shape: (5343, 9)
Number of labeled samples: 1781
Number of unlabeled samples: 3562
Ratio of unlabeled to labeled: 2.00:1

Sample of final dataset:
       AC      PMID                                              Title  \
0  Q9FN94  19124768  Tyrosine phosphorylation of the BRI1 receptor ...   
1  Q06219  20159955  The mammalian clock component PERIOD2 coordina...   
2  Q14129  15461802  A genome annotation-driven approach to cloning...   
3  Q59WV0  15123810   The diploid genome sequence of Candida albicans.   
4  O75534  16356927  The autoregulatory translational control eleme...   

              

In [26]:
final_data.head()

Unnamed: 0,AC,PMID,Title,Abstract,Terms,Title_clean,Abstract_clean,Text_combined,Term_list
0,Q9FN94,19124768,Tyrosine phosphorylation of the BRI1 receptor ...,Brassinosteroids (BRs) are essential growth-pr...,autophosphorylation,Tyrosine phosphorylation of the BRI1 receptor ...,Brassinosteroids (BRs) are essential growth-pr...,Tyrosine phosphorylation of the BRI1 receptor ...,[autophosphorylation]
1,Q06219,20159955,The mammalian clock component PERIOD2 coordina...,Mammalian circadian clocks provide a temporal ...,,The mammalian clock component PERIOD2 coordina...,Mammalian circadian clocks provide a temporal ...,The mammalian clock component PERIOD2 coordina...,[]
2,Q14129,15461802,A genome annotation-driven approach to cloning...,We have developed a systematic approach to gen...,,A genome annotation-driven approach to cloning...,We have developed a systematic approach to gen...,A genome annotation-driven approach to cloning...,[]
3,Q59WV0,15123810,The diploid genome sequence of Candida albicans.,We present the diploid genome sequence of the ...,,The diploid genome sequence of Candida albicans.,We present the diploid genome sequence of the ...,The diploid genome sequence of Candida albican...,[]
4,O75534,16356927,The autoregulatory translational control eleme...,Repression of poly(A)-binding protein (PABP) m...,autoregulatory,The autoregulatory translational control eleme...,Repression of poly(A)-binding protein (PABP) m...,The autoregulatory translational control eleme...,[autoregulatory]


In [27]:
# Define the output directory
output_dir = os.path.join("data", "preprocessed")

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the shuffled dataset to a CSV file
final_data.to_csv(os.path.join(output_dir, "shuffled_data.csv"), index=False)

print("Shuffled data has been saved to:", os.path.join(output_dir, "shuffled_data.csv"))


Shuffled data has been saved to: data/preprocessed/shuffled_data.csv
