In [1]:
# This notebook prepares the setolabo samples as the test set.

# The training set combines the lung, colorectal, gastric and breast cancer datasets.

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Define a function to calculate median of the 6 closest non-NaN values
def fill_with_closest_median_or_zero(column):
    # Create a copy of the column to avoid modifying in place
    filled_column = column.copy()
    for idx in column[column.isna()].index:  # Find indices of NaN values
        # Get the 3 previous and 3 next valid (non-NaN) values
        window = column[max(0, idx - 3): idx + 4].dropna()
        if len(window) > 0:
            # Compute the median of up to 6 closest values
            filled_column[idx] = window.median()
        else:
            # Fill with 0 if no valid values are found
            filled_column[idx] = 0
    return filled_column

In [4]:
path = 'E:/MedBank/head_model/NGS/data_shared_by_Takeshi-san/'
df_lung = pd.read_excel(path+'Lung/Lung_PRJNA842759.xlsx')
df_colo = pd.read_csv(path+'Colorectal/Colorectal_SRP411850_PRJNA909776.csv')
df_gast = pd.read_excel(path+'gastric/PRJNA946800.xlsx')
df_breast = pd.read_csv(path+'breast/Breast_PRJNA934049_forR_edit.csv')
df_seto = pd.read_csv('E:/MedBank/head_model/NGS/data_shared_by_Takeshi-san/expression_count_test_20240807.csv')


common_miRNA = set(df_lung['miRNA']) & set(df_colo['miRNA']) & set(df_gast['miRNA']) & set(df_breast['miRNA']) & set(df_seto['gene_id'])

df_lung = df_lung[df_lung['miRNA'].isin(common_miRNA)].sort_values(by='miRNA')
df_colo = df_colo[df_colo['miRNA'].isin(common_miRNA)].sort_values(by='miRNA')
df_gast = df_gast[df_gast['miRNA'].isin(common_miRNA)].sort_values(by='miRNA')
df_gast = df_gast.drop_duplicates(subset='miRNA', keep='first')
df_breast = df_breast[df_breast['miRNA'].isin(common_miRNA)].sort_values(by='miRNA')
df_seto = df_seto[df_seto['gene_id'].isin(common_miRNA)].sort_values(by='gene_id')



# Combine all datasets column-wise while avoiding duplicate 'miRNA' columns
combined_df = pd.concat(
    [df_lung.set_index('miRNA'), 
     df_colo.set_index('miRNA'), 
     df_gast.set_index('miRNA'), 
     df_breast.set_index('miRNA')], 
    axis=1
).reset_index()

features = combined_df['miRNA']

labels_lung = [1 for _ in range(len(df_lung.columns)-1)] # All samples are cancerous (1 for lung cancer LC)

df_colo_labels = []
for col in df_colo.columns[1:]:
    if col.startswith('advanced'):
        df_colo_labels.append(2)  # Tumor only
    elif col.startswith('colorectal'):
        df_colo_labels.append(3)  # Colorectal cancer
    elif col.startswith('healthy'):
        df_colo_labels.append(4)  # Healthy control

df_gast_labels = []
for col in df_gast.columns[1:]:
    if col.startswith('case'):
        df_gast_labels.append(5)  # Gastric cancer
    elif col.startswith('control'):
        df_gast_labels.append(6)  # Healthy control

df_breast_labels = [0] * 27 + [7] * 9  # Last 9 columns are control # 0 for breast cancer

# Seto dataset labels (test set, all BC samples)
labels_seto = [0 for _ in range(5)]
labels_train = labels_lung + df_colo_labels + df_gast_labels + df_breast_labels





# Transpose the DataFrames and reset their index
combined_df = combined_df.T
df_seto = df_seto.T
combined_df.reset_index(drop=True, inplace=True)
df_seto.reset_index(drop=True, inplace=True)

# Update the column names after transposing
df_seto.columns = df_seto.iloc[0].tolist()
df_seto = df_seto[1:]
combined_df.columns = combined_df.iloc[0].tolist()
combined_df = combined_df[1:]

# Assign labels to the GRE and seto datasets
combined_df['labels'] = labels_train
df_seto['labels'] = labels_seto

# No train-test split, combined_df is the full training set and df_seto is the testing set
df_train = combined_df.iloc[:, :-1]  # Features of the full training set
df_train = df_train.apply(fill_with_closest_median_or_zero)
df_test = df_seto.iloc[:, :-1]  # Features of the full testing set

# Create labels_train and labels_test
labels_train = combined_df['labels'].tolist()
labels_test = df_seto['labels'].tolist()

In [6]:
# Define NGS version as a variable
folder_path = "E:/MedBank/head_model/NGS/data_shared_by_Takeshi-san/model_ready_folders/setolabo_samples_as_test/LCGB2"  # You can change this value as needed

# Paths for train and test feature vectors
file_path = f"{folder_path}/train/feature_vectors.csv"
df_train.to_csv(file_path, index=False, header=False)

file_path = f"{folder_path}/test/feature_vectors.csv"
df_test.to_csv(file_path, index=False, header=False)

##--------------------------------------------------------

# Paths for train and test labels
file_path = f"{folder_path}/train/labels.txt"
with open(file_path, 'w') as file:
    for label in labels_train:
        file.write(f"{label}\n")

file_path = f"{folder_path}/test/labels.txt"
with open(file_path, 'w') as file:
    for label in labels_test:
        file.write(f"{label}\n")

##--------------------------------------------------------

# Path for feature names
file_path = f"{folder_path}/feature_names.txt"
with open(file_path, 'w') as file:
    for label in features:
        file.write(f"{label}\n")