In [1]:

import sys

# Directory added to the import search path (presumably the project root that
# holds the `utils` package imported by the next cell — confirm).
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable.
PROJECT_ROOT = '/home/techt/Desktop/a4s'

# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Liver_cirrhosis/GSE78160'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes to pull out of the matrix file: series-level background
# fields first, then the per-sample clinical fields.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background text and the characteristics dictionary for inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Identification of a serum 48-lncRNA signature as diagnostic marker for hepatocellular carcinoma and liver cirrhosis"
!Series_summary	"In cancer management, early and accurate diagnosis of hepatocellular carcinoma (HCC) is important for enhancing survival rate of patients. Currently, serum alpha-fetoprotein (AFP) is the only one biomarker for detection of HCC. However, serum AFP is not satisfactory for diagnosis of HCC due to its low accuracy (about 60-70%). In this study, we collected 109 serum samples (discovery set) from healthy control (HC) and patients with chronic hepatitis B (CHB), liver cirrhosis (LC) and HCC, and analyzed them with custom lncRNA microarray. Profiling analysis shows 181 differentially expressed lncRNAs between HCs and patients with CHB, LC and HCC. Then a 48-lncRNA diagnostic signature was identified with 100% predictive accuracy for all subjects in the discovery set. This diagnostic signature was verified with a cross-vali

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the metadata says otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Row of the sample characteristics dictionary inspected for the platform text.
# NOTE(review): using row 0 is an assumption of this notebook, not something
# the matrix file guarantees — confirm against the printed dictionary.
gene_expression_data_key = 0

# Look the row up defensively: a missing key or an empty row leaves the flag
# False instead of raising KeyError / IndexError.
platform_values = sample_characteristics_dict.get(gene_expression_data_key)
is_gene_available = (
    platform_values is not None
    and len(platform_values) > 0
    and 'lncRNA microarray' in str(platform_values[0])
)

# Search for the rows holding 'Liver_cirrhosis', 'age', and 'gender'.
for key, values in sample_characteristics_dict.items():
    if len(values) == 0:
        # An empty row cannot identify any field.
        continue

    # Field name = text before the first ':' of the row's first value, split
    # into words. Matching 'age' as a whole word avoids the false positives of
    # a raw substring test (e.g. 'stage', 'disease stage', 'dosage').
    field_name = str(values[0]).split(':', 1)[0].strip().lower()
    field_words = field_name.replace('(', ' ').replace(')', ' ').split()

    if 'disease state: liver cirrhosis' in values:
        trait_row = key
    elif 'age' in field_words:
        age_row = key
    elif 'gender: male' in values or 'gender: female' in values:
        gender_row = key

# Define conversion functions
def convert_trait(value):
    """Map a sample-characteristics entry to a binary liver-cirrhosis label.

    Args:
        value: Raw entry such as 'disease state: liver cirrhosis'. Only the
            text after the last ':' is compared, case-insensitively.
            Non-string input (e.g. None or NaN for a missing cell) is treated
            as unknown instead of raising AttributeError.

    Returns:
        1 for 'liver cirrhosis', 0 for 'healthy control', None otherwise.
    """
    if not isinstance(value, str):
        return None
    label = value.split(':')[-1].strip().lower()
    if label == 'liver cirrhosis':
        return 1
    if label == 'healthy control':
        return 0
    return None

def convert_age(value):
    """Parse the numeric age from a sample-characteristics entry.

    Args:
        value: Raw entry such as 'age: 45'. The text after the last ':' is
            parsed as a float. Non-string input (e.g. None or NaN for a
            missing cell) is treated as unknown instead of raising
            AttributeError from `.split`.

    Returns:
        The age as a float, or None when the entry is not a string or its
        value part is not a valid number.
    """
    if not isinstance(value, str):
        return None
    try:
        return float(value.split(':')[-1].strip())
    except ValueError:
        return None

def convert_gender(value):
    """Map a sample-characteristics entry to a binary gender code.

    Args:
        value: Raw entry such as 'gender: male'. Only the text after the last
            ':' is compared, case-insensitively. Non-string input (e.g. None
            or NaN for a missing cell) is treated as unknown instead of
            raising AttributeError.

    Returns:
        0 for 'female', 1 for 'male', None otherwise.
    """
    if not isinstance(value, str):
        return None
    label = value.split(':')[-1].strip().lower()
    if label == 'female':
        return 0
    if label == 'male':
        return 1
    return None

import os

# Save cohort information: cohort id, output JSON path, and the two
# availability flags (gene data, trait data).
save_cohort_info('GSE78160', './preprocessed/Liver_cirrhosis/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction — only possible when a trait row was found.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Liver_cirrhosis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Liver_cirrhosis/trait_data/GSE78160.csv'
    # Create the output directory first; writing the CSV would otherwise fail
    # on a fresh checkout where `trait_data/` does not exist yet.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2068378': [0], 'GSM2068379': [0], 'GSM2068380': [0], 'GSM2068381': [0], 'GSM2068382': [0], 'GSM2068383': [0], 'GSM2068384': [0], 'GSM2068385': [0], 'GSM2068386': [0], 'GSM2068387': [0], 'GSM2068388': [0], 'GSM2068389': [0], 'GSM2068390': [0], 'GSM2068391': [0], 'GSM2068392': [0], 'GSM2068393': [0], 'GSM2068394': [0], 'GSM2068395': [0], 'GSM2068396': [0], 'GSM2068397': [0], 'GSM2068398': [0], 'GSM2068399': [0], 'GSM2068400': [0], 'GSM2068401': [None], 'GSM2068402': [None], 'GSM2068403': [None], 'GSM2068404': [None], 'GSM2068405': [None], 'GSM2068406': [None], 'GSM2068407': [None], 'GSM2068408': [None], 'GSM2068409': [None], 'GSM2068410': [None], 'GSM2068411': [None], 'GSM2068412': [None], 'GSM2068413': [None], 'GSM2068414': [None], 'GSM2068415': [None], 'GSM2068416': [None], 'GSM2068417': [None], 'GSM2068418': [None], 'GSM2068419': [None], 'GSM2068420': [None], 'GSM2068421': [None], 'GSM2068422': [None], 'GSM2068423': [None], 'GSM2068424': [None], 'GSM2068425': [None], 'GSM2068426'