In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Colon_and_Rectal_Cancer/GSE115513'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"miRNA and colorectal cancer:  associations with tumor phenotype and survival"
!Series_summary	"MicroRNAs (miRNA) are a class of small regulatory RNAs that mediate post-transcriptional silencing of specific target mRNAs.  Data suggest the importance of miRNAs to cancer development and possibly to survival.  Our overall hypothesis is that miRNA expression is unique to tumor molecular phenotype; that miRNA expression levels at time of diagnosis predicts survival; and that miRNA expression is associated with inflammation-related genetic and lifestyle factors key to colorectal cancer (CRC). This study takes a two pronged approach to addressing our hypotheses.  While we propose to validate previously identified miRNAs that have been identified as associated with CRC (either by differential expression or from assessment of mutations), we will add to the field through discovery of new and important associations that may be unique to specific molecular phe

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# 1. Gene Expression Data Availability
is_gene_available = False  # Correct based on dataset content

# 2.1 Data Availability

# Exploring possible inferences for the Colon_and_Rectal_Cancer trait based on tissue types
if any(['tissue: Carcinoma' in values or 'tissue: Adenoma' in values for values in sample_characteristics_dict.values()]):
    trait_row = 2
else:
    trait_row = None

# Age proper availability identified as key 3 with 'age_at_diagnosis'
age_row = 3  

# Gender proper availability identified as key 4 with 'Sex'
gender_row = 4  

# 2.3 Data Type Conversion

# Convert trait (presence of cancer or not, Infer Carcinoma is '1', others '0')
def convert_trait(value):
    tissue_type = value.split(': ')[1].lower()
    return 1 if 'carcinoma' in tissue_type else 0 if 'adenoma' in tissue_type else None

# Convert age to continuous value
def convert_age(value):
    try:
        return int(value.split(': ')[1][:-1])  # Extracts the number before 'y'
    except:
        return None

# Convert gender to binary (0 for female, 1 for male)
def convert_gender(value):
    gender_str = value.split(': ')[1]
    return 1 if gender_str.lower() == 'male' else 0 if gender_str.lower() == 'female' else None

# 3. Save Metadata
save_cohort_info('GSE115513', './preprocessed/Colon_and_Rectal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Colon_and_Rectal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Colon_and_Rectal_Cancer/trait_data/GSE115513.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3179852': [1, 59, 1], 'GSM3179853': [None, 56, 1], 'GSM3179854': [1, 56, 1], 'GSM3179855': [1, 69, 0], 'GSM3179856': [None, 73, 0], 'GSM3179857': [1, 73, 0], 'GSM3179858': [1, 76, 0], 'GSM3179859': [None, 61, 0], 'GSM3179860': [1, 61, 0], 'GSM3179861': [None, 79, 1], 'GSM3179862': [1, 79, 1], 'GSM3179863': [None, 77, 1], 'GSM3179864': [1, 77, 1], 'GSM3179865': [None, 70, 0], 'GSM3179866': [1, 70, 0], 'GSM3179867': [1, 67, 0], 'GSM3179868': [None, 37, 1], 'GSM3179869': [1, 37, 1], 'GSM3179870': [None, 69, 1], 'GSM3179871': [1, 69, 1], 'GSM3179872': [None, 79, 0], 'GSM3179873': [1, 79, 0], 'GSM3179874': [None, 74, 0], 'GSM3179875': [1, 74, 0], 'GSM3179876': [None, 69, 0], 'GSM3179877': [1, 69, 0], 'GSM3179878': [None, 55, 0], 'GSM3179879': [1, 55, 0], 'GSM3179880': [None, 67, 1], 'GSM3179881': [1, 67, 1], 'GSM3179882': [1, 65, 0], 'GSM3179883': [None, 66, 1], 'GSM3179884': [1, 66, 1], 'GSM3179885': [1, 71, 1], 'GSM3179886': [1, 69, 0], 'GSM3179887': [None, 72, 1], 'GSM3179888': [1, 