In [1]:

import os
import sys
# Extend the module search path with a project directory so that project-local
# packages can be imported (presumably this is where `utils.preprocess`, imported
# in the next cell, lives -- TODO confirm).
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# find the package on another machine unless the same location exists.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file that belong to this cohort
cohort_dir = '/media/techt/DATA/GEO/Mesothelioma/GSE112154'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Extract series-level background text and per-sample characteristics from the
#    matrix file, selecting lines by their leading field name
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show everything that was extracted, each item under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiling of diffuse malignant peritoneal mesothelioma"
!Series_summary	"Diffuse malignant peritoneal mesothelioma (DMPM) is a rapidly lethal malignancy. The comprehension of the molecular features of DMPM is of utmost importance for the fruitful management of the disease, especially in patients who fail standard treatments and have a poor prognosis due to the lack of effective alternative therapeutic options."
!Series_overall_design	"Gene expression profiling was carried out on a series of 45 frozen surgical specimens of diffuse malignant peritoneal mesothelioma (DMPM), 3 normal peritoneum samples and 2 patient-derived cell lines."
Sample Characteristics Dictionary:
{0: ['sample type: normal peritoneum', 'sample type: DMPM cell line', 'sample type: DMPM frozen tumor specimen']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Determine gene expression data availability.
# The phrase "gene expression profiling" occurs in the series title and in the
# overall design, but NOT in the summary. Checking the summary alone therefore
# left is_gene_available at False, so all three background fields are searched.
series_title = "Gene expression profiling of diffuse malignant peritoneal mesothelioma"
series_summary = "Diffuse malignant peritoneal mesothelioma (DMPM) is a rapidly lethal malignancy. The comprehension of the molecular features of DMPM is of utmost importance for the fruitful management of the disease, especially in patients who fail standard treatments and have a poor prognosis due to the lack of effective alternative therapeutic options."
series_overall_design = "Gene expression profiling was carried out on a series of 45 frozen surgical specimens of diffuse malignant peritoneal mesothelioma (DMPM), 3 normal peritoneum samples and 2 patient-derived cell lines."
series_background_text = " ".join([series_title, series_summary, series_overall_design]).lower()
if "gene expression profiling" in series_background_text:
    is_gene_available = True

# Define the sample characteristics dictionary (copied from the printed output of the loading step)
sample_char_dict = {0: ['sample type: normal peritoneum', 'sample type: DMPM cell line', 'sample type: DMPM frozen tumor specimen']}

# Variable Availability
# Mesothelioma trait: row 0 ('sample type') separates DMPM samples from normal peritoneum
if any("DMPM" in val for val in sample_char_dict[0]):
    trait_row = 0

# Age
# No explicit information given for 'age' in the sample characteristics. Assuming data is not available.
age_row = None

# Gender
# No explicit information given for 'gender' in the sample characteristics. Assuming data is not available.
gender_row = None

# Data Type Conversion Functions

# Convert trait (Mesothelioma) to binary
def convert_trait(value):
    """Convert a 'sample type: ...' characteristic into a binary Mesothelioma label.

    Args:
        value: Raw characteristic cell, expected to look like
            'sample type: <label>'. A bare '<label>' without the key prefix
            is accepted as well.

    Returns:
        1 for 'DMPM frozen tumor specimen', 0 for 'normal peritoneum', and
        None for anything else (including 'DMPM cell line' and missing or
        non-string cells).
    """
    # Missing cells may arrive as None or NaN; they carry no label.
    if not isinstance(value, str):
        return None
    # Split on the first colon only, so a label that itself contains ':' is
    # kept whole, and a value without any colon no longer raises IndexError.
    label = value.split(":", 1)[-1].strip()
    if label == 'DMPM frozen tumor specimen':
        return 1
    if label == 'normal peritoneum':
        return 0
    return None

# Age and gender are not available for this cohort, so neither gets a converter.
convert_age = convert_gender = None

# Save Metadata
def save_cohort_info(series_id, file_path, is_gene_available, is_trait_available):
    """Write a cohort's availability metadata to a JSON file.

    The file at ``file_path`` is opened in write mode, so any existing content
    is replaced by a single JSON object describing this cohort.

    Args:
        series_id: Identifier of the series (e.g. 'GSE112154').
        file_path: Destination of the JSON file; also stored in the record.
        is_gene_available: Whether gene expression data is available.
        is_trait_available: Whether the trait variable is available.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    import json
    import os

    cohort_info = {
        'series_id': series_id,
        'file_path': file_path,
        'is_gene_available': is_gene_available,
        'is_trait_available': is_trait_available
    }
    # open() does not create missing directories, so make sure the parent exists.
    # A bare file name has an empty dirname, which makedirs would reject.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(cohort_info, f)

# Record whether this cohort offers gene expression data and the trait variable
save_cohort_info('GSE112154', './preprocessed/Mesothelioma/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (only possible when a trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Mesothelioma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Mesothelioma/trait_data/GSE112154.csv'
    # The CSV writer is not expected to create missing folders (pandas' to_csv
    # raises OSError for a non-existent directory), so create the target first.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3058890': [0], 'GSM3058891': [0], 'GSM3058892': [0], 'GSM3058893': [None], 'GSM3058894': [None], 'GSM3058895': [1], 'GSM3058896': [1], 'GSM3058897': [1], 'GSM3058898': [1], 'GSM3058899': [1], 'GSM3058900': [1], 'GSM3058901': [1], 'GSM3058902': [1], 'GSM3058903': [1], 'GSM3058904': [1], 'GSM3058905': [1], 'GSM3058906': [1], 'GSM3058907': [1], 'GSM3058908': [1], 'GSM3058909': [1], 'GSM3058910': [1], 'GSM3058911': [1], 'GSM3058912': [1], 'GSM3058913': [1], 'GSM3058914': [1], 'GSM3058915': [1], 'GSM3058916': [1], 'GSM3058917': [1], 'GSM3058918': [1], 'GSM3058919': [1], 'GSM3058920': [1], 'GSM3058921': [1], 'GSM3058922': [1], 'GSM3058923': [1], 'GSM3058924': [1], 'GSM3058925': [1], 'GSM3058926': [1], 'GSM3058927': [1], 'GSM3058928': [1], 'GSM3058929': [1], 'GSM3058930': [1], 'GSM3058931': [1], 'GSM3058932': [1], 'GSM3058933': [1], 'GSM3058934': [1], 'GSM3058935': [1], 'GSM3058936': [1], 'GSM3058937': [1], 'GSM3058938': [1], 'GSM3058939': [1]}
