In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Eczema/GSE6281'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the matrix-file reader: the first list names the
# series-level background lines, the second the per-sample lines.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of sample characteristics
# (presumably row index -> unique values; confirm against the helper).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces in full so the next step can be decided from the output;
# each label is followed by its value on a new line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression time-course in the human skin during elicitation of allergic contact dermatitis"
!Series_summary	"Genes involved in the inflammatory response resulting in allergic contact dermatitis (ACD) are only partly known. In this study, we introduce the use of high density oligonucleotide arrays for gene expression profiling in human skin during the elicitation of ACD. Skin biopsies from normal and nickel-exposed skin were obtained from 7 nickel-allergic patients and 5 non-allergic controls at four different time points during elicitation of eczema: 0h, 7h, 48h and 96h. Each gene expression profile was analysed by hybridization to high density oligonucletide arrays."
!Series_summary	"Cluster analysis of 74 genes found to be differentially expressed in the patients over time revealed that the patient samples may be categorised into two groups: An early time point group (0h and 7h) and a late time point group (48h and 96h). Compared to the ear

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults for this cohort. A *_row stays None unless the matching variable is
# found in the sample characteristics below.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
# All three converters start as None. (The original chain assigned convert_age
# twice and never initialised convert_gender.)
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability based on the background information:
# the series describes gene expression profiling on high density oligonucleotide arrays.
is_gene_available = True

# Analyze the Sample Characteristics Dictionary for variable availability.
# Row 0 holds one free-text description per sample group.
sample_characteristics_dict = {
    0: [
        'Female (age range 33-49), skin biopsy from upper nates taken 7 hours after nickel exposure',
        'Female (age range 33-49), skin biopsy from upper nates taken 48 hours after nickel exposure',
        'Female (age range 33-49), skin biopsy taken from upper nates 96 hours after nickel exposure',
        'Female (age range 33-49), skin biopsy upper nates. No nickel exposure',
        'Female (age range 33-49), skin biopsy taken from upper nates 7 hours after nickel exposure',
        'Female (age range 33-49), skin biopsy taken from upper nates 48 hours after nickel exposure',
        'Female (age range 33-49), skin biopsy taken from upper nates no nickel exposure',
        'Female (age range 33-49), skin biopsy from upper nates. No nickel exposure',
        'Nickel allergic female (age range 33-49), skin biopsy from upper nates taken 48 hours after nickel exposure',
        'Nickel allergic female (age range 33-49), skin biopsy from upper nates taken 96 hours after nickel exposure',
        'Nickel allergic female (age range 33-49), skin biopsy from upper nates. No nickel exposure',
        'Nickel allergic female (age range 33-49), skin biopsy from upper nates taken 7 hours after nickel exposure',
        'Nickel allergic female (age range 33-49), skin biopsy from upper nates. No nickel exposure.'
    ]
}

# Lower-case the row once so every availability test below is case-insensitive.
row0_values_lower = [val.lower() for val in sample_characteristics_dict[0]]

# Checking availability for 'Eczema'
# NOTE(review): none of the values above contains the word 'eczema', so
# trait_row stays None for this cohort. The 'Nickel allergic' prefix looks like
# it separates patients from controls -- confirm whether it should define the trait.
if any('eczema' in val for val in row0_values_lower):
    trait_row = 0

# Checking availability for 'age'
if any('age range' in val for val in row0_values_lower):
    age_row = 0

# Checking availability for 'gender'
# 'male' is a substring of 'female', so this single test covers both words.
if any('male' in val for val in row0_values_lower):
    gender_row = 0

# Defining conversion functions
def convert_trait(value):
    """Map a sample-characteristics string to a binary trait label.

    Only the text after the last ':' is inspected (the whole string when it
    has no ':'), compared case-insensitively.

    Returns:
        1 if that text contains 'eczema', otherwise 0 -- a sample that does
        not mention 'eczema' is assumed to be non-eczema.
    """
    tail = value.rpartition(':')[2].lower()
    return 1 if 'eczema' in tail else 0

def convert_age(value):
    """Map a sample-characteristics string to a single age value.

    Only the text after the last ':' is inspected (the whole string when it
    has no ':'). The only case handled is the 33-49 age range.

    Returns:
        41 (a simple average of the 33-49 range) when both '33' and '49'
        occur in that text, otherwise None. Adjust this if exact ages become
        available.
    """
    tail = value.rpartition(':')[2]
    range_bounds = ('33', '49')
    if all(bound in tail for bound in range_bounds):
        return 41
    return None

def convert_gender(value):
    """Map a sample-characteristics string to a binary gender code.

    Only the text after the last ':' is inspected (the whole string when it
    has no ':'), compared case-insensitively.

    Returns:
        0 if that text contains 'female', 1 if it contains 'male', and None
        when neither word is present.
    """
    tail = value.rpartition(':')[2].lower()
    # 'female' has to be tried first because it contains 'male'.
    for keyword, code in (('female', 0), ('male', 1)):
        if keyword in tail:
            return code
    return None

# Record the cohort's status: gene data availability and whether a trait row was found.
is_trait_available = trait_row is not None
save_cohort_info(
    'GSE6281',
    './preprocessed/Eczema/cohort_info.json',
    is_gene_available,
    is_trait_available,
)

# Clinical feature extraction only runs when a trait row was identified.
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Eczema',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Save the selected features as CSV, then print a preview of them.
    csv_path = './preprocessed/Eczema/trait_data/GSE6281.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
