In [1]:

import sys

# Make the project's `utils` package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable project root so the notebook runs on other machines.
project_root = '/home/techt/Desktop/a4s'

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to the module search path.
if project_root not in sys.path:
    sys.path.append(project_root)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Psoriasis/GSE183134'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes to pull from the matrix file: series-level background fields,
# and the per-sample accession / characteristics rows.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each heading followed by its content on the next line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptomic profiling of Pityriasis Rubra Pilaris (PRP) and Psoriasis"
!Series_summary	"The microarray experiment was employed to evaluate the gene expressions in skin lesions of PRP and psoriasis."
!Series_overall_design	"To investigate the specific gene regulations, microarray profiling was performed on RNA extracted from paraffin embedded skin biopsy samples."
Sample Characteristics Dictionary:
{0: ['tissue: Skin'], 1: ['disease state: Pityriasis_Rubra_Pilaris', 'disease state: Psoriasis']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression data availability.
# The series title/summary/design printed in Step 1 describe microarray
# profiling of RNA from skin biopsies, so expression data is marked available.
is_gene_available = True

# Clinical variable availability: a row key of None marks the variable as
# unavailable. Only `trait_row` can be changed by the search below.
trait_row = age_row = gender_row = None

# Find the characteristics row that records the disease state. A row only
# qualifies when it lists 'Psoriasis' together with at least one other state,
# i.e. when the trait actually varies across samples.
# `sample_characteristics_dict` is the dictionary built in Step 1; it is read
# directly rather than re-typed as a literal so this cell cannot drift from
# the data that was actually loaded.
for key, values in sample_characteristics_dict.items():
    disease_states = {
        value.split(": ", 1)[1]
        for value in values
        # Skip non-string cells (e.g. missing values) and entries without a
        # "label: value" separator instead of raising on them.
        if isinstance(value, str) and 'disease state' in value and ": " in value
    }
    if len(disease_states) > 1 and 'Psoriasis' in disease_states:
        trait_row = key
        break

# The dictionary printed in Step 1 has no age or gender entries, so `age_row`
# and `gender_row` intentionally stay None.

# Defining the conversion functions
def convert_trait(value):
    """Convert a raw 'disease state' characteristic into a binary trait label.

    Parameters
    ----------
    value : object
        A raw characteristics cell, expected to look like
        'disease state: Psoriasis'.

    Returns
    -------
    int or None
        1 for 'Psoriasis', 0 for 'Pityriasis_Rubra_Pilaris', and None for
        anything else (non-string or missing cell, no ': ' separator, or an
        unrecognised label).
    """
    # Missing cells can arrive as None or float NaN; calling string methods on
    # them would raise, so treat every non-string as "unknown".
    if not isinstance(value, str) or ": " not in value:
        return None

    # Split only on the first separator and ignore stray surrounding spaces.
    label = value.split(": ", 1)[1].strip()
    if label == 'Psoriasis':
        return 1
    if label == 'Pityriasis_Rubra_Pilaris':
        return 0
    return None
  
# `age_row` and `gender_row` are None for this cohort, so both converters are
# placeholders that map every input to None.
def convert_age(value):
    """Placeholder age converter: always returns None."""
    return None


def convert_gender(value):
    """Placeholder gender converter: always returns None."""
    return None

def save_cohort_info(series_id, filepath, gene_data_available, clinical_data_available):
    """Write a small JSON record describing the usability of one cohort.

    Parameters
    ----------
    series_id : str
        Series accession, e.g. 'GSE183134'.
    filepath : str
        Destination of the JSON file. The path itself is also stored in the
        record under the 'filepath' key.
    gene_data_available : bool
        Whether gene expression data is available for the cohort.
    clinical_data_available : bool
        Whether the trait could be located in the clinical data.

    Notes
    -----
    The file is opened in 'w' mode, so an existing file at `filepath` is
    overwritten. Missing parent directories are created first.

    NOTE(review): `from utils.preprocess import *` may already export a
    function with this name, which this definition would shadow; confirm.
    """
    import json
    import os

    cohort_info = {
        'series_id': series_id,
        'filepath': filepath,
        'gene_data_available': gene_data_available,
        'clinical_data_available': clinical_data_available
    }

    # Opening a file for writing does not create intermediate directories, so
    # make sure the parent exists. A bare filename has no parent to create.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(cohort_info, file)

import os

from utils.preprocess import geo_select_clinical_features, preview_df

# Record whether this cohort has gene data and a usable trait row.
# The output directory is created up front so the write does not depend on a
# pre-existing folder layout.
cohort_info_path = './preprocessed/Psoriasis/cohort_info.json'
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE183134', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical features are only extracted when a trait row was found.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Psoriasis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Psoriasis/trait_data/GSE183134.csv'
    # `to_csv` does not create missing directories, so ensure the target exists.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM5551681': [0], 'GSM5551682': [0], 'GSM5551683': [0], 'GSM5551684': [0], 'GSM5551685': [0], 'GSM5551686': [0], 'GSM5551687': [0], 'GSM5551688': [0], 'GSM5551689': [0], 'GSM5551690': [0], 'GSM5551691': [0], 'GSM5551692': [0], 'GSM5551693': [0], 'GSM5551694': [1], 'GSM5551695': [1], 'GSM5551696': [1], 'GSM5551697': [1], 'GSM5551698': [1], 'GSM5551699': [1], 'GSM5551700': [1], 'GSM5551701': [1], 'GSM5551702': [1], 'GSM5551703': [1], 'GSM5551704': [1], 'GSM5551705': [1], 'GSM5551706': [1], 'GSM5551707': [1], 'GSM5551708': [1], 'GSM5551709': [1], 'GSM5551710': [1], 'GSM5551711': [1], 'GSM5551712': [1], 'GSM5551713': [1], 'GSM5551714': [1], 'GSM5551715': [1]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene expression data from the series-matrix file located in
#    Step 1 (`matrix_file`), using the library's `get_genetic_data`.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs; they are inspected in the next step to judge
#    what kind of identifier indexes the expression data.
# NOTE(review): the recorded output lists IDs such as '1-Dec', '1-Sep' and
# '10-Mar'. These look like gene symbols that were turned into dates by a
# spreadsheet before upload - confirm, and decide whether they need repair.
print(gene_data.index[:20])


Index(['1-Dec', '1-Sep', '10-Mar', '10-Sep', '11-Mar', '11-Sep', '12-Sep',
       '14-Sep', '15-Sep', '2-Sep', '3-Mar', '3-Sep', '4-Mar', '4-Sep',
       '5-Mar', '6-Mar', '6-Sep', '7-Mar', '7-Sep', '8-Mar'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Outcome of the identifier review: True means the row IDs are treated as
# needing a mapping to gene symbols. No visible cell reads this flag;
# presumably it gates the conditional annotation/mapping steps - confirm.
# NOTE(review): the recorded annotation preview shows 'ID' values that are
# already gene-symbol-like ('DDX11L1', 'OR4F5', ...) and identical to
# 'SPOT_ID', so the mapping applied later may be an identity mapping. Verify
# whether True is the intended value.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the gene annotation data from the SOFT file located in Step 1
#    (`soft_file`), using the library's `get_gene_annotation`.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation so the identifier and gene-symbol
#    columns can be chosen in the mapping step.
# NOTE(review): the recorded preview shows only two columns, 'ID' and
# 'SPOT_ID', holding the same first five values.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['DDX11L1', 'MIR1302-2', 'OR4F5', 'LOC100132287', 'LOC105379690'], 'SPOT_ID': ['DDX11L1', 'MIR1302-2', 'OR4F5', 'LOC100132287', 'LOC105379690']}


### Step 6: Gene Identifier Mapping

In [7]:
# Column roles in the annotation table, chosen from the preview printed in the
# previous step: 'ID' is used as the row identifier and 'SPOT_ID' as the gene
# symbol.
identifier_key, gene_symbol_key = 'ID', 'SPOT_ID'

# Build the dataframe that maps row identifiers to genes.
gene_mapping = get_gene_mapping(
    gene_annotation,
    identifier_key,
    gene_symbol_key,
)

# Apply the mapping to the expression data. `gene_data` is rebound here, so
# re-running this cell feeds the already-mapped data back in.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
