In [1]:

import os
import sys
# Make the directory below importable so that `utils.preprocess` (imported in
# the next cell) can be resolved — presumably the package lives here; confirm.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
A4S_ROOT = '/home/techt/Desktop/a4s'
if A4S_ROOT not in sys.path:
    sys.path.append(A4S_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Psoriatic_Arthritis/GSE61281'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes to pull out of the matrix file: series-level background text
# on one side, per-sample accession / characteristics lines on the other.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical table as {row index: distinct values}; this is what
# the row choices in Step 2 are read from.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Human Whole Blood: Psoriatic Arthritis [PsA] vs. Cutaneous Psoriasis Without Arthritis [PsC] vs. Controls"
!Series_summary	"Transcriptional profiling of human whole blood comparing PsA, PsC, and unaffected controls"
!Series_overall_design	"Three condition experiment: PsA, PsC, unaffected controls. Biological replicates: 20 PsA, 20 PsC, 12 controls"
Sample Characteristics Dictionary:
{0: ['tissue: whole blood'], 1: ['condition: Psoriatic arthritis', 'condition: Cutaneous psoriasis without arthritis', 'condition: Unaffected control'], 2: ['gender: Female', 'gender: Male'], 3: ['batch: 4', 'batch: 3', 'batch: 2', 'batch: 1'], 4: ['psoriasis duration: 48.0', 'psoriasis duration: 37.0', 'psoriasis duration: 22.0', 'psoriasis duration: 13.0', 'psoriasis duration: 33.0', 'psoriasis duration: 18.0', 'psoriasis duration: 38.0', 'psoriasis duration: 24.0', 'psoriasis duration: 19.0', 'psoriasis duration: 28.0', 'psoriasis duration: 20.0', 'psoriasis duratio

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Decisions taken from the Step 1 printout.
is_gene_available = True

# Indices into the sample-characteristics table: row 1 holds 'condition: ...'
# and row 2 holds 'gender: ...'. Row 5 is not visible in the truncated Step 1
# printout; per convert_age below it is the age of psoriasis onset.
# NOTE(review): onset age is not the subject's age, and the bias report in
# Step 7 shows 40 samples (20 cases = 50%) while the study design lists 52 —
# presumably samples without an onset value are lost downstream. Confirm
# whether a true age row exists or whether age_row should be None.
trait_row, age_row, gender_row = 1, 5, 2

def convert_trait(value):
    """Encode a 'condition: <label>' characteristic as a binary PsA indicator.

    Args:
        value: One cell of the sample-characteristics table, expected to look
            like 'condition: Psoriatic arthritis'.

    Returns:
        1 for 'Psoriatic arthritis'; 0 for 'Cutaneous psoriasis without
        arthritis' and for 'Unaffected control' (both are treated as non-PsA);
        None for any other label. None is also returned for non-string cells
        (e.g. NaN) and for strings without a ':' separator, which previously
        raised AttributeError / IndexError.
    """
    if not isinstance(value, str) or ':' not in value:
        return None
    # Split only on the first colon so the label itself is kept whole.
    label = value.split(':', 1)[1].strip().lower()
    if label == 'psoriatic arthritis':
        return 1
    if label in ('cutaneous psoriasis without arthritis', 'unaffected control'):
        return 0
    return None

def convert_age(value):
    """Parse the numeric part of a '<field>: <number>' characteristic.

    The row fed to this converter is the age of psoriasis onset, which the
    notebook uses as a proxy for age.

    Args:
        value: One cell of the sample-characteristics table.

    Returns:
        The parsed number — an int when it is whole (so '19' and '19.0' both
        give 19), otherwise a float. None is returned for non-string cells
        (e.g. NaN), strings without a ':' separator, non-numeric text, and
        negative or non-finite numbers.
    """
    import math

    if not isinstance(value, str) or ':' not in value:
        return None
    text = value.split(':', 1)[1].strip()
    # float() rather than str.isdigit(): isdigit() rejects decimals such as
    # '19.0' and accepts characters like superscript digits that int() cannot
    # parse, which made the original raise ValueError.
    try:
        age = float(text)
    except ValueError:
        return None
    if not math.isfinite(age) or age < 0:
        return None
    return int(age) if age.is_integer() else age

def convert_gender(value):
    """Encode a 'gender: <label>' characteristic as 0 (female) or 1 (male).

    Args:
        value: One cell of the sample-characteristics table, expected to look
            like 'gender: Female'.

    Returns:
        0 for 'female', 1 for 'male' (case-insensitive), None for any other
        label. None is also returned for non-string cells (e.g. NaN) and for
        strings without a ':' separator, which previously raised
        AttributeError / IndexError.
    """
    if not isinstance(value, str) or ':' not in value:
        return None
    label = value.split(':', 1)[1].strip().lower()
    return {'female': 0, 'male': 1}.get(label)

# Record the gene-data flag and whether a trait row was identified.
save_cohort_info('GSE61281', './preprocessed/Psoriatic_Arthritis/cohort_info.json', is_gene_available, trait_row is not None)

# Pull the trait / age / gender rows out of the clinical table and encode
# them with the converters defined above.
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Psoriatic_Arthritis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)

# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
csv_path = './preprocessed/Psoriatic_Arthritis/trait_data/GSE61281.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM1501512': [1, 19, 0], 'GSM1501513': [1, 11, 0], 'GSM1501514': [1, 23, 1], 'GSM1501515': [1, 31, 1], 'GSM1501516': [1, 26, 1], 'GSM1501517': [1, 29, 1], 'GSM1501518': [1, 7, 1], 'GSM1501519': [1, 31, 1], 'GSM1501520': [1, 30, 1], 'GSM1501521': [1, 17, 1], 'GSM1501522': [1, 13, 0], 'GSM1501523': [1, 19, 0], 'GSM1501524': [1, 11, 0], 'GSM1501525': [1, 69, 0], 'GSM1501526': [1, 32, 0], 'GSM1501527': [1, 24, 0], 'GSM1501528': [1, 41, 0], 'GSM1501529': [1, 25, 1], 'GSM1501530': [1, 23, 0], 'GSM1501531': [1, 24, 0], 'GSM1501532': [0, 18, 1], 'GSM1501533': [0, 21, 1], 'GSM1501534': [0, 26, 1], 'GSM1501535': [0, 39, 1], 'GSM1501536': [0, 38, 0], 'GSM1501537': [0, 37, 1], 'GSM1501538': [0, 20, 1], 'GSM1501539': [0, 8, 1], 'GSM1501540': [0, 30, 1], 'GSM1501541': [0, 25, 1], 'GSM1501542': [0, 8, 0], 'GSM1501543': [0, 30, 1], 'GSM1501544': [0, 47, 0], 'GSM1501545': [0, 25, 0], 'GSM1501546': [0, 7, 0], 'GSM1501547': [0, 33, 0], 'GSM1501548': [0, 16, 0], 'GSM1501549': [0, 15, 0], 'GSM1501550': [

### Step 3: Gene Data Extraction

In [4]:
# Load the expression table from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; Step 4 decides from these whether the
# rows are already gene symbols.
leading_row_ids = gene_data.index[:20]
print(leading_row_ids)


Index(['(+)E1A_r60_1', '(+)E1A_r60_3', '(+)E1A_r60_a104', '(+)E1A_r60_a107',
       '(+)E1A_r60_a135', '(+)E1A_r60_a20', '(+)E1A_r60_a22', '(+)E1A_r60_a97',
       '(+)E1A_r60_n11', '(+)E1A_r60_n9', '(+)eQC-39', '(+)eQC-40',
       '(+)eQC-41', '(+)eQC-42', '(-)3xSLv1', 'A_23_P100001', 'A_23_P100011',
       'A_23_P100022', 'A_23_P100056', 'A_23_P100074'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. 'A_23_P100001', '(+)E1A_r60_1') are not
# gene symbols, so the expression rows have to be mapped to symbols
# (done in Step 6 via the annotation table).
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the annotation table from the SOFT file located in Step 1.
gene_annotation = get_gene_annotation(soft_file)

# Print a short preview under a heading line, so the identifier column and
# the gene-symbol column can be picked in Step 6.
print("Gene annotation preview:", preview_df(gene_annotation), sep="\n")


Gene annotation preview:
{'ID': ['A_23_P100001', 'A_23_P100011', 'A_23_P100022', 'A_23_P100056', 'A_23_P100074'], 'SPOT_ID': ['A_23_P100001', 'A_23_P100011', 'A_23_P100022', 'A_23_P100056', 'A_23_P100074'], 'CONTROL_TYPE': ['FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE'], 'REFSEQ': ['NM_207446', 'NM_005829', 'NM_014848', 'NM_194272', 'NM_020371'], 'GB_ACC': ['NM_207446', 'NM_005829', 'NM_014848', 'NM_194272', 'NM_020371'], 'GENE': [400451.0, 10239.0, 9899.0, 348093.0, 57099.0], 'GENE_SYMBOL': ['FAM174B', 'AP3S2', 'SV2B', 'RBPMS2', 'AVEN'], 'GENE_NAME': ['family with sequence similarity 174, member B', 'adaptor-related protein complex 3, sigma 2 subunit', 'synaptic vesicle glycoprotein 2B', 'RNA binding protein with multiple splicing 2', 'apoptosis, caspase activation inhibitor'], 'UNIGENE_ID': ['Hs.27373', 'Hs.632161', 'Hs.21754', 'Hs.436518', 'Hs.555966'], 'ENSEMBL_ID': ['ENST00000557398', nan, 'ENST00000557410', 'ENST00000300069', 'ENST00000306730'], 'TIGR_ID': [nan, nan, nan, nan, nan]

### Step 6: Gene Identifier Mapping

In [7]:
# Columns chosen from the Step 5 preview: 'ID' carries the same identifiers as
# the expression rows, 'GENE_SYMBOL' carries the gene symbols.
identifier_key, gene_symbol_key = 'ID', 'GENE_SYMBOL'

# Build the identifier -> gene symbol mapping table from the annotation.
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# Apply the mapping to the expression table.
# NOTE: gene_data is rebound in place, so this cell is not idempotent —
# re-running it alone would feed already-mapped data back into the mapping.
# Re-run from Step 3 instead.
gene_data = apply_gene_mapping(gene_data, gene_mapping)

print(preview_df(gene_data))


{'GSM1501512': [1.269359674, 2.376352078, 0.370473158, -2.317282132, 1.520617362], 'GSM1501513': [0.874468831, 0.5559684970000001, 0.074938808, -1.9751781115, 0.907585678], 'GSM1501514': [1.385670799, 1.509833248, -0.167974952, -3.632802177, 1.44359391], 'GSM1501515': [4.283748375, 1.4808397535000002, 0.438281323, -4.6675092165, -0.831596675], 'GSM1501516': [2.912166361, 2.5590398629999997, 0.646473474, -3.1071575185, 0.74488581], 'GSM1501517': [1.642720016, 2.86788172, 0.996309187, -1.6395385574999999, 0.106603851], 'GSM1501518': [1.608309492, 2.416803509, -0.270248054, -3.063985959, 1.703823795], 'GSM1501519': [1.549035356, 1.4886526875000001, 0.878870966, -2.4570891885, 0.108064714], 'GSM1501520': [1.131208168, 0.842587145, 0.041479818, -1.4147756705, 1.377474878], 'GSM1501521': [3.049450184, 3.4551762985, 0.660960738, -3.2187859669999996, 0.482710758], 'GSM1501522': [0.213184907, 1.9485496625, 0.72738691, -3.2518104364999996, 3.025374262], 'GSM1501523': [1.581508099, 2.1765275455, 

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index and save the resulting table.
#    to_csv does not create missing parent directories, so the target folder
#    is created first (a no-op when it already exists).
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Psoriatic_Arthritis/gene_data/GSE61281.csv'
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical features selected in Step 2 with the gene data.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Check whether the trait and the demographic attributes are severely
#    biased; the helper returns the trait verdict plus a copy of the data
#    with biased attributes removed.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Psoriatic_Arthritis')

# 4. Save the cohort information.
save_cohort_info('GSE61281', './preprocessed/Psoriatic_Arthritis/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. Only a cohort whose trait is not severely biased gets its merged
    #    table written out.
    csv_path = './preprocessed/Psoriatic_Arthritis/GSE61281.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Psoriatic_Arthritis', the least common label is '1.0' with 20 occurrences. This represents 50.00% of the dataset.
The distribution of the feature 'Psoriatic_Arthritis' in this dataset is fine.

Quartiles for 'Age':
  25%: 17.75
  50% (Median): 24.5
  75%: 30.25
Min: 7.0
Max: 69.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 19 occurrences. This represents 47.50% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

