In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Lupus_(Systemic_Lupus_Erythematosus)/GSE184989'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcriptomic profiling of DLE/SCLE/ACLE"
!Series_summary	"The microarray experiment was employed to evaluate the gene expressions in cutaneous lupus"
!Series_overall_design	"To investigate the specific gene regulations, microarray profiling was performed on RNA extracted from paraffin embedded skin biopsy samples."
Sample Characteristics Dictionary:
{0: ['disease state: DLE', 'disease state: SCLE', 'disease state: ACLE', 'disease state: NN']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = True
trait_row = 0
age_row = None
gender_row = None

# Function to convert 'Lupus_(Systemic_Lupus_Erythematosus)' trait data
def convert_trait(value):
    if 'disease state:' in value:
        condition = value.split(': ')[1].strip()
        if condition in ['DLE', 'SCLE', 'ACLE']:
            return 1
        elif condition == 'NN':
            return 0
    return None

# Placeholder Functions for converting age and gender (returning None since these rows are not available)
def convert_age(value):
    return None

def convert_gender(value):
    return None

save_cohort_info('GSE184989', './preprocessed/Lupus_(Systemic_Lupus_Erythematosus)/cohort_info.json', is_gene_available, trait_row is not None)

# Placeholder DataFrame for clinical_data (this should ideally come from previous processing steps)
import pandas as pd  # Ensure pandas is imported for defining DataFrame
clinical_data = pd.DataFrame({
    0: ['disease state: DLE', 'disease state: SCLE', 'disease state: ACLE', 'disease state: NN']
})

selected_clinical_data = geo_select_clinical_features(clinical_data, 'Lupus_(Systemic_Lupus_Erythematosus)', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Lupus_(Systemic_Lupus_Erythematosus)/trait_data/GSE184989.csv'
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{0: [1]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['16650001', '16650003', '16650005', '16650007', '16650009', '16650011',
       '16650013', '16650015', '16650017', '16650019', '16650021', '16650023',
       '16650025', '16650027', '16650029', '16650031', '16650033', '16650035',
       '16650037', '16650041'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'probeset_id': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['12190', '29554', '69091', '160446', '317811'], 'stop': ['13639', '31109', '70008', '161525', '328581'], 'total_probes': [25.0, 28.0, 8.0, 13.0, 36.0], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_051985 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_045117 // DDX11L10 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 10 // 16p13.3 // 100287029 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 2 // 2q13 // 84771 /// NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-A

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine the identifier and gene symbol keys based on the outputs.
identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'  # Correct key for gene symbols

# Preview to understand the structure - ensure you fetch the needed information
print(preview_df(gene_annotation[[identifier_key, gene_symbol_key]]))

# 2. Handle multiple entries in the 'gene_assignment' by splitting them correctly.
def extract_gene_symbol(gene_assignment):
    if isinstance(gene_assignment, str):
        genes = gene_assignment.split(' /// ')
        gene_names = [gene.split(' // ')[1].strip() for gene in genes if len(gene.split(' // ')) > 1]
        return gene_names[0] if gene_names else None
    return None

# Apply this to the gene_annotation dataframe
gene_annotation['Gene'] = gene_annotation[gene_symbol_key].apply(extract_gene_symbol)

# 3. Get the dataframe storing mapping between probe IDs and genes
gene_mapping_df = gene_annotation[[identifier_key, 'Gene']].dropna()

# 4. Apply the mapping with the 'apply_gene_mapping' function from the library.
gene_data = apply_gene_mapping(gene_data, gene_mapping_df)

# Print the first few records of the transformed gene_data for validation
print(preview_df(gene_data))


{'ID': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_051985 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_045117 // DDX11L10 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 10 // 16p13.3 // 100287029 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 2 // 2q13 // 84771 /// NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 2 // 2q13 // 84771 /// NR_051986 // DDX11L5 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 5 // 9p24.3 // 100287596 /// ENST00000456328 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// ENST00000559159 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 

{'GSM5602807': [2.651167909, 2.557976878, 1.527524754, 2.222399487, 7.467729133], 'GSM5602808': [2.782175382, 2.835643201, 1.293679938, 2.402838728, 7.248785911], 'GSM5602809': [2.877857382, 3.165805705, 1.291603426, 2.819038934, 7.654282535], 'GSM5602810': [2.676745232, 2.984814889, 1.307767098, 2.392320824, 6.784838293], 'GSM5602811': [3.526491876, 3.240367378, 1.210846386, 1.908703782, 5.913982722], 'GSM5602812': [2.977699259, 2.622101742, 1.302129109, 2.302300762, 6.955949881], 'GSM5602813': [3.373887213, 3.402225478, 1.403054971, 2.692832789, 7.06360902], 'GSM5602814': [2.878676982, 2.846799682, 1.263244386, 2.725154418, 7.02607754], 'GSM5602815': [3.184164234, 2.951227417, 1.211802162, 2.916368342, 7.602291596], 'GSM5602816': [3.304483371, 3.66741363, 1.498815523, 2.248885079, 7.419330476], 'GSM5602817': [2.618656219, 2.540407379, 1.267953842, 2.474235877, 7.33746571], 'GSM5602818': [3.150482121, 3.425183279, 1.014812199, 1.9375617, 6.733574349], 'GSM5602819': [3.084621486, 2.939

### Step 7: Data Normalization and Merging

In [8]:
# Inspect selected_clinical_data
print("Selected Clinical Data Preview:")
print(preview_df(selected_clinical_data))

# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Lupus_(Systemic_Lupus_Erythematosus)/gene_data/GSE184989.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# Inspect merged_data for initial check
print("Merged Data Preview:")
print(preview_df(merged_data))

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Lupus_(Systemic_Lupus_Erythematosus)')

# 4. Save the cohort information
save_cohort_info('GSE184989', './preprocessed/Lupus_(Systemic_Lupus_Erythematosus)/cohort_info.json', True, True, trait_biased, unbiased_merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Lupus_(Systemic_Lupus_Erythematosus)/GSE184989.csv'
    unbiased_merged_data.to_csv(csv_path)


Selected Clinical Data Preview:
{0: [1]}


Merged Data Preview:


{'Lupus_(Systemic_Lupus_Erythematosus)': [], 'A1BG': [], 'A1BG-AS1': [], 'A1CF': [], 'A2M': [], 'A2M-AS1': [], 'A2ML1': [], 'A2ML1-AS1': [], 'A2ML1-AS2': [], 'A2MP1': [], 'A4GALT': [], 'A4GNT': [], 'AA06': [], 'AAAS': [], 'AACS': [], 'AACSP1': [], 'AADAC': [], 'AADACL2': [], 'AADACL3': [], 'AADACL4': [], 'AADAT': [], 'AAGAB': [], 'AAK1': [], 'AAMP': [], 'AANAT': [], 'AARD': [], 'AARS2': [], 'AASDH': [], 'AASDHPPT': [], 'AASS': [], 'AATF': [], 'AATK': [], 'ABAT': [], 'ABCA1': [], 'ABCA10': [], 'ABCA12': [], 'ABCA13': [], 'ABCA17P': [], 'ABCA2': [], 'ABCA3': [], 'ABCA4': [], 'ABCA5': [], 'ABCA6': [], 'ABCA7': [], 'ABCA8': [], 'ABCA9': [], 'ABCA9-AS1': [], 'ABCB1': [], 'ABCB10': [], 'ABCB11': [], 'ABCB4': [], 'ABCB5': [], 'ABCB6': [], 'ABCB7': [], 'ABCB8': [], 'ABCB9': [], 'ABCC1': [], 'ABCC10': [], 'ABCC11': [], 'ABCC12': [], 'ABCC13': [], 'ABCC2': [], 'ABCC3': [], 'ABCC4': [], 'ABCC5': [], 'ABCC5-AS1': [], 'ABCC6': [], 'ABCC6P1': [], 'ABCC6P2': [], 'ABCC8': [], 'ABCC9': [], 'ABCD1': [],