In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Chronic_kidney_disease/GSE127136'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Single-cell RNA-seq profiling reveals novel insights in immune-complex deposition and epithelium transition in IgA nephropathy"
!Series_summary	"IgA nephropathy represents the most prevalent chronic nephrosis worldwide. However, pathogenesis about IgA deposition and end-stage renal failure is still not well defined. Using single-cell RNA-seq, we identified the mesangial membrane receptor for IgA, which collaborates with increased extracellular matrix proteins and protease inhibitor to facilitate IgA deposition. Meanwhile, cell-cell interaction analysis revealed increased communications between mesangium and other cell types, uncovering how morbidity inside glomerulus spreads to whole kidney, which results in the genetic changes of kidney resident immune cells. Prominent interaction decreasing in intercalated cells leads to the discovery of a transitional cell type, which exhibited significant EMT and fibrosis features. Our work comprehensively cha

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Checking if the dataset contains gene expression data
# Assuming the given data is gene expression data based on single-cell RNA-seq information in the Series summary.
is_gene_available = True

# Variable Availability and Data Type Conversion
# Chronic_kidney_disease
sample_characteristics = {0: ['patients: IgAN_01', 'patients: IgAN_06', 'patients: IgAN_07', 'patients: IgAN_09', 'patients: IgAN_10', 'patients: IgAN_11', 'patients: IgAN_12', 'patients: IgAN_15', 'patients: IgAN_16', 'patients: IgAN_17', 'patients: IgAN_18', 'patients: IgAN_19', 'patients: IgAN_20', 'patients: NM_01', 'patients: NM_02', 'patients: NM_03', 'patients: NM_07', 'patients: NM_08', 'patients: NM_09', 'patients: PBM_IgAN_10', 'patients: PBM_IgAN_12', 'patients: PBM_IgAN_17', 'patients: PBM_IgAN_19', 'patients: PBM_NM_01', 'patients: PBM_NM_02', 'patients: PBM_NM_03', 'patients: PBM_NM_04', 'patients: PBM_NM_05'], 
                       1: ['disease state: IgAN', 'disease state: kidney cancer', 'disease state: normal'], 
                       2: ['tissue: renal biopsies', 'tissue: paracancerous tissues', 'cell type: monocytes']}

if 1 in sample_characteristics:
    trait_row = 1

# Age and gender data are not available
age_row = None
gender_row = None

# Data type conversion functions
def convert_trait(value):
    if not value:
        return None
    value = value.split(':')[-1].strip().lower()
    if value == "igan":
        return 1
    elif value in ["kidney cancer", "normal"]:
        return 0
    else:
        return None

convert_age = None  # Since age data is not available
convert_gender = None  # Since gender data is not available

# Save cohort information
save_cohort_info('GSE127136', './preprocessed/Chronic_kidney_disease/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Chronic_kidney_disease', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Chronic_kidney_disease/trait_data/GSE127136.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3625775': [1], 'GSM3625776': [1], 'GSM3625777': [1], 'GSM3625778': [1], 'GSM3625779': [1], 'GSM3625780': [1], 'GSM3625781': [1], 'GSM3625782': [1], 'GSM3625783': [1], 'GSM3625784': [1], 'GSM3625785': [1], 'GSM3625786': [1], 'GSM3625787': [1], 'GSM3625788': [1], 'GSM3625789': [1], 'GSM3625790': [1], 'GSM3625791': [1], 'GSM3625792': [1], 'GSM3625793': [1], 'GSM3625794': [1], 'GSM3625795': [1], 'GSM3625796': [1], 'GSM3625797': [1], 'GSM3625798': [1], 'GSM3625799': [1], 'GSM3625800': [1], 'GSM3625801': [1], 'GSM3625802': [1], 'GSM3625803': [1], 'GSM3625804': [1], 'GSM3625805': [1], 'GSM3625806': [1], 'GSM3625807': [1], 'GSM3625808': [1], 'GSM3625809': [1], 'GSM3625810': [1], 'GSM3625811': [1], 'GSM3625812': [1], 'GSM3625813': [1], 'GSM3625814': [1], 'GSM3625815': [1], 'GSM3625816': [1], 'GSM3625817': [1], 'GSM3625818': [1], 'GSM3625819': [1], 'GSM3625820': [1], 'GSM3625821': [1], 'GSM3625822': [1], 'GSM3625823': [1], 'GSM3625824': [1], 'GSM3625825': [1], 'GSM3625826': [1], 'GSM3625827

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])
    

# Given the output from STEP3 is an empty index, we cannot conclusively determine
# if the identifiers are human gene symbols. As a default assumption in the absence 
# of specific identifiers, we assume that they require gene mapping.


Index([], dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


from utils.preprocess import *

# Function to read a few lines from a gzipped file for debugging
def read_gzipped_file(file_path, num_lines=10):
    import gzip
    with gzip.open(file_path, 'rt') as f:
        lines = [f.readline().strip() for _ in range(num_lines)]
    return lines


### Step 5: Gene Annotation (Conditional)

In [6]:
try:
    # 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file
    gene_annotation = get_gene_annotation(soft_file)
    
    # 2. Use the 'preview_df' function from the library to preview the data and print out the results.
    print("Gene annotation preview:")
    print(preview_df(gene_annotation))
except Exception as e:
    print(f"An error occurred: {e}")
    print("First few lines of the file for debugging:")
    print(read_gzipped_file(soft_file, num_lines=10))


An error occurred: No columns to parse from file
First few lines of the file for debugging:
['^DATABASE = GeoMiame', '!Database_name = Gene Expression Omnibus (GEO)', '!Database_institute = NCBI NLM NIH', '!Database_web_link = http://www.ncbi.nlm.nih.gov/geo', '!Database_email = geo@ncbi.nlm.nih.gov', '^SERIES = GSE127136', '!Series_title = Single-cell RNA-seq profiling reveals novel insights in immune-complex deposition and epithelium transition in IgA nephropathy', '!Series_geo_accession = GSE127136', '!Series_status = Public on Jul 06 2021', '!Series_submission_date = Feb 25 2019']


### Step 6: Gene Identifier Mapping

In [7]:
if requires_gene_mapping:
    try:
        # STEP5: Obtain gene annotation data
        gene_annotation = get_gene_annotation(soft_file)
        print("Gene annotation preview:")
        print(preview_df(gene_annotation))

        # 1. Determine the appropriate keys for identifiers and gene symbols
        identifier_key = 'ID'
        gene_symbol_key = 'Gene Symbol'

        # 2. Get the dataframe storing the mapping between probe IDs and genes
        gene_mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
        
        # 3. Apply the mapping and obtain the resulting gene expression dataframe
        gene_data = apply_gene_mapping(gene_data, gene_mapping_df)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Debugging information
        print("First few lines of the soft file for debugging:")
        print(read_gzipped_file(soft_file, num_lines=10))


An error occurred: No columns to parse from file
First few lines of the soft file for debugging:
['^DATABASE = GeoMiame', '!Database_name = Gene Expression Omnibus (GEO)', '!Database_institute = NCBI NLM NIH', '!Database_web_link = http://www.ncbi.nlm.nih.gov/geo', '!Database_email = geo@ncbi.nlm.nih.gov', '^SERIES = GSE127136', '!Series_title = Single-cell RNA-seq profiling reveals novel insights in immune-complex deposition and epithelium transition in IgA nephropathy', '!Series_geo_accession = GSE127136', '!Series_status = Public on Jul 06 2021', '!Series_submission_date = Feb 25 2019']


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Chronic_kidney_disease/gene_data/GSE127136.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Chronic_kidney_disease')

# 4. Save the cohort information.
save_cohort_info('GSE127136', './preprocessed/Chronic_kidney_disease/cohort_info.json', True, True, trait_biased, merged_data)

# If the trait is not severely biased, save the merged data to a csv file.
if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Chronic_kidney_disease/GSE127136.csv'
    unbiased_merged_data.to_csv(csv_path)


No gene data in the dataframe
