### Step 1: Initial Data Loading

In [1]:
import time
from tools.preprocess import *
# --- 1. Locate the SOFT file and the series-matrix file for the chosen cohort ---
# Author's note kept from the original: Breast_Cancer 207847,208101,270721,283522; Epilepsy/GSE29796
cohort_dir = '/media/techt/DATA/GEO/Epilepsy/GSE29796'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- 2. Pull background information and sample characteristics out of the matrix file ---
# Lines are selected by these prefixes.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- 3. Build the sample characteristics dictionary from the clinical dataframe ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- 4. Print the background information and the sample characteristics dictionary ---
# Each call emits the heading and the value on separate lines (sep="\n").
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Transcriptional Differences between Normal and Glioma-Derived Glial Progenitor Cells Identify a Core Set of Dysregulated Genes."
!Series_summary	"Glial progenitor cells (GPCs) of the adult human white matter, which express gangliosides recognized by monoclonal antibody A2B5, are a potential source of glial tumors of the brain. We used A2B5-based sorting to extract progenitor-like cells from a range of human glial tumors, that included low-grade glioma, oligodendroglioma, oligo-astrocytomas, anaplastic astrocytoma, and glioblastoma multiforme. The A2B5+ tumor cells proved tumorigenic upon orthotopic xenograft, and the tumors generated reflected the phenotypes of those from which they derived."
!Series_summary	"Expression profiling revealed that A2B5+ tumor progenitors expressed a cohort of genes by which they could be distinguished from A2B5+ GPCs isolated from normal adult white matter. Most of the genes differentially expressed by glioma-derived 

### Step 3: Gene Data Extraction

In [2]:
# 1. Load the expression data from the series-matrix file located in the first cell,
#    using the library helper get_genetic_data; the result is kept in gene_data.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids so the identifier type can be reviewed in the following step.
print(gene_data.index[:20])

Index(['1007_s_at', '1053_at', '117_at', '1294_at', '1405_i_at',
       '1552256_a_at', '1552257_a_at', '1552258_at', '1552263_at',
       '1552264_a_at', '1552266_at', '1552274_at', '1552275_s_at',
       '1552277_a_at', '1552281_at', '1552283_s_at', '1552286_at',
       '1552287_s_at', '1552291_at', '1552302_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [3]:
# The row ids printed by the previous cell (e.g. '1007_s_at', '1053_at') are not gene
# symbols; they look like array probe ids, so a probe-to-gene mapping step is required.
requires_gene_mapping = True

### Step 5: Gene Annotation (Conditional)

In [4]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))

# NOTE: requires_gene_mapping was set to True in the previous step, so the gene-mapping
# step is NOT skipped; the annotation loaded here is used by the mapping cell further down.

Gene annotation preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X69699 /FEATURE= /DEF

In [5]:
# Display the annotation table (notebook repr) to inspect its columns and row count.
# NOTE(review): in the saved output the last rows (e.g. 'AFFX-r2-Ec-bioB-M_at') carry a
# numeric value in GB_ACC and are empty elsewhere -- they do not look like annotation
# records; confirm what get_gene_annotation is picking up from the soft file.
gene_annotation

Unnamed: 0,ID,GB_ACC,SPOT_ID,Species Scientific Name,Annotation Date,Sequence Type,Sequence Source,Target Description,Representative Public ID,Gene Title,Gene Symbol,ENTREZ_GENE_ID,RefSeq Transcript ID,Gene Ontology Biological Process,Gene Ontology Cellular Component,Gene Ontology Molecular Function
0,1007_s_at,U48705,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Huma...,U48705,discoidin domain receptor tyrosine kinase 1 //...,DDR1 /// MIR4640,780 /// 100616237,NM_001202521 /// NM_001202522 /// NM_001202523...,0001558 // regulation of cell growth // inferr...,0005576 // extracellular region // inferred fr...,0000166 // nucleotide binding // inferred from...
1,1053_at,M87338,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,M87338 /FEATURE= /DEFINITION=HUMA1SBU Human re...,M87338,"replication factor C (activator 1) 2, 40kDa",RFC2,5982,NM_001278791 /// NM_001278792 /// NM_001278793...,0000278 // mitotic cell cycle // traceable aut...,0005634 // nucleus // inferred from electronic...,0000166 // nucleotide binding // inferred from...
2,117_at,X51757,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,X51757 /FEATURE=cds /DEFINITION=HSP70B Human h...,X51757,heat shock 70kDa protein 6 (HSP70B'),HSPA6,3310,NM_002155,0000902 // cell morphogenesis // inferred from...,0005737 // cytoplasm // inferred from direct a...,0000166 // nucleotide binding // inferred from...
3,121_at,X69699,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,GenBank,X69699 /FEATURE= /DEFINITION=HSPAX8A H.sapiens...,X69699,paired box 8,PAX8,7849,NM_003466 /// NM_013951 /// NM_013952 /// NM_0...,0001655 // urogenital system development // in...,0005634 // nucleus // inferred from direct ass...,0000979 // RNA polymerase II core promoter seq...
4,1255_g_at,L36861,,Homo sapiens,"Oct 6, 2014",Exemplar sequence,Affymetrix Proprietary Database,L36861 /FEATURE=expanded_cds /DEFINITION=HUMGC...,L36861,guanylate cyclase activator 1A (retina),GUCA1A,2978,NM_000409 /// XM_006715073,0007165 // signal transduction // non-traceabl...,0001750 // photoreceptor outer segment // infe...,0005509 // calcium ion binding // inferred fro...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029998,AFFX-r2-Ec-bioB-M_at,10.51109513,,,,,,,,,,,,,,
2029999,AFFX-r2-Ec-bioC-3_at,12.14028992,,,,,,,,,,,,,,
2030000,AFFX-r2-Ec-bioC-5_at,12.59745154,,,,,,,,,,,,,,
2030001,AFFX-r2-Ec-bioD-5_at,13.16992425,,,,,,,,,,,,,,


In [6]:
# Column choices for the probe -> gene mapping.
# Candidate identifier columns to check: 'ID', 'Name', or others seen in the preview.
identifier_key = 'ID'
gene_symbol_key = 'Gene Symbol'  # alternative noted by the author: 'SPOT_ID.1'

# List every annotation column so the two keys above can be verified by eye.
print("Available Columns in Gene Annotation:", gene_annotation.columns, sep="\n")


# Build the dataframe that stores the mapping between probe IDs and genes.
gene_mapping_df = get_gene_mapping(
    gene_annotation,
    prob_col=identifier_key,
    gene_col=gene_symbol_key,
)

# Check the content of the mapping dataframe.
print("Gene Mapping DataFrame Preview:")
print(preview_df(gene_mapping_df))

# Apply the mapping to the probe-level expression data.
gene_data_mapped = apply_gene_mapping(gene_data, gene_mapping_df)

# Inspect the mapped result for any issues.
print("Mapped Gene Data Preview:")
print(preview_df(gene_data_mapped))

# Adopt the mapped frame as gene_data only when it is non-empty;
# otherwise gene_data is left unchanged and a message is printed.
if gene_data_mapped.empty:
    print(f"Mapping with {identifier_key} resulted in an empty dataframe!")
else:
    gene_data = gene_data_mapped

# Final look at whatever gene_data now holds.
print("Final Mapped Gene Data Preview:")
print(preview_df(gene_data))

Available Columns in Gene Annotation:
Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')
Gene Mapping DataFrame Preview:
{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'Gene': ['DDR1 /// MIR4640', 'RFC2', 'HSPA6', 'PAX8', 'GUCA1A']}
Mapped Gene Data Preview:
{'GSM738329': [9.518956088, 5.217718693, 6.188801624, 6.216575608, 6.792088737], 'GSM738330': [8.46014003, 5.637911473, 7.387368425, 6.109275066, 5.77777689], 'GSM738331': [9.631526576, 4.637884334, 5.729614642, 5.782923318, 5.2537525], 'GSM738332': [8.104231341, 4.369238157, 7.729515888, 6.1591267, 6.409544019], 'GSM738333': [11.0553514, 6.326042102, 7.29647706, 5.96

In [7]:
# Display gene_data as left by the mapping cell above (reassigned there only when the
# mapped frame was non-empty); the saved output shows an index named 'Gene'.
gene_data

Unnamed: 0_level_0,GSM738329,GSM738330,GSM738331,GSM738332,GSM738333,GSM738334,GSM738335,GSM738336,GSM738337,GSM738338,...,GSM1053441,GSM1053442,GSM1053443,GSM1053444,GSM1053445,GSM1053446,GSM1053447,GSM1053448,GSM1053449,GSM1053450
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,9.518956,8.460140,9.631527,8.104231,11.055351,10.696805,10.781938,11.119107,10.767300,9.192159,...,11.920313,9.498640,12.668290,10.038477,8.878315,10.390422,11.493836,11.667221,10.023463,9.765471
A2M-AS1,5.217719,5.637911,4.637884,4.369238,6.326042,5.988006,5.725365,6.241771,5.587675,4.530469,...,6.444191,4.552161,7.310309,4.909238,4.561397,4.745399,5.751153,5.769769,4.908284,4.671369
A2MP1,6.188802,7.387368,5.729615,7.729516,7.296477,7.883727,7.214071,6.712863,7.739035,5.980618,...,6.467368,5.822306,6.886734,5.763278,5.896677,6.133759,6.868864,7.012643,6.780162,6.398146
AACS,6.216576,6.109275,5.782923,6.159127,5.962652,6.502743,5.960718,6.150082,7.406002,8.647645,...,7.442819,7.129964,6.231053,6.826430,6.218534,6.770478,7.024054,7.638544,7.248299,7.187378
AADAT,6.792089,5.777777,5.253753,6.409544,5.066335,5.742035,4.885224,5.735620,6.969634,7.871836,...,7.352346,5.610672,5.842498,5.697246,6.284140,5.189023,7.042557,6.612581,7.344388,6.806149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,26.435034,26.393904,24.814271,24.836749,24.900537,25.102935,24.946105,26.716375,31.737905,33.288914,...,28.910529,28.631292,29.815066,29.342836,29.223013,28.695542,30.437338,29.561462,30.434940,30.099929
ZYG11B,20.446362,19.787212,20.347999,19.832259,18.811543,18.141120,18.470585,19.121759,25.893403,26.412181,...,27.188745,23.425570,26.241318,25.430921,26.332768,24.256948,26.099550,25.136948,27.468579,26.429893
ZYX,13.506030,14.127335,14.121341,13.042665,13.792625,13.982534,14.010373,14.117730,17.371953,17.497411,...,15.411325,15.481772,15.008652,17.497584,16.119934,14.033331,15.939327,16.480252,16.551470,16.348559
ZZEF1,18.378585,17.808635,17.477718,17.581637,18.676126,18.112217,18.942429,19.496745,23.189925,26.314164,...,20.408743,20.451127,20.796645,21.304203,20.973958,22.011637,22.520715,23.294418,21.438836,21.201364


### Step 7: Data Normalization and Merging

In [8]:


# Wall-clock start for the whole step; it is not read again inside this cell.
total_start_time = time.time()
# 1. Normalize the obtained gene data
#    This cell times the normalize_gene_symbols_in_index2 variant; a later cell times
#    normalize_gene_symbols_in_index on the same gene_data for comparison.
start_time = time.time()
normalized_gene_data = normalize_gene_symbols_in_index2(gene_data)
# Elapsed wall-clock seconds for the normalization call only.
normalization_time = time.time() - start_time
print(f"Gene normalization took: {normalization_time:.2f} seconds")


Gene normalization took: 43.65 seconds


In [9]:
# Display the result of normalize_gene_symbols_in_index2 for visual inspection.
normalized_gene_data

Unnamed: 0,GSM738329,GSM738330,GSM738331,GSM738332,GSM738333,GSM738334,GSM738335,GSM738336,GSM738337,GSM738338,...,GSM1053441,GSM1053442,GSM1053443,GSM1053444,GSM1053445,GSM1053446,GSM1053447,GSM1053448,GSM1053449,GSM1053450
A2M,9.518956,8.460140,9.631527,8.104231,11.055351,10.696805,10.781938,11.119107,10.767300,9.192159,...,11.920313,9.498640,12.668290,10.038477,8.878315,10.390422,11.493836,11.667221,10.023463,9.765471
A2M-AS1,5.217719,5.637911,4.637884,4.369238,6.326042,5.988006,5.725365,6.241771,5.587675,4.530469,...,6.444191,4.552161,7.310309,4.909238,4.561397,4.745399,5.751153,5.769769,4.908284,4.671369
A2MP1,6.188802,7.387368,5.729615,7.729516,7.296477,7.883727,7.214071,6.712863,7.739035,5.980618,...,6.467368,5.822306,6.886734,5.763278,5.896677,6.133759,6.868864,7.012643,6.780162,6.398146
AACS,6.216576,6.109275,5.782923,6.159127,5.962652,6.502743,5.960718,6.150082,7.406002,8.647645,...,7.442819,7.129964,6.231053,6.826430,6.218534,6.770478,7.024054,7.638544,7.248299,7.187378
AADAT,6.792089,5.777777,5.253753,6.409544,5.066335,5.742035,4.885224,5.735620,6.969634,7.871836,...,7.352346,5.610672,5.842498,5.697246,6.284140,5.189023,7.042557,6.612581,7.344388,6.806149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,26.435034,26.393904,24.814271,24.836749,24.900537,25.102935,24.946105,26.716375,31.737905,33.288914,...,28.910529,28.631292,29.815066,29.342836,29.223013,28.695542,30.437338,29.561462,30.434940,30.099929
ZYG11B,20.446362,19.787212,20.347999,19.832259,18.811543,18.141120,18.470585,19.121759,25.893403,26.412181,...,27.188745,23.425570,26.241318,25.430921,26.332768,24.256948,26.099550,25.136948,27.468579,26.429893
ZYX,13.506030,14.127335,14.121341,13.042665,13.792625,13.982534,14.010373,14.117730,17.371953,17.497411,...,15.411325,15.481772,15.008652,17.497584,16.119934,14.033331,15.939327,16.480252,16.551470,16.348559
ZZEF1,18.378585,17.808635,17.477718,17.581637,18.676126,18.112217,18.942429,19.496745,23.189925,26.314164,...,20.408743,20.451127,20.796645,21.304203,20.973958,22.011637,22.520715,23.294418,21.438836,21.201364


In [10]:
# NOTE: this resets total_start_time, overwriting the value stored by the earlier timing cell.
total_start_time = time.time()
# 1. Normalize the obtained gene data
#    Same input (gene_data) as the earlier timing cell, but this one calls
#    normalize_gene_symbols_in_index (no "2" suffix) and stores the result separately
#    in normalized_gene_data2 so the two variants can be compared.
start_time = time.time()
normalized_gene_data2 = normalize_gene_symbols_in_index(gene_data)
# Elapsed wall-clock seconds for the normalization call only.
normalization_time = time.time() - start_time
print(f"Gene normalization took: {normalization_time:.2f} seconds")

Gene normalization took: 0.09 seconds


In [11]:
# Display the result of normalize_gene_symbols_in_index for comparison with normalized_gene_data.
normalized_gene_data2

Unnamed: 0,GSM738329,GSM738330,GSM738331,GSM738332,GSM738333,GSM738334,GSM738335,GSM738336,GSM738337,GSM738338,...,GSM1053441,GSM1053442,GSM1053443,GSM1053444,GSM1053445,GSM1053446,GSM1053447,GSM1053448,GSM1053449,GSM1053450
A2M,9.518956,8.460140,9.631527,8.104231,11.055351,10.696805,10.781938,11.119107,10.767300,9.192159,...,11.920313,9.498640,12.668290,10.038477,8.878315,10.390422,11.493836,11.667221,10.023463,9.765471
A2M-AS1,5.217719,5.637911,4.637884,4.369238,6.326042,5.988006,5.725365,6.241771,5.587675,4.530469,...,6.444191,4.552161,7.310309,4.909238,4.561397,4.745399,5.751153,5.769769,4.908284,4.671369
A2MP1,6.188802,7.387368,5.729615,7.729516,7.296477,7.883727,7.214071,6.712863,7.739035,5.980618,...,6.467368,5.822306,6.886734,5.763278,5.896677,6.133759,6.868864,7.012643,6.780162,6.398146
AACS,6.216576,6.109275,5.782923,6.159127,5.962652,6.502743,5.960718,6.150082,7.406002,8.647645,...,7.442819,7.129964,6.231053,6.826430,6.218534,6.770478,7.024054,7.638544,7.248299,7.187378
AADAT,6.792089,5.777777,5.253753,6.409544,5.066335,5.742035,4.885224,5.735620,6.969634,7.871836,...,7.352346,5.610672,5.842498,5.697246,6.284140,5.189023,7.042557,6.612581,7.344388,6.806149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,26.435034,26.393904,24.814271,24.836749,24.900537,25.102935,24.946105,26.716375,31.737905,33.288914,...,28.910529,28.631292,29.815066,29.342836,29.223013,28.695542,30.437338,29.561462,30.434940,30.099929
ZYG11B,20.446362,19.787212,20.347999,19.832259,18.811543,18.141120,18.470585,19.121759,25.893403,26.412181,...,27.188745,23.425570,26.241318,25.430921,26.332768,24.256948,26.099550,25.136948,27.468579,26.429893
ZYX,13.506030,14.127335,14.121341,13.042665,13.792625,13.982534,14.010373,14.117730,17.371953,17.497411,...,15.411325,15.481772,15.008652,17.497584,16.119934,14.033331,15.939327,16.480252,16.551470,16.348559
ZZEF1,18.378585,17.808635,17.477718,17.581637,18.676126,18.112217,18.942429,19.496745,23.189925,26.314164,...,20.408743,20.451127,20.796645,21.304203,20.973958,22.011637,22.520715,23.294418,21.438836,21.201364


In [None]:
# Spot-check a single row of normalized_gene_data2 by label.
# NOTE(review): this cell has no recorded execution; .loc raises KeyError when the label
# is absent from the index -- confirm 'A2ML1' is present before relying on this cell.
normalized_gene_data2.loc["A2ML1"]

In [None]:
# Re-display gene_data (the input given to both normalizer calls); this cell has no recorded execution.
gene_data