In [1]:

# Extend the module search path with a local checkout before any project
# imports run.
# NOTE(review): presumably this directory provides the `utils` package imported
# by the next cell -- confirm. The path is absolute and machine-specific, so the
# notebook will not run unchanged on another host.
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 -- resolve the SOFT file and the series-matrix file of this cohort.
cohort_dir = '/media/techt/DATA/GEO/Vitamin_D_Levels/GSE118723'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Step 2 -- pull the series-level background lines and the per-sample
# characteristic lines out of the matrix file, selected by line prefix.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Step 3 -- summarise the clinical table as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Step 4 -- show both results, each under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Discovery and characterization of variance QTLs in human induced pluripotent stem cells"
!Series_summary	"Quantification of gene expression levels at the single cell level has revealed that gene expression can vary substantially even across a population of homogeneous cells. However, it is currently unclear what genomic features control variation in gene expression levels, and whether common genetic variants may impact gene expression variation. Here, we take a genome-wide approach to identify expression variance quantitative trait loci (vQTLs). To this end, we generated single cell RNA-seq (scRNA-seq) data from induced pluripotent stem cells (iPSCs) derived from 53 Yoruba individuals. We collected data for a median of 95 cells per individual and a total of 5,447 single cells, and identified 241 mean expression QTLs (eQTLs) at 10% FDR, of which 82% replicate in bulk RNA-seq data from the same individuals. We further identified 14 vQTLs at 10% FDR,

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene-expression data is flagged as available for this cohort.
is_gene_available = True

# Converter placeholders; replaced by real functions when they are defined.
convert_trait = None
convert_age = None
convert_gender = None

# Sample characteristics under review, keyed by characteristics row index.
# The rows hold experiment date, plate well, donor individual and batch.
sample_characteristics = {
    0: [
        'experiment: 02192018', 'experiment: 02202018', 'experiment: 02212018',
        'experiment: 02222018', 'experiment: 02242018', 'experiment: 02262018',
        'experiment: 02272018', 'experiment: 02282018', 'experiment: 03012018',
        'experiment: 03052018', 'experiment: 03062018', 'experiment: 03072018',
        'experiment: 03162017', 'experiment: 03172017', 'experiment: 03232017',
        'experiment: 03302017', 'experiment: 03312017', 'experiment: 04052017',
        'experiment: 04072017', 'experiment: 04132017', 'experiment: 04142017',
        'experiment: 04202017', 'experiment: 08102017', 'experiment: 08112017',
        'experiment: 08142017', 'experiment: 08152017', 'experiment: 08162017',
        'experiment: 08182017', 'experiment: 08212017', 'experiment: 08222017',
    ],
    1: [
        'well: A01', 'well: A02', 'well: A03', 'well: A04', 'well: A05', 'well: A06',
        'well: A07', 'well: A08', 'well: A09', 'well: A10', 'well: A11', 'well: A12',
        'well: B01', 'well: B02', 'well: B03', 'well: B04', 'well: B05', 'well: B06',
        'well: B07', 'well: B08', 'well: B09', 'well: B10', 'well: B11', 'well: B12',
        'well: C01', 'well: C02', 'well: C03', 'well: C04', 'well: C05', 'well: C06',
    ],
    2: [
        'individual: NA18517', 'individual: NA18913', 'individual: NA19210',
        'individual: NA19193', 'individual: NA19204', 'individual: NA19159',
        'individual: NA19203', 'individual: NA19143', 'individual: NA18505',
        'individual: NA19098', 'individual: NA19185', 'individual: NA19130',
        'individual: NA18871', 'individual: NA18917', 'individual: NA18507',
        'individual: NA18523', 'individual: NA18859', 'individual: NA18519',
        'individual: NA18522', 'individual: NA19128', 'individual: NA18520',
        'individual: NA18852', 'individual: NA19131', 'individual: NA19209',
        'individual: NA18502', 'individual: NA18856', 'individual: NA19092',
        'individual: NA18873', 'individual: NA19214', 'individual: NA18862',
    ],
    3: ['batch: b6', 'batch: b1', 'batch: b2', 'batch: b3', 'batch: b4', 'batch: b5'],
}

# None of the rows above records the trait, age or gender, so every row index
# stays None to mark the variable as unavailable in this cohort.
trait_row = None
age_row = None
gender_row = None

# Define conversion functions for each variable; currently not applicable as their rows are not found.
def convert_trait(value):
    """Parse the trait measurement out of a "label: value" string.

    The text between the first and second colon is converted with ``float``.

    Args:
        value: Raw characteristic entry, e.g. ``"vitamin d: 25.5"``.

    Returns:
        The parsed float, or ``None`` when the entry has no colon, the field
        is not numeric, or ``value`` is not a string.
    """
    try:
        fields = value.split(":")
        return float(fields[1])
    except Exception:
        # Any parsing problem means "no usable value" rather than an error.
        return None

def convert_age(value):
    """Parse an age out of a "label: value" string.

    The text between the first and second colon is converted with ``float``.

    Args:
        value: Raw characteristic entry, e.g. ``"age: 45"``.

    Returns:
        The age as a float, or ``None`` when the entry has no colon, the field
        is not numeric, or ``value`` is not a string.
    """
    try:
        # Only the second colon-separated field matters; later ones are ignored.
        _label, raw_age, *_rest = value.split(":")
        return float(raw_age)
    except Exception:
        # Any parsing problem means "no usable value" rather than an error.
        return None

def convert_gender(value):
    """Encode the gender found in a "label: value" string as a binary code.

    The text between the first and second colon is stripped and lower-cased
    before it is matched.

    Args:
        value: Raw characteristic entry, e.g. ``"gender: Male"``.

    Returns:
        ``1`` for "male", ``0`` for "female", and ``None`` for any other
        label, a missing colon, or a non-string ``value``.
    """
    codes = {"male": 1, "female": 0}
    try:
        label = value.split(":")[1].strip().lower()
    except Exception:
        # Any parsing problem means "no usable value" rather than an error.
        return None
    return codes.get(label)

# Record this cohort's availability flags in the shared cohort-info JSON file.
# The last argument is the trait-availability flag: True only when a trait row
# was identified.
save_cohort_info(
    'GSE118723',
    './preprocessed/Vitamin_D_Levels/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Since trait_row is None, skip clinical feature extraction step


A new JSON file was created at: ./preprocessed/Vitamin_D_Levels/cohort_info.json
