## Epigenetic clocks and how to make them

Epigenetic clocks are machine learning models used to predict **age** (chronological or biological) based on DNA methylation.
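This notebook walks through the first steps of building one: loading a public DNA methylation dataset and extracting the sample ages that a clock would be trained to predict.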

## DNA Methylation

In [1]:
import pandas as pd

In [53]:
# GEO series-matrix text file for accession GSE40279, given as a path relative
# to the notebook's working directory. It is read twice below: once for the
# "!"-prefixed metadata records and once for the methylation value table.
file_path = "GSE40279_series_matrix.txt"

In [54]:
# Collect the header records of the series-matrix file: every line that begins
# with "!". Lines are kept verbatim, including their trailing newline.
with open(file_path, "rt") as handle:
    metadata_lines = [record for record in handle if record.startswith("!")]

# Preview the first 50 records to see which metadata fields are available.
for record in metadata_lines[:50]:
    print(record)

!Series_title	"Genome-wide Methylation Profiles Reveal Quantitative Views of Human Aging Rates"

!Series_geo_accession	"GSE40279"

!Series_status	"Public on Nov 21 2012"

!Series_submission_date	"Aug 21 2012"

!Series_last_update_date	"Jul 06 2022"

!Series_pubmed_id	"23177740"

!Series_summary	"Genome wide DNA methylation profiling of individuals across a large age range. The Illumina Infinium 450k Human DNA methylation Beadchip was used to obtain DNA methylation profiles across approximately 450k CpGs from human whole blood."

!Series_overall_design	"Bisulphite converted DNA from the 656 samples were hybridised to the Illumina Infinium 450k Human Methylation Beadchip"

!Series_type	"Methylation profiling by array"

!Series_contributor	"K,,Zhang"

!Series_contributor	"T,,Ideker"

!Series_sample_id	"GSM989827 GSM989828 GSM989829 GSM989830 GSM989831 GSM989832 GSM989833 GSM989834 GSM989835 GSM989836 GSM989837 GSM989838 GSM989839 GSM989840 GSM989841 GSM989842 GSM989843 GSM989844 GSM989845

In [55]:
# Per-sample annotations live in the "!Sample_characteristics_ch1" records.
# Each record is tab-separated: the record name first, then one double-quoted
# value per sample (e.g. "age (y): 67").
characteristics_lines = [
    line for line in metadata_lines if line.startswith("!Sample_characteristics_ch1")
]

# Label that precedes every age value in this series.
age_prefix = "age (y): "

# Ages are kept as strings, in the same order as the sample columns of the record.
ages = []
for line in characteristics_lines:
    print(line)
    # Remove the line terminator before splitting. Without this the last
    # sample's field ends in '"\n', so stripping quotes leaves a stray
    # closing quote and newline on the final age.
    age_parts = line.rstrip("\r\n").split("\t")[1:]
    # Match the full age label rather than the bare substring "age", which
    # would also match unrelated values such as "stage" or "percentage".
    if not any(age_prefix in part for part in age_parts):
        continue
    # A field without the label raises IndexError here, which keeps the
    # ages aligned one-to-one with the samples instead of silently skipping.
    ages = [
        part.strip().strip('"').split(age_prefix, 1)[1].strip()
        for part in age_parts
    ]

#TODO: Add other metadata
print(ages)

!Sample_characteristics_ch1	"age (y): 67"	"age (y): 89"	"age (y): 66"	"age (y): 64"	"age (y): 62"	"age (y): 87"	"age (y): 73"	"age (y): 75"	"age (y): 73"	"age (y): 83"	"age (y): 82"	"age (y): 48"	"age (y): 77"	"age (y): 54"	"age (y): 63"	"age (y): 71"	"age (y): 68"	"age (y): 80"	"age (y): 92"	"age (y): 60"	"age (y): 79"	"age (y): 68"	"age (y): 93"	"age (y): 84"	"age (y): 74"	"age (y): 74"	"age (y): 76"	"age (y): 56"	"age (y): 72"	"age (y): 75"	"age (y): 79"	"age (y): 73"	"age (y): 82"	"age (y): 74"	"age (y): 79"	"age (y): 54"	"age (y): 101"	"age (y): 79"	"age (y): 84"	"age (y): 85"	"age (y): 87"	"age (y): 74"	"age (y): 77"	"age (y): 73"	"age (y): 87"	"age (y): 85"	"age (y): 88"	"age (y): 66"	"age (y): 78"	"age (y): 53"	"age (y): 76"	"age (y): 67"	"age (y): 73"	"age (y): 54"	"age (y): 73"	"age (y): 96"	"age (y): 57"	"age (y): 76"	"age (y): 47"	"age (y): 54"	"age (y): 73"	"age (y): 84"	"age (y): 62"	"age (y): 89"	"age (y): 68"	"age (y): 62"	"age (y): 79"	"age (y): 72"	"age (y): 69"	"age 

In [57]:
# Load the tab-separated value table from the series-matrix file.
#   comment="!"  -> the "!"-prefixed metadata records are ignored by the parser
#   skiprows=1   -> the first line of the file is skipped outright
#   index_col=0  -> the first column becomes the row index
# The trailing .loc filter then discards any row whose index label still
# starts with "!", so only data rows remain.
methylation_data = pd.read_csv(
    file_path,
    sep="\t",
    comment="!",
    skiprows=1,
    index_col=0,
).loc[lambda table: ~table.index.str.startswith("!"), :]

In [58]:
# Show the first five rows of the table as plain text.
print(methylation_data.head(n=5))

            GSM989827  GSM989828  GSM989829  GSM989830  GSM989831  GSM989832  \
ID_REF                                                                         
cg00000029   0.464197   0.454883   0.485764   0.480785   0.501220   0.499918   
cg00000108   0.941091   0.939033   0.918802   0.929908   0.934548   0.950543   
cg00000109   0.911182   0.596455   0.870333   0.889689   0.890450   0.898493   
cg00000165   0.132014   0.206917   0.162861   0.197780   0.148437   0.224093   
cg00000236   0.717861   0.723935   0.719196   0.704061   0.754913   0.829192   

            GSM989833  GSM989834  GSM989835  GSM989836  ...  GSM990618  \
ID_REF                                                  ...              
cg00000029   0.485852   0.512442   0.518155   0.417986  ...   0.560958   
cg00000108   0.925855   0.941330   0.938528   0.933814  ...   0.934699   
cg00000109   0.893972   0.892010   0.900841   0.883539  ...   0.881957   
cg00000165   0.400489   0.194553   0.134710   0.204569  ...   0.19988

In [59]:
# Sanity check: there must be exactly one age per sample column, otherwise the
# ages cannot be paired with the methylation profiles.
n_ages = len(ages)
n_samples = methylation_data.shape[1]
print(n_ages)
print(n_samples)
# Fail loudly instead of relying on a visual comparison of the two numbers.
assert n_ages == n_samples, (
    f"Parsed {n_ages} ages but the methylation table has {n_samples} sample columns"
)

656
656


In [61]:
# Persist the table (index and column headers included, per the to_csv
# defaults) to a CSV file in the working directory.
methylation_data.to_csv(path_or_buf="processed_methylation_data.csv")

In [62]:
# Report the table dimensions as (rows, columns).
n_rows, n_columns = methylation_data.shape
print((n_rows, n_columns))

(473034, 656)
