# ANCOVA analysis

In [1]:
# TODO: include a PCA colored by group as well as by the covariates
# This is a new update

Import the required packages and load the data.

In [2]:
import pandas as pd
import acore.differential_regulation as ad
from pathlib import Path
import numpy as np

# Relative folder path; presumably where outputs are written (not used in the cells shown here -- TODO confirm).
folder_out = Path("data")

In [3]:
# Remote CSV files from the njab tutorial (alzheimer data folder); they are read directly by URL.
CLINIC: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv'  # clinical data
OMICS: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv'  # omics data

In [4]:
# Load both tables, using the first CSV column as the row index.
omics = pd.read_csv(OMICS, index_col=0)
clinic = pd.read_csv(CLINIC, index_col=0)
# Let pandas choose the best-fitting (nullable) dtype for every clinical column.
clinic = clinic.convert_dtypes()

In [None]:
# Preview the clinical table read from the CLINIC URL.
clinic

In [None]:
# Preview the omics table read from the OMICS URL (still unfiltered at this point).
omics

### Filtering data

If data is already filtered and/or imputed, skip this step. 

In [7]:
# Filtering parameters
# Minimum fraction of samples in which a feature must have a non-missing value
# in order to be kept (it is turned into the `thresh` of `dropna` in the next cell).
freq_cutoff = 0.7


In [None]:
# Drop features (columns) that are observed in too few samples.
# `dropna(thresh=k, axis=1)` keeps a column only if it has at least k non-missing
# values, so `freq_cutoff` is the minimum fraction of samples with a measured value.
# NOTE: this cell overwrites `omics` (filter, rename, log2). Re-running it without
# re-loading the data applies the log2 transform a second time.
M_before = omics.shape[1]
omics = omics.dropna(thresh=int(len(omics) * freq_cutoff), axis=1)
M_after = omics.shape[1]
# The dropped features are the ones missing in more than (1 - freq_cutoff) of the
# samples, which is what the message has to report (30% for a cutoff of 0.7).
msg = (
    f"Removed {M_before-M_after} features with more than {1 - freq_cutoff:.0%} missing values."
    f"\nRemaining features: {M_after} (of {M_before})")
print(msg)
# Map every column name to its first ";"-separated entry, i.e. only the first protein
# of a protein group is used as the column label.
# The author notes that proteins are unique to protein groups; if that did not hold,
# the rename below would create duplicate column names.
pg_map = {k: k.split(";")[0] for k in omics.columns}
omics = omics.rename(columns=pg_map)
# log2 transform raw intensity data (the +1 keeps zero intensities finite: log2(1) = 0):
omics = np.log2(omics + 1)
omics

Consider replacing with the filter from the acore package!

### Preparing metadata

In [None]:
# Print the summary (dtype, non-null count) of the `age` column.
clinic['age'].info()

In [None]:
# Attach the clinical columns to the omics features by aligning on the row index.
# This is a left join: every row of `omics` is kept.
clinic_omics = omics.join(clinic, how='left')
clinic_omics

In [11]:
# Remove the clinical columns that are not needed for the completeness check;
# `AD` is not dropped because it is used as the grouping column afterwards.
omics_group = clinic_omics.drop(
    ['Kiel', 'Magdeburg', 'Sweden', 'male', 'age'],
    axis='columns',
)

In [None]:
# Preview the joined table after dropping the Kiel, Magdeburg, Sweden, male and age columns.
omics_group

### Checking missing data

In [None]:
# Fraction of samples per `AD` group that have a non-missing value, for every feature.
# The group sizes are taken from the same frame that is being counted, so numerator and
# denominator always refer to the same set of samples, even if `clinic` and `omics`
# do not cover exactly the same samples.
data_completeness = (
    omics_group
    .groupby("AD")
    .count()
    .divide(omics_group["AD"].value_counts(), axis=0)
)
data_completeness

In [None]:
# One point per feature and group; features are ordered by their completeness in the
# group labelled 0. Axis labels and a title make the figure readable on its own.
ax = data_completeness.T.sort_values(0).plot(
    style='.',
    ylim=(0, 1),
    xlabel='features (sorted by completeness in group 0)',
    ylabel='fraction of samples with a value',
    title='Data completeness per AD group',
)

### Running ANCOVA analysis

In [None]:
# Show the joined omics + clinical table that is the input for the ANCOVA calls.
clinic_omics

In [None]:
# Print the summary (dtype, non-null count) of the row index.
clinic_omics.index.to_series().info() 

In [None]:
# Count how many columns there are of each dtype.
clinic_omics.dtypes.value_counts()

In [None]:
# Pick one feature, the grouping column and one covariate, then show only those columns.
group = "AD"
covariates = ['male']
col = 'A0A024QZX5'
clinic_omics[[group, col, *covariates]]

In [None]:
# ANCOVA for a single feature, run on a float-converted copy of the joined table
# (`clinic_omics` itself is left untouched by this cell).
ad.calculate_ancova(
    clinic_omics.astype(float),
    column='A0A024QZX5',
    group="AD",
    covariates=['male'],
)

In [None]:
# Show the joined table again; the previous cell passed a converted copy to
# calculate_ancova and did not reassign `clinic_omics`.
clinic_omics

In [None]:
# Convert the whole joined table to float and keep the converted frame for the later cells.
# Author's note: this conversion is not needed for run_ancova (the regex where groups are joined).
clinic_omics = clinic_omics.astype('float')
ad.calculate_ancova(
    clinic_omics,
    column='A0A024QZX5',
    group="AD",
    covariates=['male'],
)

In [25]:

# ANCOVA via `run_ancova` with `male` as covariate; the listed clinical columns are
# passed as `drop_cols`.
ancova = ad.run_ancova(
    clinic_omics.astype({'AD': str}),  # the group column needs to be a string
    # subject='Sample ID',  # not used
    group='AD',
    covariates=['male'],  # open question from the author: do these need to be floats?
    drop_cols=['Kiel', 'Magdeburg', 'Sweden', 'age'],
)

In [25]:
# ANOVA via `run_anova`, grouped by `AD`. The index is turned into a regular column
# first; presumably it is named 'Sample ID' so that it can be passed as `subject`
# (TODO confirm). `male` is dropped here as well, since no covariate is used.
anova = ad.run_anova(
    clinic_omics.reset_index(),
    group='AD',
    subject='Sample ID',
    drop_cols=['Kiel', 'Magdeburg', 'Sweden', 'age', 'male'],
)

### Running ANOVA analysis (optional)

### Comparing ANOVA and ANCOVA results