In [29]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import torch
from tqdm import tqdm

In [30]:
# The different imputation strategies
def half_min_imputer(X):
    """Fill every missing entry with half of the smallest observed value in its row.

    Parameters
    ----------
    X : array-like, 2-D
        Numeric matrix whose missing entries are encoded as NaN. The input
        is copied and never modified.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``pd.DataFrame(X)``. Rows that
        contain no observed value at all stay NaN, because their row minimum
        is itself NaN.
    """
    # Going through DataFrame keeps the original shape handling; copy=True
    # guarantees that the caller's buffer is not written to.
    values = pd.DataFrame(X).to_numpy(dtype=float, copy=True)

    # pandas' min skips NaN and quietly yields NaN for all-NaN rows.
    half_row_min = pd.DataFrame(values).min(axis=1).to_numpy() / 2

    # Broadcast the per-row fill value over the columns in one vectorised step
    # instead of assigning cell by cell.
    return np.where(np.isnan(values), half_row_min[:, np.newaxis], values)

In [31]:
def knn_imputer(X, n_neighbors=5):
    """Impute missing values in ``X`` with scikit-learn's ``KNNImputer``.

    ``n_neighbors`` is forwarded unchanged to the imputer; the fitted and
    transformed matrix is returned.
    """
    return KNNImputer(n_neighbors=n_neighbors).fit_transform(X)

In [32]:
def als_imputer(X, l=20, regularization=10**2, iterations=100, return_repr = False):
    """Impute non-finite entries of ``X`` with a weighted low-rank ALS factorisation.

    Parameters
    ----------
    X : array-like, 2-D
        Data matrix; every non-finite entry (NaN or +/-inf) is treated as
        unobserved.
    l : int
        Number of latent factors passed to ``run_weighted_als``.
    regularization : float
        Ridge penalty passed to ``run_weighted_als``.
    iterations : int
        Number of alternating updates passed to ``run_weighted_als``.
    return_repr : bool
        If True, return the full low-rank reconstruction for every cell.
        If False (default), observed cells are restored to their original
        values and only the unobserved cells come from the reconstruction.

    Returns
    -------
    numpy.ndarray
        Matrix of the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    W = np.isfinite(X)  # observation mask, used as 0/1 weights by the solver

    # Zero out unobserved cells explicitly. np.nan_to_num would turn +/-inf
    # into huge finite floats that overflow to inf again once the solver casts
    # to float32, and inf * 0 in the weighted products would then produce NaN.
    X = np.where(W, X, 0.0)

    X0, Y0 = run_weighted_als(X, W, l, iterations, regularization, X0='rnd', Y0='rnd', seed=0)
    X_imputed = np.matmul(X0.T, Y0)

    if not return_repr:
        # Keep measured values exactly as they were.
        X_imputed[W] = X[W]
    return X_imputed
    
    
def run_weighted_als(X,W,l,iterations,regularization, X0='rnd', Y0='rnd', seed=0):
    """Run weighted alternating least squares and return both factor matrices.

    Parameters
    ----------
    X : array-like, shape (n0, n1)
        Data matrix (unobserved cells should already hold a finite placeholder).
    W : array-like, shape (n0, n1)
        Per-cell weights; converted to float32 before use.
    l : int
        Number of latent factors.
    iterations : int
        Number of alternating update rounds; must be at least 1.
    regularization : float
        Ridge penalty forwarded to ``iterate_gpu``.
    X0, Y0 : str or array-like
        Leave both as strings (default ``'rnd'``) to start from a random
        ``Y0`` of shape (l, n1), or provide both explicitly. Only ``Y0`` is
        used as a starting point: ``X0`` is recomputed by the first update.
    seed : int
        Seed for the random ``Y0``; note that this reseeds NumPy's global RNG.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X0`` with shape (l, n0) and ``Y0`` with shape (l, n1).

    Raises
    ------
    ValueError
        If exactly one of ``X0``/``Y0`` is supplied, or ``iterations < 1``.
    """
    x0_is_placeholder = isinstance(X0, str)
    y0_is_placeholder = isinstance(Y0, str)
    if x0_is_placeholder != y0_is_placeholder:
        # Previously this only printed a message and then failed further down.
        raise ValueError('Please provide both X0 and Y0')
    if iterations < 1:
        # With zero rounds there would be no X0 factor to return.
        raise ValueError('iterations must be at least 1')

    n0, n1 = X.shape

    if y0_is_placeholder:
        np.random.seed(seed)
        Y0 = np.random.rand(l, n1)

    # Use the GPU when one is available, otherwise stay on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X = torch.as_tensor(X, dtype=torch.float32, device=device)
    W = torch.as_tensor(W, dtype=torch.float32, device=device)
    Y0 = torch.as_tensor(Y0, dtype=torch.float32, device=device)

    for _ in tqdm(range(iterations)):
        # Alternate: solve for the row factors given Y0, then for the column
        # factors given the freshly updated X0.
        X0 = iterate_gpu(n0, X, W, Y0, l, regularization, 0)
        Y0 = iterate_gpu(n1, X, W, X0, l, regularization, 1)

    return X0.cpu().numpy(), Y0.cpu().numpy()

# All of them should be tensors
def iterate_gpu(n0, X, W, Y0, l, regularization, pos):
    """Solve one half-step of weighted ALS: a ridge regression per row or column of ``X``.

    For every index ``i`` the system ``(Y0 diag(w_i) Y0^T + regularization * I) x = Y0 (t_i * w_i)``
    is solved, where ``w_i`` / ``t_i`` are the i-th row (``pos == 0``) or the
    i-th column (``pos == 1``) of ``W`` / ``X``.

    Parameters
    ----------
    n0 : int
        Number of systems to solve (rows of ``X`` for ``pos == 0``, columns
        for ``pos == 1``).
    X, W : torch.Tensor
        Data and weight matrices of identical shape.
    Y0 : torch.Tensor, shape (l, k)
        Fixed factor matrix; ``k`` must match the length of the selected
        rows/columns.
    l : int
        Number of latent factors.
    regularization : float
        Ridge penalty added to the diagonal of every system.
    pos : int
        0 to iterate over rows of ``X``, 1 to iterate over columns.

    Returns
    -------
    torch.Tensor, shape (l, n0)
        The solved factors, allocated on the same device as ``Y0``.

    Raises
    ------
    ValueError
        If ``pos`` is neither 0 nor 1.
    """
    if pos == 0:
        weights, targets = W, X
    elif pos == 1:
        # Transposed views let the loop below always index by row.
        weights, targets = W.T, X.T
    else:
        raise ValueError(f'pos must be 0 or 1, got {pos!r}')

    # Follow the inputs' device/dtype instead of forcing CUDA, so the same
    # code also runs on CPU tensors.
    device, dtype = Y0.device, Y0.dtype
    solution = torch.zeros((l, n0), device=device, dtype=dtype)
    ridge = regularization * torch.eye(l, device=device, dtype=dtype)

    for i in range(n0):
        w_i = weights[i]
        t_i = targets[i]

        weighted_factors_t = Y0.T * w_i[:, None]
        M = torch.matmul(Y0, weighted_factors_t) + ridge
        b = torch.matmul(Y0, t_i * w_i)
        solution[:, i] = torch.linalg.solve(M, b)
    return solution

In [33]:
def impute_normal_down_shift_distribution(X ,column_wise=True, width=0.3, downshift=1.8, seed=2):
    """Impute missing values by sampling from a narrowed, down-shifted normal distribution.

    The matrix is transposed internally, so with ``column_wise=True`` the
    distribution parameters are estimated separately for every row of ``X``.
    Reference: https://rdrr.io/github/jdreyf/jdcbioinfo/man/impute_normal.html#google_vignette

    Parameters
    ----------
    X : array-like, 2-D
        Numeric matrix; NaN, ``pd.NA`` and +/-inf are all treated as missing.
        The input is copied and never modified.
    column_wise : bool
        If True, use the mean/std of each row of ``X``; if False, use the
        mean/std of the whole matrix for every row.
    width : float
        Scale factor for the standard deviation of the imputed distribution
        relative to the sample standard deviation.
    downshift : float
        How far the mean of the imputed distribution is shifted below the
        sample mean, in units of the sample standard deviation.
    seed : int
        Random seed; note that this reseeds NumPy's global RNG.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X`` in which missing cells are replaced
        by random draws and observed cells are unchanged.
    """
    # Work on an independent float copy, transposed so that columns of
    # ``matrix`` correspond to rows of ``X``.
    matrix = (
        pd.DataFrame(X)
        .replace({pd.NA: np.nan})  # pandas NA -> numpy NaN
        .to_numpy(dtype=float, copy=True)
        .T
    )
    matrix[~np.isfinite(matrix)] = np.nan

    if not column_wise:
        # Statistics over the whole matrix, shared by every vector.
        main_mean = np.nanmean(matrix)
        main_std = np.nanstd(matrix)

    np.random.seed(seed=seed)

    # Vectors are processed in order so the sequence of random draws is
    # reproducible for a given seed.
    for j in range(matrix.shape[1]):
        vector = matrix[:, j]  # view: assignments land in ``matrix``
        missing = np.isnan(vector)
        n_missing = int(np.count_nonzero(missing))
        if n_missing == 0:
            continue

        if column_wise:
            vector_sd = np.nanstd(vector)
            vector_mean = np.nanmean(vector)
        else:
            vector_sd = main_std
            vector_mean = main_mean

        shrunk_sd = width * vector_sd
        downshifted_mean = vector_mean - (downshift * vector_sd)
        vector[missing] = np.random.normal(loc=downshifted_mean, scale=shrunk_sd, size=n_missing)

    return matrix.T

In [34]:
def comp_imputation(X, imputation_method = 'knn', n_neighbors=5, l=2, regularization=10**0, iterations=20, return_repr=False):
    """Impute missing values of ``X`` with the selected strategy.

    Parameters
    ----------
    X : array-like
        Matrix with missing values; a copy is handed to the imputer.
    imputation_method : str
        One of ``'knn'``, ``'als'``, ``'half_min'``, ``'lower_normal'`` or
        ``'zeros'``.
    n_neighbors : int
        Only used by ``'knn'``.
    l, regularization, iterations, return_repr
        Only used by ``'als'``; forwarded to ``als_imputer``.

    Returns
    -------
    numpy.ndarray
        The imputed matrix.

    Raises
    ------
    ValueError
        If ``imputation_method`` is not one of the supported names.
    """
    X0 = X.copy()

    if imputation_method=='knn':
        X_imputed_full = knn_imputer(X0, n_neighbors=n_neighbors)
    elif imputation_method=='als':
        X_imputed_full = als_imputer(X0, l=l, regularization=regularization, iterations=iterations, return_repr=return_repr)
    elif imputation_method=='half_min':
        X_imputed_full = half_min_imputer(X0)
    elif imputation_method=='lower_normal':
        X_imputed_full = impute_normal_down_shift_distribution(X0)
    elif imputation_method=='zeros':
        X_imputed_full = np.nan_to_num(X0)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown imputation_method {imputation_method!r}; expected one of "
            "'knn', 'als', 'half_min', 'lower_normal', 'zeros'"
        )

    return X_imputed_full

In [35]:
# Directory holding the pickled input tables. This is an absolute,
# machine-specific path; it already ends in '/', so the f-strings below
# produce a doubled slash in the file names.
Path = '/home/icb/manuel.gander/Atl/data/'
# Load the two input tables that every imputation cell below starts from.
M0 = pd.read_pickle(f'{Path}/M0_new.pkl')
M1 = pd.read_pickle(f'{Path}/M1_new.pkl')

In [None]:
# Minimum fraction of finite values a row must exceed to be kept.
fr = 0.5
# Overwrite M0 / M1 with only those rows whose count of finite entries is
# greater than fr * (number of columns).
# NOTE(review): this cell has no execution count, and it reassigns M0/M1 in
# place -- confirm it is run before the imputation cells.
M0 = M0[np.isfinite(M0).sum(1)>fr*M0.shape[1]].copy()
M1 = M1[np.isfinite(M1).sum(1)>fr*M1.shape[1]].copy()

In [36]:
# Display the M0 table.
M0

Unnamed: 0,786O_NCI60,A498_NCI60,A549_NCI60,ACHN_NCI60,BT549_NCI60,C106[HUMANRECTALADENOCARCINOMA]_CRC65,C10_CRC65,C125PM_CRC65,WT2IPS_CRC65,C70_CRC65,...,SW837_CRC65,SW948_CRC65,T47D_NCI60,T84_CRC65,TK10_NCI60,UO31_NCI60,U251MG_NCI60,UACC257_NCI60,UACC62_NCI60,VACO4A_CRC65
_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGRGS(Phospho (STY))GPGR_,,,,,0.978104,,,,,0.945930,...,,0.915419,,,,,,,,
_(Acetyl (Protein N-term))AAAAAAAGDS(Phospho (STY))DSWDADAFSVEDPVRK_,1.04749,1.025022,1.086681,1.065761,,,,,,,...,0.959436,0.970695,,,,,1.095909,,,0.948513
_(Acetyl (Protein N-term))AAAAAAAGDS(Phospho (STY))DSWDADAFSVEDPVR_,,,,,,,,,,,...,,0.965274,,,,,1.036433,1.012036,,
_(Acetyl (Protein N-term))AAAAAAAGDSDS(Phospho (STY))WDADAFSVEDPVRK_,,,,,,,,,,,...,,,,,,,,,,
_(Acetyl (Protein N-term))AAAAAAGS(Phospho (STY))GTPR_,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
_YYYSDNFFDGQR_,,,,,,,,,,,...,0.955076,,,,,,,,,
_YYYVPADFVEYEK_,,,,,,,,,,,...,,,,,,,,,,
_YYYVQNVYTPVDEHVYPDHR_,,,,,,,,,,,...,,,,,,,,,,
_YYYWAVNPQDR_,,,,,,,,,,0.818923,...,,,,,,,,,,


In [37]:
# Fraction threshold used by the row-filter expression in the next cell.
fr = 0.5

In [38]:
# Preview only: rows of M0 with more than fr * (number of columns) finite
# values. The result is displayed, not assigned back to M0.
M0[np.isfinite(M0).sum(1)>fr*M0.shape[1]].copy()

Unnamed: 0,786O_NCI60,A498_NCI60,A549_NCI60,ACHN_NCI60,BT549_NCI60,C106[HUMANRECTALADENOCARCINOMA]_CRC65,C10_CRC65,C125PM_CRC65,WT2IPS_CRC65,C70_CRC65,...,SW837_CRC65,SW948_CRC65,T47D_NCI60,T84_CRC65,TK10_NCI60,UO31_NCI60,U251MG_NCI60,UACC257_NCI60,UACC62_NCI60,VACO4A_CRC65
_(Acetyl (Protein N-term))AAAAPDSRVS(Phospho (STY))EEENLKK_,0.976527,1.057791,1.054388,0.978725,1.001171,1.034179,1.026931,1.088462,1.061566,1.094806,...,1.104020,1.035869,0.963763,1.037572,1.000599,0.889982,0.893031,1.061776,0.994139,1.048033
_(Acetyl (Protein N-term))AADVSVTHRPPLS(Phospho (STY))PK_,1.223542,1.202007,1.200904,1.230707,1.198315,1.224232,1.259685,1.270776,1.260778,1.262135,...,1.231507,1.219763,1.193633,1.216537,1.196641,1.198910,1.191779,1.243015,1.229541,1.216584
_(Acetyl (Protein N-term))AAGGDHGS(Phospho (STY))PDSYR_,1.078057,1.005695,1.052447,1.071657,1.086833,1.035705,1.127805,1.132475,1.154335,1.120966,...,1.099563,1.063837,1.092430,1.078036,1.031596,1.048199,1.100567,1.116600,1.059064,1.113719
_(Acetyl (Protein N-term))AALHTTPDS(Phospho (STY))PAAQLER_,1.104667,1.068233,1.051667,1.093339,1.088754,1.184236,1.225934,1.216141,1.209548,1.199058,...,1.174590,1.189718,1.111037,1.231250,1.139549,1.091763,1.159258,1.144764,1.142235,1.216607
_(Acetyl (Protein N-term))AAPCGSELPANS(Phospho (STY))PLKIPK_,1.070899,,1.072016,,1.035107,,1.023428,1.019477,1.005958,1.030077,...,,,1.074596,1.024865,1.009433,1.086807,1.078162,,,1.005198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
_YYTLFGR_,1.010620,1.010768,1.012125,1.049699,1.016403,1.094778,1.044611,1.100588,1.062691,1.049033,...,1.064545,1.034938,1.064740,1.022214,1.025650,1.003201,,0.981449,1.038932,1.059026
_YYTSASGDEM(Oxidation (M))VSLK_,,,,0.988258,,0.974976,0.947751,0.934989,0.998613,0.905454,...,0.931198,0.919863,,0.885936,0.960309,,,,,0.900082
_YYTVFDR_,1.100830,1.128398,1.079222,1.056036,0.987953,1.149558,1.125052,1.065194,0.996135,1.163393,...,1.039867,1.087775,1.162621,1.111080,1.072521,1.094906,,1.073754,1.078480,1.137661
_YYVTIIDAPGHR_,1.116628,1.107500,1.160984,1.147736,1.122919,1.231098,1.216995,1.185549,1.226012,1.200297,...,1.147666,1.159468,1.153718,1.160728,1.154399,1.144184,1.149871,1.191496,1.179294,1.149670


In [39]:
# Load and display 'knn_imp_M0.pkl' from the working directory (the file name
# written by the KNN cell for M0).
pd.read_pickle('knn_imp_M0.pkl')

Unnamed: 0,786O_NCI60,A498_NCI60,A549_NCI60,ACHN_NCI60,BT549_NCI60,C106[HUMANRECTALADENOCARCINOMA]_CRC65,C10_CRC65,C125PM_CRC65,WT2IPS_CRC65,C70_CRC65,...,SW837_CRC65,SW948_CRC65,T47D_NCI60,T84_CRC65,TK10_NCI60,UO31_NCI60,U251MG_NCI60,UACC257_NCI60,UACC62_NCI60,VACO4A_CRC65
_(Acetyl (Protein N-term))AAAAPDSRVS(Phospho (STY))EEENLKK_,0.976527,1.057791,1.054388,0.978725,1.001171,1.034179,1.026931,1.088462,1.061566,1.094806,...,1.104020,1.035869,0.963763,1.037572,1.000599,0.889982,0.893031,1.061776,0.994139,1.048033
_(Acetyl (Protein N-term))AADVSVTHRPPLS(Phospho (STY))PK_,1.223542,1.202007,1.200904,1.230707,1.198315,1.224232,1.259685,1.270776,1.260778,1.262135,...,1.231507,1.219763,1.193633,1.216537,1.196641,1.198910,1.191779,1.243015,1.229541,1.216584
_(Acetyl (Protein N-term))AAGGDHGS(Phospho (STY))PDSYR_,1.078057,1.005695,1.052447,1.071657,1.086833,1.035705,1.127805,1.132475,1.154335,1.120966,...,1.099563,1.063837,1.092430,1.078036,1.031596,1.048199,1.100567,1.116600,1.059064,1.113719
_(Acetyl (Protein N-term))AALHTTPDS(Phospho (STY))PAAQLER_,1.104667,1.068233,1.051667,1.093339,1.088754,1.184236,1.225934,1.216141,1.209548,1.199058,...,1.174590,1.189718,1.111037,1.231250,1.139549,1.091763,1.159258,1.144764,1.142235,1.216607
_(Acetyl (Protein N-term))AAPCGSELPANS(Phospho (STY))PLKIPK_,1.070899,1.052665,1.072016,1.048278,1.035107,1.015392,1.023428,1.019477,1.005958,1.030077,...,1.033446,1.011494,1.074596,1.024865,1.009433,1.086807,1.078162,1.058991,1.032722,1.005198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
_YYTLFGR_,1.010620,1.010768,1.012125,1.049699,1.016403,1.094778,1.044611,1.100588,1.062691,1.049033,...,1.064545,1.034938,1.064740,1.022214,1.025650,1.003201,1.026984,0.981449,1.038932,1.059026
_YYTSASGDEM(Oxidation (M))VSLK_,0.942249,0.966453,0.980524,0.988258,0.956708,0.974976,0.947751,0.934989,0.998613,0.905454,...,0.931198,0.919863,0.976233,0.885936,0.960309,0.962513,0.955094,0.969252,0.965162,0.900082
_YYTVFDR_,1.100830,1.128398,1.079222,1.056036,0.987953,1.149558,1.125052,1.065194,0.996135,1.163393,...,1.039867,1.087775,1.162621,1.111080,1.072521,1.094906,1.074464,1.073754,1.078480,1.137661
_YYVTIIDAPGHR_,1.116628,1.107500,1.160984,1.147736,1.122919,1.231098,1.216995,1.185549,1.226012,1.200297,...,1.147666,1.159468,1.153718,1.160728,1.154399,1.144184,1.149871,1.191496,1.179294,1.149670


## KNN

In [8]:
# KNN imputation of M0 with 20 neighbours.
# NOTE(review): whether M0 is row-filtered here depends on whether the filter
# cell was executed beforehand -- confirm.
df0 = M0.copy()

imputation_method = 'knn'
n=20
X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, n_neighbors=n)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('knn_imp_M0.pkl')

In [9]:
# KNN imputation of M1 with 20 neighbours.
df0 = M1.copy()

imputation_method = 'knn'
n=20
X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, n_neighbors=n)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('knn_imp_M1.pkl')

## ALS

In [10]:
# ALS imputation of M0: 20 latent factors, regularization 1. `iterations` is
# not passed, so comp_imputation's default applies. return_repr is left at
# False, so observed values are kept and only missing cells are filled.
imputation_method = 'als'
l=20
regularization = 1
df0 = M0.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, regularization=regularization, l=l)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('als_imp_M0.pkl')

100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:33<00:00,  4.66s/it]


In [11]:
# ALS imputation of M1: 20 latent factors, regularization 10 (M0 above uses 1).
# `iterations` is not passed, so comp_imputation's default applies.
imputation_method = 'als'
l=20
regularization = 10
df0 = M1.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, regularization=regularization, l=l)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('als_imp_M1.pkl')

100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:45<00:00,  2.27s/it]


In [12]:
# "Full" ALS output for M0: same settings as the ALS run for M0, but with
# return_repr=True, so the saved table is the low-rank reconstruction for
# every cell (observed values are not restored).
imputation_method = 'als'
l=20
regularization = 1
df0 = M0.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, regularization=regularization, 
                        l=l, return_repr=True)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('fals_imp_M0.pkl')

100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [02:10<00:00,  6.51s/it]


In [13]:
# "Full" ALS output for M1: same settings as the ALS run for M1, but with
# return_repr=True, so the saved table is the low-rank reconstruction for
# every cell (observed values are not restored).
imputation_method = 'als'
l=20
regularization = 10
df0 = M1.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method, regularization=regularization, 
                        l=l, return_repr=True)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('fals_imp_M1.pkl')

100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:53<00:00,  2.68s/it]


## Half min

In [14]:
# Half-minimum imputation of M0: each missing cell gets half of its row's
# smallest observed value.
imputation_method = 'half_min'
df0 = M0.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('hlm_imp_M0.pkl')

In [15]:
# Half-minimum imputation of M1: each missing cell gets half of its row's
# smallest observed value.
imputation_method = 'half_min'
df0 = M1.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('hlm_imp_M1.pkl')

## Lower Normal

In [16]:
# Down-shifted normal imputation of M0, using the defaults of
# impute_normal_down_shift_distribution (comp_imputation passes no options).
imputation_method = 'lower_normal'
df0 = M0.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('lwn_imp_M0.pkl')

In [17]:
# Down-shifted normal imputation of M1, using the defaults of
# impute_normal_down_shift_distribution (comp_imputation passes no options).
imputation_method = 'lower_normal'
df0 = M1.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('lwn_imp_M1.pkl')

## Zeros

In [18]:
# Zero imputation of M0: comp_imputation applies np.nan_to_num, so NaN
# becomes 0.
imputation_method = 'zeros'
df0 = M0.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('zer_imp_M0.pkl')

In [19]:
# Zero imputation of M1: comp_imputation applies np.nan_to_num, so NaN
# becomes 0.
imputation_method = 'zeros'
df0 = M1.copy()

X_new = comp_imputation(df0.values.copy(), imputation_method = imputation_method)

# Re-attach the original row/column labels and save to the working directory.
dfn = pd.DataFrame(data=X_new, columns=df0.columns, index=df0.index)
dfn.to_pickle('zer_imp_M1.pkl')