In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import trange, tqdm

In [3]:
from sklearn.datasets import load_diabetes

In [4]:
load_diabetes().keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [5]:
data = load_diabetes()
feature_names = data.feature_names
df = pd.DataFrame(data.data, columns = feature_names)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [7]:

prop_null = 0.1
pr = 'bmi'

data = df.copy()
data.loc[np.random.randint(0,df.shape[0],int(df.shape[0]*prop_null//1)),pr] = None
ind = data[data[pr].isna()].index
cols = list(data.select_dtypes([np.number]).columns) 
cols.remove(pr)
delta_ = np.array(data.fillna(data[pr].mean()).loc[ind, pr] - df.loc[ind, pr])
print('start: ',delta_.mean(),delta_.std())
kmeans = KMeans(n_clusters=1).fit(data.loc[~data.index.isin(ind),cols])
data['cl'] = kmeans.predict(data[cols])
for i in ind:
  data.at[i, pr] = data[data['cl'] == data.at[i,'cl']][pr].mean()
delta = np.abs(np.array(data.loc[ind, pr] - df.loc[ind, pr]))
print('finish: ',delta.mean(),delta.std())     

start:  0.010940947348541733 0.03595385780275364
finish:  0.03207209461178269 0.01958992003549404


In [15]:
data.fillna(data[pr].mean()).loc[ind, pr] - df.loc[ind, pr]

35     0.032035
37    -0.009999
56    -0.040178
57     0.064370
60     0.005090
70     0.070837
77     0.037424
109    0.010479
120   -0.003533
129   -0.015388
150   -0.069279
162   -0.029400
170    0.021257
173    0.080537
176   -0.018622
190    0.013712
197   -0.002455
202   -0.000299
211   -0.035867
216   -0.034789
219    0.042813
222    0.026646
226    0.047125
234   -0.038023
246    0.033113
253   -0.032633
254   -0.055268
267    0.001857
294    0.032035
296    0.062214
308    0.047125
342   -0.020778
348    0.021257
356    0.034191
379    0.039580
384    0.030957
392    0.030957
399   -0.021855
421   -0.015388
426    0.035269
430    0.057903
435    0.024491
Name: bmi, dtype: float64

In [8]:
_=[print(f" {x:4.3f} -> {y:4.3f}  \t{'+' if np.abs(x) > np.abs(y) else '-'}") for x,y in zip(delta_,(df - data).loc[ind,pr])]

 0.032 -> -0.032  	-
 -0.010 -> 0.010  	-
 -0.040 -> 0.040  	-
 0.064 -> -0.064  	-
 0.005 -> -0.005  	+
 0.071 -> -0.071  	-
 0.037 -> -0.037  	-
 0.010 -> -0.010  	-
 -0.004 -> 0.004  	-
 -0.015 -> 0.015  	-
 -0.069 -> 0.069  	-
 -0.029 -> 0.029  	-
 0.021 -> -0.021  	-
 0.081 -> -0.081  	-
 -0.019 -> 0.019  	-
 0.014 -> -0.014  	-
 -0.002 -> 0.002  	+
 -0.000 -> 0.000  	+
 -0.036 -> 0.036  	-
 -0.035 -> 0.035  	-
 0.043 -> -0.043  	-
 0.027 -> -0.027  	-
 0.047 -> -0.047  	-
 -0.038 -> 0.038  	-
 0.033 -> -0.033  	-
 -0.033 -> 0.033  	-
 -0.055 -> 0.055  	-
 0.002 -> -0.002  	-
 0.032 -> -0.032  	-
 0.062 -> -0.062  	-
 0.047 -> -0.047  	-
 -0.021 -> 0.021  	+
 0.021 -> -0.021  	-
 0.034 -> -0.034  	-
 0.040 -> -0.040  	-
 0.031 -> -0.031  	-
 0.031 -> -0.031  	-
 -0.022 -> 0.022  	+
 -0.015 -> 0.015  	-
 0.035 -> -0.035  	-
 0.058 -> -0.058  	-
 0.024 -> -0.024  	-
