# Data imputation

In [14]:
import numpy as np
import pandas as pd
from sklearn.neighbors import VALID_METRICS, KNeighborsRegressor

## kNN 

In [2]:
# Load the meteorite catalogue, discard the combined 'GeoLocation' column and
# give the remaining columns shorter names.
df = (
    pd.read_csv("data/Meteorite_Landings.csv")
    .drop(columns=['GeoLocation'])
    .rename(columns={'year': 'date', 'mass (g)': 'mass'})
)
# Characters 6-9 of the raw "MM/DD/YYYY hh:mm:ss AM" string hold the year.
# Rows without a date yield NaN here and therefore never match the comparison.
# NOTE(review): the 1700 cutoff is presumably there because pandas nanosecond
# timestamps cannot represent dates before 1677 - confirm.
landing_year = df['date'].str[6:10].astype(float)
oldIdx = df.loc[landing_year < 1700].index

In [3]:
# Show the rows that are about to be removed.  `oldIdx` holds index *labels*
# (it was taken from `.index`), so the label-based `.loc` is the matching
# indexer; the positional `.iloc` only worked because the freshly loaded frame
# has a default 0..n-1 index.
df.loc[oldIdx]

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
174,Castrovillari,5295,Valid,Stone-uncl,15000.0,Fell,01/01/1583 12:00:00 AM,39.8,16.2
278,Elbogen,7823,Valid,"Iron, IID",107000.0,Fell,12/24/1399 12:00:00 AM,50.18333,12.73333
283,Ensisheim,10039,Valid,LL6,127000.0,Fell,12/23/1491 12:00:00 AM,47.86667,7.35
312,Fünen,10838,Valid,Stone-uncl,,Fell,01/01/1654 12:00:00 AM,55.33333,10.33333
369,Hatford,11855,Valid,Stone-uncl,29000.0,Fell,01/01/1628 12:00:00 AM,51.65,-1.51667
410,Jalandhar,12069,Valid,Iron,1967.0,Fell,01/01/1621 12:00:00 AM,31.0,75.0
627,Minamino,16692,Valid,L,1040.0,Fell,01/01/1632 12:00:00 AM,35.07833,136.93333
657,Mount Vaisi,16805,Valid,Stone-uncl,17000.0,Fell,01/01/1637 12:00:00 AM,44.08333,6.86667
679,Narni,16914,Valid,Stone-uncl,,Fell,12/27/0920 12:00:00 AM,42.51667,12.51667
704,Nogata,16988,Valid,L6,472.0,Fell,12/28/0860 12:00:00 AM,33.725,130.75


In [4]:
# Remove the landings collected in `oldIdx`, then reduce the timestamp strings
# to plain calendar dates.
# Note: this cell is not re-runnable on its own - a second run raises KeyError
# because the labels in `oldIdx` are already gone from `df`.
df = df.drop(index=oldIdx)
df = df.assign(date=pd.to_datetime(df['date']).dt.date)

In [5]:
# Preview the first rows after cleaning; 'date' now holds plain dates.
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01,50.775,6.08333
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95


In [6]:
# Summary statistics for every column (numeric and non-numeric alike); the
# 'count' row shows how many values are present in each column.
df.describe(include='all')

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
count,45694,45694.0,45694,45694,45570.0,45694,45403,38379.0,38379.0
unique,45694,,2,466,,2,243,,
top,Northwest Africa 5483,,Valid,L6,,Found,2003-01-01,,
freq,1,,45619,8284,,44607,3323,,
mean,,26894.679761,,,11956.7,,,-39.168483,61.097057
std,,16862.720822,,,523118.9,,,46.350368,80.653885
min,,1.0,,,0.0,,,-87.36667,-165.43333
25%,,12690.25,,,7.2,,,-76.7147,0.0
50%,,24273.5,,,32.59,,,-71.5,35.66667
75%,,40664.75,,,202.0075,,,0.0,157.16667


In [7]:
def impute(column, df, Regressor=KNeighborsRegressor, **regr_kwargs):
    """Predict the missing values of ``column`` from the other columns of ``df``.

    A regressor is fitted on the rows of ``df`` that contain no missing value
    at all and is then asked to predict ``column`` for every row in which
    ``column`` is NaN while all other columns are present.  ``df`` itself is
    not modified; the caller writes the predictions back, e.g.
    ``df.loc[imputable, column] = y_t``.

    Parameters
    ----------
    column : label
        Name of the column whose missing values are to be predicted.
    df : pandas.DataFrame
        Input data.  Every column other than ``column`` is handed to the
        regressor as a feature.
    Regressor : callable, default KNeighborsRegressor
        Called as ``Regressor(**regr_kwargs)``; the result must provide
        ``fit(X, y)`` (returning the fitted estimator) and ``predict(X)``.
    **regr_kwargs
        Forwarded unchanged to ``Regressor``.

    Returns
    -------
    y_t : array
        Predictions for the imputable rows, in row order.  Empty when no row
        is imputable.
    imputable : numpy.ndarray of bool
        One entry per row of ``df``; True where ``column`` is missing and all
        other columns are present.
    state : dict
        Snapshot of this function's local variables (``regr``, ``X``, ``Y``,
        ``x_t``, ...) for inspection by the caller.

    Notes
    -----
    The function always returns three values.  Previously the "nothing to
    impute" case returned only ``(df, imputable)``, which broke callers that
    unpack three values.
    """
    # The local names below are part of the returned `locals()` snapshot and
    # are read by callers (e.g. state['regr'], state['X']); keep them stable.
    X = df.drop([column], axis=1)
    Y = df[column]

    # Training data: rows without any missing value, target included.
    pure_df = df.dropna()
    pure_x = pure_df.drop([column], axis=1)
    pure_y = pure_df[column]

    regr = Regressor(**regr_kwargs).fit(pure_x, pure_y)

    # A row can be imputed when the target is missing but every feature exists.
    imputable = Y.isna().to_numpy() & X.notna().to_numpy().all(axis=1)
    x_t = X[imputable]
    if imputable.any():
        y_t = regr.predict(x_t)
    else:
        # Nothing to predict: hand back an empty array instead of calling
        # predict() on zero rows, so the return shape stays consistent.
        y_t = Y.to_numpy()[imputable]
    return y_t, imputable, locals()

In [8]:
# Impute the missing masses using only the numeric columns as features, write
# the predictions back into `df` (in place) and display the affected rows.
res, imputable, _ = impute(
    column='mass',
    df=df.select_dtypes(include=['int', 'float']),
)
df.loc[imputable, 'mass'] = res
df.loc[imputable]

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
12,Aire-sur-la-Lys,425,Valid,Unknown,6532.20,Fell,1769-01-01,50.66667,2.33333
38,Angers,2301,Valid,L6,4308.20,Fell,1822-01-01,47.46667,-0.55000
76,Barcelona (stone),4944,Valid,OC,17689.00,Fell,1704-01-01,41.36667,2.16667
93,Belville,5009,Valid,OC,13818.20,Fell,1937-01-01,-32.33333,-64.86667
172,Castel Berardenga,5292,Valid,Stone-uncl,12527.20,Fell,1791-01-01,43.35000,11.50000
...,...,...,...,...,...,...,...,...,...
36812,San Luis,23129,Valid,H,125760.00,Found,1964-01-01,-33.33333,-66.38333
38194,Ur,24125,Valid,Iron,43109.00,Found,NaT,30.90000,46.01667
38285,Weiyuan,24233,Valid,Mesosiderite,8789.00,Found,1978-01-01,35.26667,104.31667
41472,Yamato 792768,28117,Valid,CM2,2226.44,Found,1979-01-01,-71.50000,35.66667


In [9]:
# `dfna`: the fully populated rows, used below as the reference values.
# `dfna2`: an independent copy in which some masses get hidden again.
# NOTE(review): `df` already had predicted masses written into it by the
# imputation cell above, so `dfna` can contain imputed (not measured) masses.
dfna = df.dropna(axis=0, how='any')
dfna2 = dfna.copy(deep=True)

In [10]:
# Hide the known mass of the leading rows so the imputation can be compared
# with the original values afterwards.  `.loc[:100]` is a label-based,
# end-inclusive slice: it covers the rows up to and including index label 100,
# which is not necessarily 100 rows because earlier cells removed rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
dfna2.loc[:100, 'mass'] = np.nan

In [11]:
# Re-predict the masses that were just hidden.  `d` keeps the function's local
# state (fitted regressor, feature frame) for the scoring cell further down.
res, imputable, d = impute(
    column='mass',
    df=dfna2.select_dtypes(include=['int', 'float']),
)
dfna2.loc[imputable, 'mass'] = res
dfna2.loc[imputable]

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
0,Aachen,1,Valid,L5,1618.84,Fell,1880-01-01,50.77500,6.08333
1,Aarhus,2,Valid,H6,1618.84,Fell,1951-01-01,56.18333,10.23333
2,Abee,6,Valid,EH4,14260.20,Fell,1952-01-01,54.21667,-113.00000
3,Acapulco,10,Valid,Acapulcoite,14260.20,Fell,1976-01-01,16.88333,-99.90000
4,Achiras,370,Valid,L6,12321.32,Fell,1902-01-01,-33.16667,-64.95000
...,...,...,...,...,...,...,...,...,...
96,Beni M'hira,5018,Valid,L6,21380.00,Fell,2001-01-01,32.86667,10.80000
97,Benld,5021,Valid,H6,11714.68,Fell,1938-01-01,39.08333,-89.15000
98,Benoni,5023,Valid,H6,8772.20,Fell,1943-01-01,-26.16667,28.41667
99,Bensour,5024,Valid,LL6,10664.60,Fell,2002-01-01,30.00000,-7.00000


In [12]:
# The same rows from the untouched frame, for a side-by-side comparison with
# the predicted masses shown above.
dfna.loc[imputable]

Unnamed: 0,name,id,nametype,recclass,mass,fall,date,reclat,reclong
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01,50.77500,6.08333
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.00000
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.90000
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95000
...,...,...,...,...,...,...,...,...,...
96,Beni M'hira,5018,Valid,L6,19000.0,Fell,2001-01-01,32.86667,10.80000
97,Benld,5021,Valid,H6,1770.5,Fell,1938-01-01,39.08333,-89.15000
98,Benoni,5023,Valid,H6,3880.0,Fell,1943-01-01,-26.16667,28.41667
99,Bensour,5024,Valid,LL6,45000.0,Fell,2002-01-01,30.00000,-7.00000


In [13]:
# Score the fitted regressor on the rows whose mass was hidden, against the
# values kept in `dfna`.  For scikit-learn regressors `score` is R^2, so a
# strongly negative result means the predictions are far worse than always
# predicting the mean mass.
d['regr'].score(
    d['X'].loc[imputable],
    dfna['mass'].loc[imputable],
)

-77.55635328703747