In [1]:
from KNN_KDE_Bagging_Imputer import KKBImputer
import matplotlib.pyplot as plt
from utils import select_param_rmse_b,knnxkde_param_rmse
from utils import introduce_miss
from utils import normalization,renormalization
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from knnxkde import KNNxKDE
from sklearn.model_selection import GridSearchCV, KFold
import time

# Abalone

In [2]:
# Seed NumPy's global RNG before any random work in this cell
# (presumably introduce_miss draws from it -- TODO confirm).
np.random.seed(42)

# The CSV has no header row; work on the raw value matrix.
df = pd.read_csv('abalone.csv', header=None)
ab_origin = df.values

# 400 test rows, 3777 training rows, of which 2266 are complete and 1511
# contain missing data.
ab_test = ab_origin[-400:].copy()
ab_pool = ab_origin[:-400]

# introduce_miss returns the rows kept complete and the rows given missing
# entries (0.6 is presumably the fraction kept complete -- TODO confirm).
ab_com, ab_miss = introduce_miss(ab_pool, 0.6)

# Complete rows first, rows with missing entries after them.
ab_train = np.vstack([ab_com, ab_miss])

In [3]:
# Normalise the training matrix and keep the parameters returned alongside it
# (the exact scaling is defined in utils.normalization).
norm_miss_data, norm_params = normalization(ab_train)

# Normalise the test split with the training parameters; the parameters
# returned by this second call are not needed and are discarded.
norm_test_data, _ = normalization(ab_test, parameters=norm_params)

In [6]:
np.random.seed(42)

# Boundary between complete rows and rows with missing entries.
# ab_train stacks ab_com on top of ab_miss, so the first len(ab_com) rows of
# norm_miss_data are the complete ones. Deriving the boundary here (instead of
# hard-coding 2266) keeps the split correct if the test size or the ratio
# passed to introduce_miss is changed.
n_complete = len(ab_com)

# Hyper-parameter search for the KNN-KDE-bagging pipeline. The call is the
# last expression of the cell so its value (best parameter dict, best RMSE)
# is displayed as the cell output.
select_param_rmse_b(
    norm_miss_data[:n_complete],
    norm_miss_data[n_complete:],
    [5, 10, 15],                          # B
    [0.6, 0.7, 0.8, 0.85, 0.9],           # s
    [0.001, 0.005, 0.01, 0.05, 0.1],      # k
    [0.0005, 0.001, 0.003, 0.005, 0.01],  # h
    [0.003],                              # K
)

B: 5 => B: 10 => B: 15 => {'B': 5, 's': 0.6, 'k': 0.001, 'h': 0.001, 'K': 0.003} 0.03301387712623689


({'B': 5, 's': 0.6, 'k': 0.001, 'h': 0.001, 'K': 0.003}, 0.03301387712623689)

In [4]:
# Evaluate the KNN-KDE-bagging imputer: impute the training matrix, fit a KNN
# regressor on the imputed data and score it on the normalised test split.
# The procedure is repeated 20 times and the RMSE / run time are averaged.
np.random.seed(42)
result = []             # test RMSE of each repetition
training_duration = []  # seconds spent on imputation + fitting, per repetition
for i in range(20):
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock changes.
    start_time = time.perf_counter()
    imputer = KKBImputer(B=5, s_ratio=0.6, n_neighbors_ratio=0.001, h=0.001)
    imp_data_f = imputer.impute_pattern(norm_miss_data)
    # Neighbour count is 0.3% of the imputed rows; clamp to 1 because
    # KNeighborsRegressor rejects n_neighbors=0 on very small inputs.
    best_k = max(1, int(round(imp_data_f.shape[0] * 0.003)))
    knn_regressor = KNeighborsRegressor(n_neighbors=best_k)
    # Last column is the regression target, all others are features.
    knn_regressor.fit(imp_data_f[:, :-1], imp_data_f[:, -1])
    end_time = time.perf_counter()
    training_duration.append(end_time - start_time)
    # Prediction on the test split is deliberately outside the timed section.
    y_pred = knn_regressor.predict(norm_test_data[:, :-1])
    result.append(np.sqrt(mean_squared_error(norm_test_data[:, -1], y_pred)))
print('the average rmse using knn_kde_bagging for normalized data is ',sum(result)/len(result))
print("Imputing and Training took average {:.2f} seconds".format(sum(training_duration)/len(training_duration)))

the average rmse using knn_kde_bagging for normalized data is  0.039687338926994144
Imputing and Training took average 1.19 seconds


In [8]:
np.random.seed(42)

# Boundary between complete rows and rows with missing entries.
# ab_train stacks ab_com on top of ab_miss, so the first len(ab_com) rows of
# norm_miss_data are the complete ones. Deriving the boundary here (instead of
# hard-coding 2266) keeps the split correct if the test size or the ratio
# passed to introduce_miss is changed.
n_complete = len(ab_com)

# Hyper-parameter search for the plain KNNxKDE pipeline. The call is the last
# expression of the cell so its value (best parameter dict, best RMSE) is
# displayed as the cell output.
knnxkde_param_rmse(
    norm_miss_data[:n_complete],
    norm_miss_data[n_complete:],
    [10.0, 50.0, 100.0, 250.0, 500.0, 800.0, 1000.0, 1100.0],  # tau
    [0.0005, 0.001, 0.005, 0.01, 0.03],                        # h
    [0.001, 0.003, 0.009, 0.012],                              # K
)

tau: 10.0 => tau: 50.0 => tau: 100.0 => tau: 250.0 => tau: 500.0 => tau: 800.0 => tau: 1000.0 => tau: 1100.0 => 

({'tau': 1100.0, 'h': 0.0005, 'K': 0.003}, 0.032920182716144386)

In [5]:
# Evaluate the plain KNNxKDE imputer: draw one imputed value per missing cell,
# fit a KNN regressor on the filled-in matrix and score it on the normalised
# test split. The procedure is repeated 20 times and the RMSE / run time are
# averaged.
np.random.seed(42)
rmse = []               # test RMSE of each repetition
training_duration = []  # seconds spent on imputation + fitting, per repetition
for i in range(20):
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock changes.
    start_time = time.perf_counter()
    # Work on a copy so norm_miss_data keeps its missing entries for the
    # next repetition.
    m_data = norm_miss_data.copy()
    # tau is passed as the reciprocal of the selected grid value 1100.0.
    # NOTE(review): confirm knnxkde_param_rmse applies the same reciprocal.
    knnxkde = KNNxKDE(h=0.0005, tau=1.0/1100.0, metric='nan_std_eucl')
    imputed_samples = knnxkde.impute_samples(norm_miss_data, nb_draws=1)

    # imputed_samples maps (row, col) -> drawn values; nb_draws=1, so the
    # first draw is used to fill the cell.
    for (row, col), value in imputed_samples.items():
        m_data[row, col] = value[0]
    # Neighbour count is 0.3% of the rows; clamp to 1 because
    # KNeighborsRegressor rejects n_neighbors=0 on very small inputs.
    best_k = max(1, int(round(m_data.shape[0] * 0.003)))
    knn_regressor = KNeighborsRegressor(n_neighbors=best_k)
    # Last column is the regression target, all others are features.
    knn_regressor.fit(m_data[:, :-1], m_data[:, -1])
    end_time = time.perf_counter()
    training_duration.append(end_time - start_time)
    # Prediction on the test split is deliberately outside the timed section.
    y_pred = knn_regressor.predict(norm_test_data[:, :-1])
    rmse.append(np.sqrt(mean_squared_error(norm_test_data[:, -1], y_pred)))
print('average rmse using knn_kde for normalized data is',sum(rmse)/len(rmse))
# Message typo fixed ("Imputinga" -> "Imputing") to match the KKB cell.
print("Imputing and Training took average {:.2f} seconds".format(sum(training_duration)/len(training_duration)))

average rmse using knn_kde for normalized data is 0.0402024961041964
Imputinga and Training took average 0.87 seconds


In [6]:
# Baseline without imputation: tune the neighbour count of a KNN regressor
# using only the complete training rows, with 5-fold cross-validation.

# ab_train stacks ab_com on top of ab_miss, so the first len(ab_com) rows of
# norm_miss_data are the complete ones. Deriving the boundary here (instead of
# hard-coding 2266) keeps the selection correct if the test size or the ratio
# passed to introduce_miss is changed.
n_complete = len(ab_com)

knn_regressor = KNeighborsRegressor()
param_grid = {'n_neighbors': np.arange(1, 25)}  # candidate k = 1 .. 24
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn_regressor, param_grid, cv=kf, scoring='neg_mean_squared_error')
# Last column is the regression target, all others are features.
grid_search.fit(norm_miss_data[:n_complete, :-1], norm_miss_data[:n_complete, -1])
best_k = grid_search.best_params_['n_neighbors']
# The scorer is the negated MSE, so flip the sign to report a plain MSE.
best_score = -grid_search.best_score_
print(f'Best k: {best_k}, Best Mean Squared Error: {best_score}')
best_model = grid_search.best_estimator_

Best k: 5, Best Mean Squared Error: 0.0010979228352917276


In [7]:
# Baseline without imputation: fit a KNN regressor on the complete training
# rows only and report its RMSE on the normalised test split.

# Take k from the grid search run in the previous cell instead of a
# hand-copied literal, so the two cells cannot drift apart.
best_k = int(grid_search.best_params_['n_neighbors'])
knn_regressor = KNeighborsRegressor(n_neighbors=best_k)

# ab_train stacks ab_com on top of ab_miss, so the first len(ab_com) rows of
# norm_miss_data are the complete ones (replaces the hard-coded 2266).
n_complete = len(ab_com)

# Train the model on the complete training rows (last column is the target)
knn_regressor.fit(norm_miss_data[:n_complete, :-1], norm_miss_data[:n_complete, -1])

# Make predictions on the held-out test split
y_pred = knn_regressor.predict(norm_test_data[:, :-1])
print('rmse without any imputation for normalized data',np.sqrt(mean_squared_error(norm_test_data[:,-1], y_pred)))

rmse without any imputation for normalized data 0.04084028699751879
