# K近傍法により欠損値を補完する

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [24]:
filepath = '../../data/processed/df_filtered_5years.xlsx'
df = pd.read_excel(filepath)
df = df[df['year'] >= 1995]

In [25]:
pivoted_df = df.pivot(index='year', columns='island_id', values=['population', 'dummy_after_bridge_opened', 'income'])

In [26]:
missing_all_years = pivoted_df.columns[pivoted_df.isnull().all(axis=0)]
print(missing_all_years)

MultiIndex([], names=[None, 'island_id'])


In [27]:
# Step 4: Apply KNNImputer
imputer = KNNImputer(n_neighbors=2)
imputed_data = imputer.fit_transform(pivoted_df)

print(pivoted_df.shape)
print(imputed_data.shape)

(6, 462)
(6, 462)


In [30]:
# Step 5: Convert back to dataframe
imputed_df = pd.DataFrame(imputed_data, columns=pivoted_df.columns, index=pivoted_df.index)

imputed_df['population']

island_id,0,1,2,3,4,6,7,8,9,10,...,153,154,155,156,157,158,160,161,162,163
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995,15415.5,2857.0,4048.0,802.0,4597.0,45.0,154.0,550.0,51.0,1432.0,...,998.0,117.0,379.0,394.5,44.5,62.0,370.5,502.0,48.5,214.5
2000,15415.5,2672.0,3804.0,718.0,4583.0,28.5,136.0,475.5,47.0,1077.0,...,998.0,117.0,379.0,394.5,44.5,62.0,370.5,502.0,48.5,214.5
2005,17259.0,2406.0,3607.0,744.0,3957.0,13.0,145.0,463.0,40.0,425.0,...,1234.0,182.0,473.0,494.0,67.0,62.0,370.5,502.0,59.0,284.0
2010,15930.0,2378.0,3238.0,640.0,4072.0,12.0,118.0,401.0,43.0,425.0,...,1067.0,134.0,414.0,422.0,56.0,62.0,457.0,575.0,54.0,239.0
2015,14901.0,2304.0,3017.0,586.0,3842.0,10.0,102.0,404.0,56.0,486.0,...,929.0,100.0,344.0,367.0,33.0,67.0,284.0,502.0,43.0,190.0
2020,13882.0,2184.0,2758.0,632.0,3793.0,11.0,82.0,362.0,58.0,364.0,...,729.0,71.0,263.0,302.0,24.0,57.0,225.0,429.0,38.0,150.0


## 1次元に直す

In [33]:
imputed_df = imputed_df.stack(level=1).reset_index()

  imputed_df = imputed_df.stack(level=1).reset_index()


## 保存

In [35]:
output_filepath = '../../data/processed/imputed_df.xlsx'
imputed_df.to_excel(output_filepath, index=False)