Filling missing values using K-means clustering

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from scipy.spatial.distance import cityblock

In [2]:
# Read the training and test tables from CSV.
train = pd.read_csv('merged_train.csv')
test = pd.read_csv('test_data.csv')

# Keep the training timestamps aside: they are reused later as the index of the
# filled test table (NOTE(review): this assumes both files list the same
# timestamps row for row — confirm). Then drop the timestamp column from both
# tables so that, presumably, only one column per user remains.
horo = train['Horodate']
train, test = (frame.drop(columns=['Horodate']) for frame in (train, test))

In [3]:
# Step 1: Standardize the training data before clustering.
# After the transpose each row is one user and each column one timestamp, so
# StandardScaler centres and scales every timestamp across users (it works
# column by column); the fitted scaler is reused on the test users later.
users_by_time = train.T  # shape (users × timestamps)
scaler = StandardScaler().fit(users_by_time)
train_scaled = scaler.transform(users_by_time)


In [4]:
# Step 2: Cluster the standardized training users with MiniBatchKMeans.
# Remark: k-means builds the clusters with the L2 (Euclidean) distance.
num_clusters = 300  # Adjust based on data structure
kmeans = MiniBatchKMeans(
    n_clusters=num_clusters,
    batch_size=1000,
    n_init=10,
    random_state=42,  # fixed seed so the clustering is reproducible
)
# One cluster label per training user (row of train_scaled).
clusters = kmeans.fit_predict(train_scaled)

In [5]:
# Step 3: Compute cluster-wise mean consumption
# cluster_means maps each cluster id to a Series holding, for every timestamp
# (row of `train`), the mean over the training users assigned to that cluster.
#
# MiniBatchKMeans does not guarantee that every centre ends up with at least
# one training user in `clusters`. The mean over an empty selection is NaN at
# every timestamp, which would leave a test user assigned to that centre
# unfilled, so empty clusters fall back to the mean over all training users.
overall_mean = train.mean(axis=1)
cluster_means = {}
for i in range(num_clusters):
    members = clusters == i  # boolean mask over the user columns of `train`
    if members.any():
        cluster_means[i] = train.iloc[:, members].mean(axis=1)  # Mean per timestamp
    else:
        cluster_means[i] = overall_mean

In [6]:
# Step 4: Assign missing users to the nearest cluster using L1 distance
# Build a gap-free copy of the test data that is used only to pick a cluster;
# `test` itself keeps its NaNs so they can be filled from the cluster means.
# Linear interpolation runs down each column (axis=0); whatever it cannot fill
# (e.g. leading gaps) is replaced by that column's mean.
test_int = test.interpolate(method='linear', axis=0)
test_int = test_int.fillna(test_int.mean())

test_scaled = scaler.transform(test_int.T)
# A column without a single observation is still NaN here (its mean is NaN).
# NaN would turn every distance into NaN and argmin would then silently pick
# cluster 0, so such entries are set to 0, i.e. the training mean in
# standardized space.
test_scaled = np.where(np.isnan(test_scaled), 0.0, test_scaled)

# Assign clusters based on Manhattan (L1) distance: for each test user compute
# the distance to all centres at once and keep the index of the closest one.
centers = kmeans.cluster_centers_
missing_clusters = np.array(
    [np.abs(centers - sample).sum(axis=1).argmin() for sample in test_scaled],
    dtype=np.intp,
)

In [7]:
# Step 5: Fill missing values using cluster means
# Walk over the test columns by position: column k takes the mean profile of
# the cluster stored in missing_clusters[k]. fillna only replaces NaN entries
# and matches the profile to the column on row index labels.
for col_pos in range(len(missing_clusters)):
    profile = cluster_means[missing_clusters[col_pos]]
    column = test.iloc[:, col_pos]
    test.iloc[:, col_pos] = column.fillna(profile)

In [8]:
# Label the rows of the filled table with the timestamps saved at load time
# and write the result to CSV, keeping that index as the first column.
output_path = "kmeans_mean_fill100.csv"
test.index = horo
test.to_csv(output_path, index=True)