In [2]:
import pandas as pd

In [None]:
# Reference subset: every column visible in the info() output below is populated for all 100 rows.
df_small_rich = pd.read_csv('../data/processed/dataset_small_rich.csv')   # (100, 68)
# Full dataset: df.info() below reports 5263 rows, with Ville/Chomage populated for only 100 of them.
df = pd.read_csv('../data/processed/dataset_large_rich.csv')   # (5263, 68)

In [4]:
# Schema check for the reference subset: dtypes and non-null counts per column.
df_small_rich.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 68 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_code               100 non-null    object 
 1   city_name               100 non-null    object 
 2   avg_salary              100 non-null    float64
 3   avg_salary_female       100 non-null    float64
 4   avg_salary_male         100 non-null    float64
 5   log_gender_gap          100 non-null    float64
 6   population_total        100 non-null    int64  
 7   population_30_44        100 non-null    float64
 8   population_female       100 non-null    float64
 9   population_cs3          100 non-null    float64
 10  latitude                100 non-null    float64
 11  longitude               100 non-null    float64
 12  vote_macron_share       100 non-null    float64
 13  Ville                   100 non-null    object 
 14  Chomage                 100 non-null    flo

In [5]:
# Schema check for the full dataset: non-null counts show which columns still need imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5263 entries, 0 to 5262
Data columns (total 68 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_code               5263 non-null   object 
 1   city_name               5263 non-null   object 
 2   avg_salary              5263 non-null   float64
 3   avg_salary_female       5263 non-null   float64
 4   avg_salary_male         5263 non-null   float64
 5   log_gender_gap          5263 non-null   float64
 6   population_total        5263 non-null   int64  
 7   population_30_44        5263 non-null   float64
 8   population_female       5263 non-null   float64
 9   population_cs3          5263 non-null   float64
 10  latitude                5263 non-null   float64
 11  longitude               5263 non-null   float64
 12  vote_macron_share       5263 non-null   float64
 13  Ville                   100 non-null    object 
 14  Chomage                 100 non-null    

# KNN weighted interpolation

In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler


In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Reference rows (used as KNN neighbours) and the full dataset to impute.
df_small = pd.read_csv('../data/processed/dataset_small_rich.csv')  # (100, 68)
df_large = pd.read_csv('../data/processed/dataset_large_rich.csv')  # (5263, 68)

# 'Ville' is a text label, not a model column: set it aside so the positional
# slices below only cover numeric candidates. It is re-attached before saving.
ville_large = df_large['Ville'].copy()

df_large = df_large.drop(columns=['Ville'])
df_small = df_small.drop(columns=['Ville'])

# With 'Ville' removed, columns 0-1 are city_code/city_name, columns 2-12
# (avg_salary ... vote_macron_share) are the predictors, and every column from
# position 13 onwards is a target to impute.
BASE_FEATURES = df_large.columns[2:13]
TARGET_FEATURES = df_large.columns[13:]

# Coerce both frames to numeric; values that cannot be parsed become NaN.
# The loop variable is deliberately not named `df`: an earlier cell binds `df`
# to the raw large dataset, and reusing the name here would silently rebind it.
for frame in (df_small, df_large):
    frame[BASE_FEATURES] = frame[BASE_FEATURES].apply(pd.to_numeric, errors='coerce')
    frame[TARGET_FEATURES] = frame[TARGET_FEATURES].apply(pd.to_numeric, errors='coerce')

# Reference matrices: predictors and known targets from the small dataset.
X_ref = df_small[BASE_FEATURES].to_numpy(dtype=float)
y_ref = df_small[TARGET_FEATURES].to_numpy(dtype=float)

# Rows of the large dataset that have at least one missing target value.
fill_mask = df_large[TARGET_FEATURES].isnull().any(axis=1)
X_fill = df_large.loc[fill_mask, BASE_FEATURES].to_numpy(dtype=float)

# Scaling statistics come from the reference rows only; the rows to fill are
# projected into that same scaled space so distances are comparable.
scaler = StandardScaler().fit(X_ref)
X_ref_scaled = scaler.transform(X_ref)
X_fill_scaled = scaler.transform(X_fill)

def adaptive_knn_fill(X_ref, y_ref, X_fill, k_range=(3, 10)):
    """Estimate target rows as Gaussian-weighted averages of nearest reference rows.

    Parameters
    ----------
    X_ref : array-like, shape (n_ref, n_features)
        Predictor values of the reference rows (already scaled by the caller).
    y_ref : array-like, shape (n_ref, n_targets)
        Known target values of the reference rows.
    X_fill : array-like, shape (n_fill, n_features)
        Predictor values of the rows to estimate, in the same space as ``X_ref``.
    k_range : tuple of int, default (3, 10)
        Only its maximum is used: it is the number of neighbours retrieved per
        query, capped at the number of reference rows.

    Returns
    -------
    numpy.ndarray, shape (n_fill, n_targets)
        One estimated target row per row of ``X_fill``. NaN values in ``y_ref``
        propagate into the estimates that use them.

    Raises
    ------
    ValueError
        If ``X_ref`` contains no rows while ``X_fill`` is non-empty.
    """
    X_ref = np.asarray(X_ref, dtype=float)
    y_ref = np.asarray(y_ref, dtype=float)
    X_fill = np.asarray(X_fill, dtype=float)

    filled_data = np.zeros((X_fill.shape[0], y_ref.shape[1]), dtype=float)
    if X_fill.shape[0] == 0:
        # Nothing to estimate; the batched neighbour query rejects empty input.
        return filled_data
    if X_ref.shape[0] == 0:
        raise ValueError("adaptive_knn_fill needs at least one reference row")

    # Asking for more neighbours than there are reference rows makes
    # kneighbors raise, so cap the request up front.
    n_neighbors = min(max(k_range), X_ref.shape[0])
    nn = NearestNeighbors(n_neighbors=n_neighbors).fit(X_ref)

    # One batched query instead of one call per row; each result row holds the
    # neighbour distances in ascending order with the matching reference indices.
    all_dist, all_idx = nn.kneighbors(X_fill)

    for i, (dist, idx) in enumerate(zip(all_dist, all_idx)):
        # NOTE(review): distances are ascending and non-negative, so dist[0]
        # can never exceed twice their median. The 3-neighbour branch is
        # unreachable and k is effectively min(7, n_neighbors). Kept as written
        # because the intended sparsity test is unclear - confirm with the author.
        k = min(3, n_neighbors) if dist[0] > 2 * np.median(dist) else min(7, n_neighbors)

        near_dist = dist[:k]
        bandwidth = np.median(near_dist)
        if bandwidth > 0:
            # Gaussian kernel whose width is the median neighbour distance.
            w = np.exp(-near_dist**2 / (2 * bandwidth**2))
        else:
            # Zero median means at least half the neighbours coincide with the
            # query point. The kernel would evaluate 0/0, so average the exact
            # matches with equal weight instead.
            w = (near_dist == 0).astype(float)
        w /= w.sum()
        filled_data[i] = np.sum(w[:, None] * y_ref[idx[:k]], axis=0)
    return filled_data

# Estimate a full target row for every row that has at least one missing target.
filled_values = adaptive_knn_fill(X_ref_scaled, y_ref, X_fill_scaled)

# fill_mask is row-level, so a selected row may still hold some observed target
# values. Label the estimates with the same row index and column names, then
# let fillna use them only where a cell is actually missing.
filled_frame = pd.DataFrame(
    filled_values,
    index=df_large.index[fill_mask.to_numpy()],
    columns=TARGET_FEATURES,
)
df_large[TARGET_FEATURES] = df_large[TARGET_FEATURES].fillna(filled_frame)

# Re-attach the city label that was set aside before feature slicing; it is
# inserted as the first column of the saved file.
df_large.insert(0, 'Ville', ville_large)

df_large.to_csv('../data/processed/dataset_large_rich_filled.csv', index=False)


In [3]:
# Round-trip check: reload the imputed file from disk rather than trusting the in-memory frame.
df_filled = pd.read_csv('../data/processed/dataset_large_rich_filled.csv')

# First line: total number of missing cells in the frame; second line: (rows, columns).
print(df_filled.isna().to_numpy().sum())
print(df_filled.shape)


0
(5263, 68)


In [6]:

# How many rows already carry a Ville label before the fallback is applied.
print("Non-empty quantity:", df_filled['Ville'].notna().sum())

# Rows without a Ville label fall back to that row's own city_name value.
# The fallback must be the column itself: passing the string "city_name"
# would write that literal text into every missing row.
df_filled['Ville'] = df_filled['Ville'].fillna(df_filled['city_name'])

df_filled.to_csv('../data/processed/dataset_large_rich_filled_cityname.csv', index=False)


Non-empty quantity: 5263


In [5]:
# Reload the final file (Ville back-filled) and confirm what was written to disk.
df_filled = pd.read_csv('../data/processed/dataset_large_rich_filled_cityname.csv')

# Prints the total count of missing cells, then the (rows, columns) shape on the next line.
print(df_filled.isna().sum().sum(), df_filled.shape, sep='\n')

0
(5263, 68)
