In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the pre-cleaned application table from disk.
apps = pd.read_csv('data/apps_all_background.csv')

# Columns that span the space in which neighbours are searched.
knn_features = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'credit_annuity_ratio',
]
X = apps.loc[:, knn_features].copy()

# Rescale every feature to zero mean / unit variance so that no single
# column dominates the Euclidean distance used by the neighbour search.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Number of neighbours whose TARGET values are averaged for each row.
N_NEIGHBORS = 500

# Fit the neighbour index on the standardized features.
knn = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='euclidean', n_jobs=-1)
knn.fit(X_scaled)

# Query the fitted points themselves by calling kneighbors() WITHOUT a query
# matrix: scikit-learn then never returns a point as its own neighbour.
# Asking for N+1 neighbours of X_scaled and dropping column 0 is not
# equivalent -- when several rows share identical feature values (distance
# 0 ties), column 0 may hold a different row, and the row's own TARGET would
# then leak into its feature.
# `indices` has shape (n_rows, N_NEIGHBORS) and holds positional row numbers.
distances, indices = knn.kneighbors()

# Look up every neighbour's TARGET in one fancy-indexing step and average
# across each row; this replaces a per-row Python loop over the whole frame.
# NOTE(review): the feature is built from the rows' own labels (only the row
# itself is excluded) -- confirm that this matches the intended validation
# scheme before using it for model training.
target_values = apps['TARGET'].to_numpy()
neighbors_target_mean_500 = target_values[indices].mean(axis=1)

# Store the neighbourhood feature as a new column on the main frame.
apps['neighbors_target_mean_500'] = neighbors_target_mean_500

# Keep only the SK_ID_CURR column next to the new feature and write that
# two-column table to disk without the pandas index.
export_columns = ['SK_ID_CURR', 'neighbors_target_mean_500']
apps_knn = apps.loc[:, export_columns]
apps_knn.to_csv('data/apps_knn.csv', index=False)

In [2]:
# Column holding the neighbour-based feature that this cell reports on.
knn_column = 'neighbors_target_mean_500'

# Distribution summary of the new feature.
print("\nKNN feature statistics:")
print(apps[knn_column].describe())

# Pearson correlation between the new feature and the label.
target_corr = apps[knn_column].corr(apps['TARGET'])
print(f"\nCorrelation with TARGET: {target_corr:.4f}")

# First ten rows: label, the KNN input features, and the new feature.
preview_columns = [
    'TARGET',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'credit_annuity_ratio',
    knn_column,
]
print("\nSample of data with new feature:")
print(apps[preview_columns].head(10))


KNN feature statistics:
count    307511.000000
mean          0.080583
std           0.064068
min           0.000000
25%           0.034000
50%           0.060000
75%           0.106000
max           0.456000
Name: neighbors_target_mean_500, dtype: float64

Correlation with TARGET: 0.2438

Sample of data with new feature:
   TARGET  EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  credit_annuity_ratio  \
0       0      0.691403      0.790829      0.591977             20.721897   
1       0      0.571890      0.776489      0.367291             26.354558   
2       0      0.636020      0.472090      0.676993             34.061050   
3       0      0.358775      0.358775      0.358775             15.279149   
4       1      0.122237      0.384375      0.629674             20.559211   
5       0      0.390882      0.572610      0.209155             36.364166   
6       0      0.716292      0.716292      0.716292             15.649736   
7       0      0.721928      0.792596      0.651260         