In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load datasets
seed_data = pd.read_csv('seed_data.csv')
seed_data = seed_data.iloc[:, :-1]
pool_data = pd.read_csv('pool_data.csv')

# Standardize features
scaler = StandardScaler()
seed_data_scaled = scaler.fit_transform(seed_data)
pool_data_scaled = scaler.transform(pool_data)

In [2]:
from sklearn.cluster import KMeans

# Fit KMeans to the seed data
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(seed_data_scaled)

# Assign clusters to the seed data
seed_data['cluster'] = kmeans.labels_

In [3]:
from sklearn.neighbors import NearestNeighbors

# Find the nearest neighbors in the pool data for each seed data point
nn = NearestNeighbors(n_neighbors=5, algorithm='auto')
nn.fit(pool_data_scaled)

# Get the nearest neighbors for each cluster centroid
distances, indices = nn.kneighbors(kmeans.cluster_centers_)

# Collect the pool data nearest to the centroids
nearest_pool_customers = pool_data.iloc[indices.flatten()]

In [4]:
# Calculate similarity scores (inverse of distance)
similarity_scores = 1 / (1 + distances.flatten())

# Add similarity scores to the nearest pool customers using .loc
nearest_pool_customers.loc[:, 'similarity_score'] = similarity_scores

# Rank pool customers based on similarity scores
ranked_customers = nearest_pool_customers.sort_values(by='similarity_score', ascending=False)

# Save ranked customers to a new CSV file
ranked_customers.to_csv('ranked_customers_KNN.csv', index=False)

print("Ranked customer predictions saved to 'ranked_customers_KNN.csv'")

Ranked customer predictions saved to 'ranked_customers_KNN.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_pool_customers.loc[:, 'similarity_score'] = similarity_scores
