In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import networkx as nx
from community import community_louvain

# Load dataset
df = pd.read_csv('creditcard.csv')

# Separate features and target variable
X = df.drop(['Class'], axis=1)
y = df['Class']

# Perform the train-test split BEFORE scaling: fitting the scaler on the full
# dataset would let the held-out rows influence the mean/variance used to
# transform the training features (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: learn the statistics from the training rows only, then
# apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that code referring to `X_scaled` still works; it is the full feature
# matrix transformed with the train-fitted scaler (not refitted on all rows).
X_scaled = scaler.transform(X)


In [None]:
# Build KNN graph from training data
k_neighbors = 10
nn = NearestNeighbors(n_neighbors=k_neighbors, metric='cosine').fit(X_train)
# NOTE(review): the fitted data is passed back in as the query, so a point can
# be returned as its own neighbour; those entries are dropped by the `i != j`
# check below, leaving fewer than k_neighbors edges per row - confirm intended.
distances, indices = nn.kneighbors(X_train)

# Initialize a graph that already contains every training row as a node, so a
# row that ends up without any edge still gets a community assigned and the
# `partition[i]` lookup below cannot raise KeyError.
n_train = len(X_train)
G = nx.Graph()
G.add_nodes_from(range(n_train))

# Add edges based on nearest neighbors in training data
for i, neighbors in enumerate(indices):
    for j, dist in zip(neighbors, distances[i]):
        if i != j:  # Avoid self-loops
            G.add_edge(i, j, weight=1 - dist)  # Convert cosine distance to similarity

# Apply Louvain clustering on the training graph.
# Louvain is randomized; the seed (same value as the train/test split) makes
# the communities - and therefore the number of derived features - reproducible.
partition = community_louvain.best_partition(G, random_state=42)

# Map partition dictionary to a list aligned with training data indices
train_clusters = np.array([partition[i] for i in range(n_train)])


In [None]:
# One prototype per Louvain community: the feature-wise mean of the training
# rows assigned to that community, keyed by the community id.
cluster_ids = np.unique(train_clusters)
cluster_prototypes = {
    cid: X_train[train_clusters == cid].mean(axis=0)
    for cid in cluster_ids
}


In [None]:
def compute_cluster_distances(X, cluster_prototypes):
    """Return the Euclidean distance from every sample to every cluster prototype.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to measure. Converted with ``np.asarray`` so array-likes such
        as nested lists or DataFrames are handled row-wise.
    cluster_prototypes : dict
        Maps a cluster id to its prototype vector of length ``n_features``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_clusters)
        Column ``c`` holds the distances to the ``c``-th prototype, in the
        iteration order of ``cluster_prototypes``.
    """
    samples = np.asarray(X)
    prototypes = [np.asarray(cluster_prototypes[cluster_id]) for cluster_id in cluster_prototypes]

    # Nothing to measure: return an empty but correctly shaped 2-D result so
    # callers can still stack it next to the feature matrix.
    if samples.shape[0] == 0 or not prototypes:
        return np.empty((samples.shape[0], len(prototypes)))

    # One vectorised norm per prototype (over all rows at once) instead of one
    # Python-level norm call per (sample, prototype) pair.
    return np.column_stack(
        [np.linalg.norm(samples - prototype, axis=1) for prototype in prototypes]
    )

# Distance-to-prototype features for both splits, measured against the
# prototypes stored in `cluster_prototypes` (training split first, then test).
train_cluster_distances, test_cluster_distances = (
    compute_cluster_distances(split, cluster_prototypes)
    for split in (X_train, X_test)
)


In [None]:
# Append the cluster-distance columns to the right of the original feature
# columns, separately for the training and the test matrix.
X_train_with_features = np.concatenate([X_train, train_cluster_distances], axis=1)
X_test_with_features = np.concatenate([X_test, test_cluster_distances], axis=1)
