In [None]:
import numpy as np
import time
import os
import csv
import pandas as pd

from sklearn.preprocessing import StandardScaler
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

from clusteval import clusteval

import warnings
# Blanket filter: every warning raised after this point is silenced (no category
# or module restriction is given), so deprecation and numerical warnings from the
# libraries used below will not be shown.
warnings.filterwarnings('ignore')

In [None]:
# Start the wall-clock timer; a later cell reads `start_time` to report the runtime.
start_time = time.time()

# Load the clustering data set from the current working directory.
Data_train = pd.read_csv('AppML_InitialProject_test_clustering.csv')

# Standardise every column with StandardScaler, then rebuild a DataFrame so the
# original column names stay attached to the scaled values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Data_train)
Data_train = pd.DataFrame(scaled_values, columns=Data_train.columns)

# `X` is the feature matrix used by the following cells (same object as Data_train).
X = Data_train

# Finding the most important features using the **Laplacian score**

In [None]:
# Convert the standardised frame to a NumPy array once; both skfeature calls below use it.
feature_matrix = X.to_numpy()

# Affinity graph: 5 nearest neighbours (euclidean metric) with heat-kernel weights, t = 1.
kwargs_W = dict(metric="euclidean", neighbor_mode="knn", weight_mode="heat_kernel", k=5, t=1)
W = construct_W.construct_W(feature_matrix, **kwargs_W)

# One Laplacian score per column of the feature matrix.
laplacian_scores = lap_score.lap_score(feature_matrix, W=W)

# Ascending sort: skfeature treats a smaller Laplacian score as a more important
# feature, so the first ten indices are the ten best-ranked columns.
sorted_indices = np.argsort(laplacian_scores)
best_ten_indices = sorted_indices[:10]
top_10_features = Data_train.columns[best_ten_indices].tolist()

print("Top 10 Features based on Laplacian Score:", top_10_features, sep="\n")

# Restrict the data to the selected columns for the clustering step.
X_10 = X[top_10_features]

Top 10 Features based on Laplacian Score:
['pX_E5x7_Lr1', 'pX_ptvarcone40', 'pX_topoetcone20ptCorrection', 'pX_E_Lr2_LowG', 'pX_emins1', 'pX_nCells_Lr1_LowG', 'pX_deltaEta0', 'pX_deltaPhi2', 'pX_etcone30', 'pX_nCells_Lr1_HiG']


# Unsupervised clustering using **DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the ten selected features.
# eps and min_samples are hand-picked values; n_jobs=-1 requests all available cores.
dbscan = DBSCAN(eps=0.66, min_samples=10, n_jobs=-1)
labels = dbscan.fit(X_10).labels_

# DBSCAN marks noise points with the label -1; report how many there are.
n_noise_ = int(np.count_nonzero(labels == -1))
print(n_noise_)

# Cluster ids start at 0, so the largest id plus one is the number of clusters.
print('Number of clusters:', labels.max() + 1)
print('labels:', labels)

2362
Number of clusters: 6
labels: [0 0 0 ... 1 0 0]


In [215]:
# Wall-clock time elapsed since `start_time` was recorded in the data-loading cell.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 64.43825674057007 seconds


# Saving the results (writing is disabled by default; set `Write = True` to save)

In [None]:
# Output directory for the solution files. `exist_ok=True` replaces the earlier
# "check, then create" pattern: it is safe to re-run and cannot fail if the
# directory appears between the check and the creation.
folder_name = 'solutions'
os.makedirs(folder_name, exist_ok=True)

# Safety switch: nothing is written unless this is set to True.
Write = False
if Write:
    # File 1: the selected feature names, one name per row.
    variables = top_10_features
    csv_file_path = os.path.join(folder_name, 'Clustering_DBSCAN_VariableList.csv')

    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([variable] for variable in variables)

    # File 2: one "index,label" row per sample. Labels are stored as floats,
    # so noise points (-1) are written as -1.0.
    data = np.asarray(labels, dtype=float)
    csv_file_path = os.path.join(folder_name, 'Clustering_DBSCAN.csv')

    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(enumerate(data))