In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
from pyclustering.cluster.clique import clique
from pyclustering.cluster.clique import clique_visualizer

# Load datasets
data_df = pd.read_csv('/Users/muskan/deeplearning/bda/datadna.csv')
label_df = pd.read_csv('/Users/muskan/deeplearning/bda/labels.csv')

print("The Data Set has 2 parts")
print("Data.csv")
print("    Shape: ", data_df.shape)
print("    Columns: ", data_df.columns)
print("Label.csv")
print("    Shape: ", label_df.shape)
print("    Columns: ", label_df.columns)

# Drop unwanted column if exists
if 'Unnamed: 0' in data_df.columns:
    data_df = data_df.drop(columns=['Unnamed: 0'])
if 'Unnamed: 0' in label_df.columns:
    label_df = label_df.drop(columns=['Unnamed: 0'])

# Normalize the data for CLIQUE
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data_df)

# Parameters
num_iterations = 50  # number of random 2D projections
n_genes = data_df.shape[1]
gene_indices = list(range(n_genes))
genes_in_clusters_across_dims = []

# Run CLIQUE for multiple 2D projections
for _ in range(num_iterations):
    gene_pair = random.sample(gene_indices, 2)
    data_2d = data_normalized[:, gene_pair]

    # CLIQUE parameters: (data, number of intervals per dimension, density threshold)
    clique_instance = clique(data_2d.tolist(), 10, 2, ccore=False)
    clique_instance.process()
    clusters = clique_instance.get_clusters()

    # Store genes if any cluster is valid
    if any(len(cluster) > 1 for cluster in clusters):
        genes_in_clusters_across_dims.append(set(gene_pair))

# Find key genes across dimensions
if genes_in_clusters_across_dims:
    key_genes_indices = set.intersection(*genes_in_clusters_across_dims)
    key_genes = [data_df.columns[i] for i in key_genes_indices]
    print(f"Identified {len(key_genes)} key genes.")
    print("Key Genes:", key_genes)

    # Save key genes to CSV
    pd.Series(key_genes).to_csv("/Users/muskan/deeplearning/bda/key_genes.csv", index=False)
else:
    print("No overlapping genes found across dimensions.")

The Data Set has 2 parts
Data.csv
    Shape:  (801, 20532)
    Columns:  Index(['Unnamed: 0', 'gene_0', 'gene_1', 'gene_2', 'gene_3', 'gene_4',
       'gene_5', 'gene_6', 'gene_7', 'gene_8',
       ...
       'gene_20521', 'gene_20522', 'gene_20523', 'gene_20524', 'gene_20525',
       'gene_20526', 'gene_20527', 'gene_20528', 'gene_20529', 'gene_20530'],
      dtype='object', length=20532)
Label.csv
    Shape:  (801, 2)
    Columns:  Index(['Unnamed: 0', 'Class'], dtype='object')
Identified 0 key genes.
Key Genes: []
