In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


In [2]:
FOLDER_PATH = 'data'
FILE_NAME = "clean_numeric_dataset.csv"
FULL_PATH = FOLDER_PATH + '/' + FILE_NAME

In [3]:
df = pd.read_csv(FULL_PATH)

In [4]:
df.head()

Unnamed: 0,ra,dec,starthjd,endhjd,vmag,verr,imag,ierr,npts
0,154.1162,-46.5045,2450246.45,2450939.64,18.251,0.014,17.462,0.017,105
1,154.1171,-46.4989,2450246.45,2450940.53,20.216,0.033,18.386,0.022,113
2,154.1172,-46.4909,2450246.45,2450940.53,20.623,0.038,19.206,0.032,110
3,154.1176,-46.4801,2450246.45,2450940.53,19.365,0.018,18.226,0.022,109
4,154.1179,-46.4671,2450246.45,2450939.57,19.535,0.018,18.653,0.025,108


In [7]:
df.shape
df.info()
df.describe()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123830 entries, 0 to 123829
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   ra        123830 non-null  float64
 1   dec       123830 non-null  float64
 2   starthjd  123830 non-null  float64
 3   endhjd    123830 non-null  float64
 4   vmag      123830 non-null  float64
 5   verr      123830 non-null  float64
 6   imag      123830 non-null  float64
 7   ierr      123830 non-null  float64
 8   npts      123830 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 8.5 MB


(123830, 9)

In [8]:
features = ['ra', 'dec', 'starthjd', 'endhjd', 'vmag', 'verr', 'imag', 'ierr', 'npts']
X = df[features]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
def find_optimal_clusters(X_scaled, max_clusters=5, batch_size=1000):
    """Find the optimal number of clusters using the elbow method and silhouette score."""
    # Calculate inertia (within-cluster sum of squares) for different k values
    inertias = []
    silhouette_scores = []
    K = range(1, max_clusters + 1)
    
    print("\nCalculating cluster metrics...")
    print("Using MiniBatchKMeans for faster computation...")
    
    # First calculate inertias for all k values using MiniBatchKMeans
    for k in K:
        print(f"Processing k={k} (inertia calculation)...")
        kmeans = MiniBatchKMeans(n_clusters=k, 
                                batch_size=batch_size,
                                random_state=42,
                                n_init=3)  # Reduced n_init for speed
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)
    
    # Then calculate silhouette scores (only for k > 1)
    print("\nCalculating silhouette scores...")
    # Use a subset of data for silhouette score calculation
    sample_size = min(10000, len(X_scaled))
    X_sample = X_scaled[np.random.choice(len(X_scaled), sample_size, replace=False)]
    
    for k in range(2, max_clusters + 1):
        print(f"Processing k={k} (silhouette calculation)...")
        kmeans = MiniBatchKMeans(n_clusters=k, 
                                batch_size=batch_size,
                                random_state=42,
                                n_init=3)
        labels = kmeans.fit_predict(X_sample)
        score = silhouette_score(X_sample, labels)
        silhouette_scores.append(score)
    
    # Plot elbow curve and silhouette scores
    print("\nGenerating plots...")
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(K, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    
    plt.subplot(1, 2, 2)
    plt.plot(range(2, max_clusters + 1), silhouette_scores, 'rx-')
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis')
    
    plt.tight_layout()
    plt.savefig('cluster_analysis.png')
    plt.close()
    
    print("Cluster analysis plots saved to 'cluster_analysis.png'")
    
    return inertias, silhouette_scores

In [21]:
print("\nStep 2: Finding optimal number of clusters...")
find_optimal_clusters(X_scaled)


Step 2: Finding optimal number of clusters...

Calculating cluster metrics...
Using MiniBatchKMeans for faster computation...
Processing k=1 (inertia calculation)...
Processing k=2 (inertia calculation)...
Processing k=3 (inertia calculation)...
Processing k=4 (inertia calculation)...
Processing k=5 (inertia calculation)...

Calculating silhouette scores...
Processing k=2 (silhouette calculation)...
Processing k=3 (silhouette calculation)...
Processing k=4 (silhouette calculation)...
Processing k=5 (silhouette calculation)...

Generating plots...
Cluster analysis plots saved to 'cluster_analysis.png'


([1114543.3802649814,
  677704.2829092985,
  526315.3850701778,
  381149.3068163104,
  308828.23828633985],
 [np.float64(0.4250738673892818),
  np.float64(0.3763281376382265),
  np.float64(0.4048933797767197),
  np.float64(0.3712100425242607)])