# Clustering

In [32]:
import pandas as pd
from sklearn.mixture import GaussianMixture
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [36]:
# Read CSV files into DataFrames
df_mi_corr = pd.read_csv("../processed_data/lazy_corr_MI_10_FS.csv")
df_mi_corr_50 = pd.read_csv("../processed_data/lazy_corr_MI_50_FS.csv")
df_corr = pd.read_csv("../processed_data/lazy_corr_FS.csv")
df_mi = pd.read_csv("../processed_data/batch_corr_MI_10_FS.csv")
df = pd.read_csv("../processed_data/batch_corr_FS.csv")

In [40]:
# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df_mi)

# Fit a Gaussian Mixture Model
gmm = GaussianMixture(n_components=5, random_state=42)  # Assuming 3 clusters
gmm.fit(features_scaled)

# Predict cluster labels
cluster_labels = gmm.predict(features_scaled)
df_mi['cluster'] = cluster_labels


In [45]:
# Evaluate the model using BIC and AIC
bic = gmm.bic(features_scaled)
aic = gmm.aic(features_scaled)

print(f"BIC: {bic}")
print(f"AIC: {aic}")


BIC: -1198.6824501259393
AIC: -2718.26926907167


In [46]:
# Visualize the clustering results in 3D
fig = px.scatter_3d(
    df_mi, 
    x=features_scaled[:, 0], 
    y=features_scaled[:, 1], 
    z=features_scaled[:, 2], 
    color='cluster',
    title='GMM Clustering Results in 3D',
    labels={'x': 'Feature 1', 'y': 'Feature 2', 'z': 'Feature 3'}
)

# Show the plot
fig.show()


In [50]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
import plotly.express as px

def tune_gmm(data, n_components_range=range(1, 15), criterion='AIC'):
    """
    Perform hyperparameter tuning for a Gaussian Mixture Model (GMM),
    store the results in a DataFrame, visualize the results, and return the best model.

    Parameters:
    - data: pd.DataFrame - The input data for GMM clustering.
    - n_components_range: range - The range of number of components to try.
    - criterion: str - The criterion to use for selecting the best model ('AIC' or 'BIC').

    Returns:
    - best_model: GaussianMixture - The best GMM model based on the chosen criterion.
    - results_df: pd.DataFrame - The DataFrame containing AIC and BIC values for each model.
    """
    
    # Drop the target column if it's present
    if 'target' in data.columns:
        data = data.drop('target', axis=1)

    # Standardize the features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(data)

    # Create lists to store the results
    aic_values = []
    bic_values = []
    n_components_list = []

    # Loop over the range of components
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components, random_state=42)
        gmm.fit(features_scaled)
        
        aic = gmm.aic(features_scaled)
        bic = gmm.bic(features_scaled)
        
        aic_values.append(aic)
        bic_values.append(bic)
        n_components_list.append(n_components)

    # Create a DataFrame to store the results
    results_df = pd.DataFrame({
        'n_components': n_components_list,
        'AIC': aic_values,
        'BIC': bic_values
    })

    # Plot AIC and BIC values
    fig = px.line(results_df, x='n_components', y=['AIC', 'BIC'], 
                  title='AIC and BIC values for different number of components',
                  labels={'value': 'Score', 'n_components': 'Number of Components'},
                  markers=True)
    fig.show()

    # Find the best model based on the chosen criterion
    if criterion == 'AIC':
        best_index = results_df['AIC'].idxmin()
    elif criterion == 'BIC':
        best_index = results_df['BIC'].idxmin()
    else:
        raise ValueError("Criterion must be either 'AIC' or 'BIC'")

    best_model = GaussianMixture(n_components=results_df.loc[best_index, 'n_components'], random_state=42)
    best_model.fit(features_scaled)

    return best_model, results_df


In [51]:
best_gmm_model, tuning_results = tune_gmm(df_mi, n_components_range=range(1, 15), criterion='AIC')
print(best_gmm_model)

GaussianMixture(n_components=11, random_state=42)
