In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import gc

# Matplotlib setup: the notebook opens many figures one after another, so the
# "too many open figures" warning is switched off (a value below 1 disables it).
# plt.ioff()  # Turn off interactive mode
plt.rcParams.update({'figure.max_open_warning': 0})

# Input CSVs keyed by "<modality>_<feature kind>" - adjust the paths to your dataset.
_FEATURE_DIR = '/kaggle/input/fi-v2-test-val-data/Feature Dataset 2'
DATASETS = {
    key: f'{_FEATURE_DIR}/{key}_features.csv'
    for key in ('audio_deep', 'audio_hc', 'video_deep', 'video_hc', 'text_deep')
}
# NOTE(review): unlike the five entries above, this file is not located under
# "Feature Dataset 2/" - confirm the path is intentional.
DATASETS['text_hc'] = '/kaggle/input/fi-v2-test-val-data/text_hc_features.csv'

# Score columns that are clustered (each one independently) when present in a CSV.
TRAIT_COLUMNS = [
    'openness',
    'conscientiousness',
    'extraversion',
    'agreeableness',
    'neuroticism',
    'interview',
]


### Version 1 (disabled: the cell below is fully commented out and superseded by Version 2)

In [2]:

# def process_dataset(name, path):
#     """Process a single dataset with memory optimization and sorted clustering"""
#     print(f"\n🔁 Processing {name.replace('_', ' ').title()} dataset...")

#     # Load data with optimized types
#     df = pd.read_csv(path)
#     df = df.fillna(0)
#     for col in TRAIT_COLUMNS:
#         if col in df.columns:
#             df[col] = df[col].astype(np.float32)

#     # Process one trait at a time
#     for trait in TRAIT_COLUMNS:
#         if trait in df.columns:
#             print(f"  ⚙️ Clustering {trait}...")

#             # Extract and standardize feature
#             X = df[[trait]].values
#             X_scaled = StandardScaler().fit_transform(X)

#             # Fit GMM and predict clusters
#             gmm = GaussianMixture(n_components=3, random_state=42)
#             df[f'{trait}_cluster'] = gmm.fit_predict(X_scaled)

#             # Reorder clusters based on mean trait score
#             cluster_means = df.groupby(f'{trait}_cluster')[trait].mean().sort_values()
#             cluster_mapping = {old: new for new, old in enumerate(cluster_means.index)}
#             df[f'{trait}_cluster'] = df[f'{trait}_cluster'].map(cluster_mapping)

#             # Clean up intermediate variables
#             del X, X_scaled, cluster_means, cluster_mapping
#             gc.collect()

#     # Save results
#     output_path = f'/kaggle/working/{name}_clustered.csv'
#     df.to_csv(output_path, index=False)
#     print(f"✅ Saved clustered data to {output_path}")

#     # Visualize each trait separately using original values
#     for trait in TRAIT_COLUMNS:
#         if trait in df.columns:
#             fig, ax = plt.subplots(figsize=(12, 6))

#             scatter = ax.scatter(df.index, df[trait],
#                                  c=df[f'{trait}_cluster'], cmap='viridis',
#                                  s=30, alpha=0.7, edgecolors='black')

#             ax.set_title(f"{name.title()} - {trait.title()} Clustering (Original Trait Values)", fontsize=14)
#             ax.set_xlabel("Sample Index", fontsize=12)
#             ax.set_ylabel(f"{trait.title()} Score", fontsize=12)  # Show original trait name
#             ax.grid(True, alpha=0.3)

#             plt.colorbar(scatter, ax=ax, label="Cluster Label")
#             plt.tight_layout()
#             plt.show()
#             plt.close(fig)

#             # Memory cleanup
#             del fig, ax, scatter
#             gc.collect()

#     # Final cleanup
#     del df
#     gc.collect()
#     print("="*80 + "\n")

# # Process datasets sequentially
# for name, path in DATASETS.items():
#     process_dataset(name, path)


### Version 2 (active implementation)

In [3]:
def _cluster_trait(scores, n_components, random_state):
    """Return GMM cluster labels for one trait, renumbered by ascending mean score.

    Args:
        scores: pandas Series with the trait values, one entry per sample.
        n_components: number of Gaussian mixture components to fit.
        random_state: seed forwarded to GaussianMixture.

    Returns:
        A Series aligned to ``scores.index``. Label 0 is the cluster with the
        lowest mean score; higher labels have progressively higher means.
    """
    # The mixture is fitted on a standardized array; `scores` itself is never
    # modified, so there is nothing to back up or restore afterwards.
    X_scaled = StandardScaler().fit_transform(scores.to_frame().values)
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    raw_labels = pd.Series(gmm.fit_predict(X_scaled), index=scores.index)

    # GMM component ids are arbitrary, so rank them by the mean original score.
    ordered_ids = scores.groupby(raw_labels).mean().sort_values().index
    relabel = {old: new for new, old in enumerate(ordered_ids)}
    return raw_labels.map(relabel)


def _plot_trait_clusters(df, name, trait):
    """Show a scatter of one trait's original values coloured by cluster label."""
    fig, ax = plt.subplots(figsize=(12, 6))

    scatter = ax.scatter(df.index, df[trait],
                         c=df[f'{trait}_cluster'], cmap='viridis',
                         s=30, alpha=0.7, edgecolors='black')

    ax.set_title(f"{name.title()} - {trait.title()} Clustering (Original Trait Values)", fontsize=14)
    ax.set_xlabel("Sample Index", fontsize=12)
    ax.set_ylabel(f"{trait.title()} Score", fontsize=12)
    ax.grid(True, alpha=0.3)

    plt.colorbar(scatter, ax=ax, label="Cluster Label")
    plt.tight_layout()
    plt.show()
    # Close explicitly: one figure is created per trait per dataset.
    plt.close(fig)


def process_dataset(name, path, n_components=3, random_state=42,
                    output_dir='/kaggle/working'):
    """Cluster each trait column of one feature CSV, then save and plot the result.

    For every column listed in TRAIT_COLUMNS that exists in the file, a 1-D
    Gaussian mixture is fitted and a ``<trait>_cluster`` column is added whose
    labels are ordered by ascending mean trait score. All original columns are
    kept. The augmented table is written to ``<output_dir>/<name>_clustered.csv``
    and one scatter plot per clustered trait is displayed.

    Args:
        name: dataset key; used in log messages, plot titles and the output
            file name.
        path: location of the input CSV.
        n_components: number of clusters per trait (default 3).
        random_state: seed for the mixture model (default 42).
        output_dir: directory that receives the clustered CSV; created if it
            does not exist (default '/kaggle/working').

    Returns:
        None.

    Notes:
        Missing values in every column are replaced with 0 before clustering.
        NOTE(review): this means missing trait scores are clustered as 0.0 -
        confirm that is intended.
        If the file has trait columns but fewer rows than ``n_components``,
        the dataset is skipped with a message (nothing is saved) instead of
        letting GaussianMixture raise and abort the surrounding batch loop.
    """
    import os  # local import: only needed to build the output location

    print(f"\n🔁 Processing {name.replace('_', ' ').title()} dataset...")

    # Load data (all original columns are preserved)
    df = pd.read_csv(path)
    df = df.fillna(0)

    # Only trait columns are downcast; other columns keep their dtype
    traits = [col for col in TRAIT_COLUMNS if col in df.columns]
    for col in traits:
        df[col] = df[col].astype(np.float32)

    # A mixture with k components needs at least k samples
    if traits and len(df) < n_components:
        print(f"  ⚠️ Skipping {name}: {len(df)} row(s) cannot be split into {n_components} clusters")
        print("="*80 + "\n")
        return

    # Process one trait at a time to keep intermediates short-lived
    for trait in traits:
        print(f"  ⚙️ Clustering {trait}...")
        df[f'{trait}_cluster'] = _cluster_trait(df[trait], n_components, random_state)
        gc.collect()

    # Save results with all original + cluster columns
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{name}_clustered.csv')
    df.to_csv(output_path, index=False)
    print(f"✅ Saved clustered data to {output_path}")

    # Visualize each clustered trait using its original values
    for trait in traits:
        _plot_trait_clusters(df, name, trait)
        gc.collect()

    # Final cleanup
    del df
    gc.collect()
    print("="*80 + "\n")



# Process datasets sequentially
# Each call loads, clusters, saves and plots one dataset before the next one starts.
for name, path in DATASETS.items():
    process_dataset(name, path)

In [7]:
# import shutil
# import os

# # List of files to move
# files_to_move = ['file1.txt', 'file2.txt', 'file3.txt']
# destination_folder = 'path/to/destination/'

# # Create destination folder if needed
# os.makedirs(destination_folder, exist_ok=True)

# # Move each file
# for file in files_to_move:
#     shutil.move(file, destination_folder)

In [9]:
import shutil
from pathlib import Path

# The clustered CSVs are written straight into /kaggle/working, so the folder
# to archive has to be created and filled here; otherwise a fresh
# "Restart & Run All" has nothing to zip.
working_dir = Path('/kaggle/working')
cluster_dir = working_dir / 'Features_dataset_clustered'
cluster_dir.mkdir(parents=True, exist_ok=True)

# Copy (rather than move) so the cell can be re-run safely.
for csv_path in sorted(working_dir.glob('*_clustered.csv')):
    shutil.copy2(csv_path, cluster_dir / csv_path.name)

# Create a ZIP archive; the call returns the path of the archive it wrote
shutil.make_archive(
    base_name='Features_dataset_clustered',  # Name without extension
    format='zip',                  # Can be 'zip', 'tar', 'gztar', 'bztar', 'xztar'
    root_dir=str(cluster_dir),      # Folder to compress
)

'/kaggle/working/Features_dataset_clustered.zip'