In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib 
import os 

# Purpose of this cell: load the encoded landmark CSV, split it into stratified
# train/test sets (80/20), and persist the four resulting objects with joblib.

# Path to the encoded CSV (expected to include geometric features and V-A columns)
encoded_csv_path = 'processed_faces_newFinal/encoded_landmarks_with_va_geometric.csv'
output_dir = 'processed_faces_newFinal'  # Directory that receives the split datasets

# --- Load the encoded data ---
# Only the read itself is guarded: a FileNotFoundError raised later (e.g. while
# saving) must not be reported as "the CSV was not found". The hint is printed and
# the error is re-raised so the cell visibly fails and Run All stops here instead
# of continuing with undefined variables.
try:
    df_encoded = pd.read_csv(encoded_csv_path)
except FileNotFoundError:
    print(f"Error: The file '{encoded_csv_path}' was not found.")
    print("Please ensure 'process_emotions.py' completed successfully and the CSV file exists at the specified path.")
    raise

print(f"Successfully loaded data from: {encoded_csv_path}")
print("DataFrame head (first 5 rows with new features):")
print(df_encoded.head())
print("\nDataFrame columns (expecting raw landmarks, geometric, valence, arousal, emotion_encoded):")
print(df_encoded.columns.tolist())

# Fail early with an explicit message if the label columns are absent.
required_columns = ['emotion', 'emotion_encoded']
missing_columns = [col for col in required_columns if col not in df_encoded.columns]
if missing_columns:
    raise KeyError(f"Required column(s) missing from '{encoded_csv_path}': {missing_columns}")

# --- Separate features (X) and target (y) ---
# X: every column except the two label columns.
# y: the encoded emotion label.
# The original text label 'emotion' is dropped so it cannot leak the target.
# NOTE(review): X still keeps the valence/arousal columns. If those were derived
# from the emotion label upstream, they leak the target as well — confirm against
# the script that produced this CSV.
X = df_encoded.drop(required_columns, axis=1)
y = df_encoded['emotion_encoded']

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# --- Train/test split ---
# test_size=0.20  -> 20% of rows held out for testing, 80% for training
# random_state=42 -> reproducible split
# stratify=y      -> class proportions preserved in both sets
#                    (scikit-learn raises ValueError if a class has fewer than 2 samples)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Sanity check: no rows lost or duplicated, and features/labels stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print("\nData split successfully!")
print(f"X_train shape (training features): {X_train.shape}")
print(f"X_test shape (testing features): {X_test.shape}")
print(f"y_train shape (training labels): {y_train.shape}")
print(f"y_test shape (testing labels): {y_test.shape}")

# --- Save the split datasets ---
# Make sure the target directory exists; joblib.dump does not create it.
os.makedirs(output_dir, exist_ok=True)

split_outputs = {
    'X_train_geometric.joblib': X_train,
    'X_test_geometric.joblib': X_test,
    'y_train_geometric.joblib': y_train,
    'y_test_geometric.joblib': y_test,
}
for filename, dataset in split_outputs.items():
    joblib.dump(dataset, os.path.join(output_dir, filename))

print(f"\n✅ Split datasets (with geometric features) saved to {output_dir}/")
print("   as X_train_geometric.joblib, X_test_geometric.joblib, y_train_geometric.joblib, y_test_geometric.joblib")



Successfully loaded data from: processed_faces_newFinal/encoded_landmarks_with_va_geometric.csv
DataFrame head (first 5 rows with new features):
  emotion  landmark_0_x  landmark_0_y  landmark_1_x  landmark_1_y  \
0   anger            61           258            64           319   
1   anger            56           264            60           325   
2   anger            67           258            69           318   
3   anger             3           227            11           295   
4   anger             7           227            16           293   

   landmark_2_x  landmark_2_y  landmark_3_x  landmark_3_y  landmark_4_x  ...  \
0            70           375            83           428           104  ...   
1            66           382            80           436           103  ...   
2            75           374            87           428           108  ...   
3            25           359            42           421            67  ...   
4            29           356           