# In [7]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import os

# --- 1. Configuration ---
# Location of the training CSV. The default is the original author's local
# Windows path; set the PRAKRITI_CSV_PATH environment variable to point the
# script at the file on another machine without editing the source.
FILE_PATH = os.environ.get(
    "PRAKRITI_CSV_PATH",
    r"C:\Users\ASUS\Desktop\ayurdev.ai\Knowledge_base\Updated_Prakriti_With_Features.csv",
)
# Column holding the class label the model predicts.
# Assumed to contain the dosha classes (Vata, Pitta, Kapha, etc.) - confirm against the CSV.
TARGET_COLUMN = 'Dosha'
# One seed shared by the data split and the CatBoost model so runs are repeatable.
RANDOM_SEED = 42

# --- 2. Load and Prepare Data ---
# Read the whole CSV into a DataFrame. A missing file is reported with a
# readable message and the script stops with a non-zero exit status.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"Error: File not found at the specified path: {FILE_PATH}")
    # `exit()` is a convenience that the `site` module installs for interactive
    # sessions, and calling it without an argument reports success (status 0).
    # Raising SystemExit is always available and signals the failure.
    raise SystemExit(1) from None

# Split the frame into the label column (Y) and the feature matrix (X).
Y = df[TARGET_COLUMN]
# Every feature column is cast to str in a single pass, because all of them
# are declared as categorical features below.
X = df.drop(columns=[TARGET_COLUMN]).astype(str)

# --- 3. Identify Categorical Features ---
# Positional indices of all feature columns: each one is marked categorical.
categorical_features_indices = list(range(len(X.columns)))

# --- 4. Split Data ---
# Hold out 20% of the rows for testing. Stratifying on the label asks the
# splitter to keep the class proportions alike in both partitions.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=RANDOM_SEED,
)
X_train, X_test, y_train, y_test = split_parts

# --- 5. CatBoost Model Training ---
print(f"Starting CatBoost Training on {len(df)} samples...")
print(f"Total Features (Vector Dimension): {X.shape[1]}")
print(f"Target Classes Found: {Y.nunique()} - {Y.unique()}")

# Carve a validation subset out of the training rows for early stopping.
# The test partition must not be passed as eval_set: early stopping picks the
# iteration count from eval_set, so reusing the test rows there would let them
# influence the model and inflate the accuracy reported on them afterwards.
train_class_counts = y_train.value_counts()
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
    # Stratification needs at least two rows per class; fall back to a plain
    # random split when a class is too rare in the training partition.
    stratify=y_train if train_class_counts.min() >= 2 else None,
)
print(
    f"Fitting on {len(X_fit)} rows, validating on {len(X_val)} rows, "
    f"holding out {len(X_test)} rows for testing"
)

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    loss_function='MultiClass',  # Used when the target has more than 2 unique classes
    random_seed=RANDOM_SEED,
    verbose=50,  # Print status every 50 iterations
    early_stopping_rounds=50,  # Stop once the eval metric has not improved for 50 rounds
    # Declares every feature column categorical so CatBoost applies its own
    # built-in categorical encoding instead of requiring manual one-hot encoding.
    cat_features=categorical_features_indices
)

model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    plot=False  # Set to True if running in a Jupyter environment to see live training plot
)

# --- 6. Evaluation and Insight (Value of the Encoding) ---

# Prediction and Accuracy
y_pred = model.predict(X_test)
# Collapse the predictions to one label per row. ravel() handles both an
# (n_samples, 1) column and an already-flat array, whereas indexing p[0] on a
# flat array of string labels would silently keep only the first character.
y_pred_flat = np.asarray(y_pred).ravel()
accuracy = accuracy_score(y_test, y_pred_flat)

print("\n--- CatBoost Model Performance ---")
print(f"Test Accuracy: {accuracy:.4f}")

# Feature importance: one score per feature column, paired with the column
# names and sorted so the highest-scoring features come first.
feature_importance = model.get_feature_importance()
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print("\nTop 10 Features (Encodings) CatBoost Relied On:")
print(importance_df.head(10))

# Keep the output filename in one place so the file written and the message
# printed cannot drift apart.
IMPORTANCE_CSV = 'catboost_prakriti_feature_importance.csv'
importance_df.to_csv(IMPORTANCE_CSV, index=False)
print(f"\nResults saved to '{IMPORTANCE_CSV}'")

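# Recorded cell output: ModuleNotFoundError: No module named 'catboost'
# (the catboost package must be installed before this script can run, e.g. `pip install catboost`)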