<a href="https://colab.research.google.com/github/Narenpradhan/Pulse-Diagnosis-System/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas scikit-learn xgboost sdv

Collecting sdv
  Downloading sdv-1.29.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.74-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.74-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.24.0-py3-none-any.whl.metadata (9.3 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [2]:
# Attach Google Drive to this Colab runtime; the CSV inputs and the saved
# model files used below all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
## V1 - old version, not accurate: the model learns the data by rote
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

# --- Function to load, preprocess, train, and evaluate ---
def train_and_evaluate(data, le, description):
    """
    Run the encode / split / train / evaluate pipeline on one dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a 'Dosha' target column. Every other column is
        used as a feature and one-hot encoded.
    le : sklearn.preprocessing.LabelEncoder
        Encoder already fitted on the target labels. It is shared between
        calls so that class ids stay identical across datasets.
    description : str
        Label used in the progress messages. If it contains the word
        "Combined", a per-class classification report is printed too.

    Returns
    -------
    float
        Accuracy on the held-out 20% split of `data`.

    Raises
    ------
    ValueError
        Propagated from ``le.transform`` when `data` holds a 'Dosha' value
        the encoder was not fitted on.
    """
    print(f"\n--- Starting {description} ---")

    # 1. Separate Features (X) and Target (y)
    X = data.drop(columns=['Dosha'])
    y = data['Dosha']

    # 2. Encode the target with the shared, pre-fitted encoder so that class
    #    numbers (e.g., 'Pitta' -> 1) are consistent between runs.
    y_encoded = le.transform(y)

    # 3. Split first (80% train, 20% test). Splitting the raw frame with the
    #    same seed and stratification yields the same row partition as
    #    splitting an already-encoded matrix would.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded  # keep class proportions equal in train/test
    )

    # 4. One-hot encode, fitting on the TRAINING rows only so the held-out
    #    rows cannot influence the feature space. Categories that appear only
    #    in the test rows are encoded as all-zeros (handle_unknown='ignore').
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_train = ohe.fit_transform(X_train_raw)
    X_test = ohe.transform(X_test_raw)

    print(f"Processed features shape: {(len(X), X_train.shape[1])}")
    print(f"Training features: {X_train.shape}, Testing features: {X_test.shape}")

    # 5. Initialize and Train XGBoost Model
    #    (the deprecated `use_label_encoder` flag is no longer passed; labels
    #    are already integer-encoded above).
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=len(le.classes_),  # one output per known class
        random_state=42,
        eval_metric='mlogloss'
    )

    print("Training model...")
    model.fit(X_train, y_train)
    print("Training complete.")

    # 6. Evaluate Model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n--- {description} Results ---")
    print(f"Overall Model Accuracy: {accuracy * 100:.2f}%")

    # For the final combined model, print the full report
    if "Combined" in description:
        print("\nDetailed Classification Report:")
        # Passing `labels` keeps the report aligned with `target_names` even
        # if some class happens to be missing from this test split.
        print(classification_report(
            y_test, y_pred,
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
            zero_division=0,
        ))

    return accuracy

# --- Main Execution ---
# Loads the real and synthetic tables, scores a model on the real data alone
# and on real + synthetic, then prints both accuracies side by side.
try:
    # Load real data.
    # dtype=str reads every column as text (the original author notes this
    # avoids a TypeError seen earlier).
    real_data = pd.read_csv('/content/drive/MyDrive/Final Year Project/Updated_Prakriti_With_Features.csv', dtype=str)
    print(f"Successfully loaded 'Updated_Prakriti_With_Features.csv' ({len(real_data)} rows)")

    # Load synthetic data
    synthetic_data = pd.read_csv('/content/drive/MyDrive/Final Year Project/synthetic_prakriti_data_5000.csv', dtype=str)
    print(f"Successfully loaded 'synthetic_prakriti_data_5000.csv' ({len(synthetic_data)} rows)")

except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure both CSV files are present.")
    # Re-raise instead of calling exit(): inside a notebook exit() acts on the
    # kernel rather than cleanly stopping this cell, so the code below could
    # still run against undefined variables.
    raise

# --- Baseline Model (Real Data Only) ---
# First, create and fit the LabelEncoder on the REAL data's target
le = LabelEncoder()
le.fit(real_data['Dosha'])

# Now, run the baseline test (row count is taken from the data, not hard-coded)
baseline_rows = len(real_data)
baseline_accuracy = train_and_evaluate(real_data, le, f"Baseline Model ({baseline_rows} Real Rows)")

# --- Combined Model (Real + Synthetic Data) ---
# 1. Combine the datasets
combined_df = pd.concat([real_data, synthetic_data], ignore_index=True)
print(f"\nSuccessfully combined data. New shape: {combined_df.shape}")

# 2. Run the full pipeline on the new combined data.
#    NOTE: train_and_evaluate splits whatever frame it receives, so this
#    score is measured on a mix of real and synthetic rows and is not
#    directly comparable with the baseline score above.
combined_rows = len(combined_df)
new_accuracy = train_and_evaluate(combined_df, le, f"Combined Model ({combined_rows} Rows)")

# --- Final Comparison ---
print("\n--- 🏆 Final Comparison ---")
print(f"Baseline Accuracy ({baseline_rows} rows): {baseline_accuracy * 100:.2f}%")
print(f"New Accuracy ({combined_rows} rows):   {new_accuracy * 100:.2f}%")

Successfully loaded 'Updated_Prakriti_With_Features.csv' (1200 rows)
Successfully loaded 'synthetic_prakriti_data_5000.csv' (5000 rows)

--- Starting Baseline Model (1200 Real Rows) ---
Processed features shape: (1200, 87)
Training features: (960, 87), Testing features: (240, 87)
Training model...
Training complete.

--- Baseline Model (1200 Real Rows) Results ---
Overall Model Accuracy: 100.00%

Successfully combined data. New shape: (6200, 30)

--- Starting Combined Model (6200 Rows) ---
Processed features shape: (6200, 87)
Training features: (4960, 87), Testing features: (1240, 87)
Training model...
Training complete.

--- Combined Model (6200 Rows) Results ---
Overall Model Accuracy: 61.37%

Detailed Classification Report:
              precision    recall  f1-score   support

       Kapha       0.50      0.54      0.52        70
       Pitta       0.48      0.27      0.34       161
        Vata       0.59      0.58      0.59       238
 pitta+kapha       0.34      0.18      0.24   

In [6]:
## V2
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import warnings
import time
import joblib
import os # <-- New import to check if file exists

# Suppress all warnings for a clean output
warnings.filterwarnings('ignore')

print("--- Starting Corrected ML Pipeline ---")

# --- Step 1: Load and Split Real Data ---
try:
    # dtype=str reads every column as text.
    real_data = pd.read_csv('/content/drive/MyDrive/Final Year Project/Updated_Prakriti_With_Features.csv', dtype=str)
    print(f"\nSuccessfully loaded real data ({len(real_data)} rows).")
except FileNotFoundError:
    print("\nError: 'Updated_Prakriti_With_Features.csv' not found.")
    # Re-raise instead of calling exit(): inside a notebook exit() acts on the
    # kernel rather than cleanly stopping this cell, so the code below could
    # still run without `real_data` being defined.
    raise

# The "Golden Rule": Split REAL data first.
# The synthesizer is trained on 'real_train_df' only; 'real_test_df' is kept
# aside and used solely for the final evaluation.
real_train_df, real_test_df = train_test_split(
    real_data,
    test_size=0.2,  # 20% held back for final, real testing
    random_state=42,  # fixed seed so the same rows are held out on every run
    stratify=real_data['Dosha']  # keep class proportions equal in both parts
)
print(f"Real data split: {len(real_train_df)} for training, {len(real_test_df)} for final testing.")


# --- Step 2: Generate or Load Synthetic Data ---
synthetic_data_filename = '/content/drive/MyDrive/Final Year Project/synthetic_prakriti_data_5000.csv'
num_new_rows = 5000

# Reuse a previously generated file when one exists; otherwise train a CTGAN
# on the real TRAINING rows, sample from it and cache the result.
if os.path.exists(synthetic_data_filename):
    # NOTE(review): a cached file is trusted as-is. If it was produced by a
    # synthesizer fitted on rows that are now in `real_test_df`, the final
    # evaluation is contaminated - confirm its provenance, or delete the file
    # so it is regenerated from `real_train_df`.
    print(f"\nFound existing synthetic data: '{synthetic_data_filename}'. Loading file...")
    synthetic_data = pd.read_csv(synthetic_data_filename, dtype=str)
    print(f"Successfully loaded {len(synthetic_data)} synthetic rows.")
else:
    print("\nNo existing synthetic data found. Generating new data...")
    print("Initializing CTGAN synthesizer...")
    synthesizer = CTGAN(
        epochs=300,  # You can increase this to 500 or 1000 for better quality
        batch_size=50,
        verbose=False  # Set to True if you want to see training progress
    )

    print("Training synthesizer on real training data (this may take several minutes)...")
    start_time = time.perf_counter()
    # The real data was loaded with dtype=str, so every column is categorical
    # text. CTGAN must be told which columns are discrete; columns that are
    # not listed are modelled as continuous numbers.
    synthesizer.fit(real_train_df, discrete_columns=list(real_train_df.columns))
    print(f"Synthesizer training complete in {time.perf_counter() - start_time:.2f} seconds.")

    print(f"Generating {num_new_rows} new synthetic samples...")
    # ctgan.CTGAN.sample takes the row count as its first argument `n`
    # (`num_rows=` is the keyword of the higher-level SDV synthesizers).
    synthetic_data = synthesizer.sample(num_new_rows)

    print(f"Saving new synthetic data to '{synthetic_data_filename}' for future use...")
    synthetic_data.to_csv(synthetic_data_filename, index=False)


# --- Step 3: Combine Real Training + Synthetic Data ---
# Stack the synthetic rows under the real training rows and renumber the index.
combined_train_df = pd.concat([real_train_df, synthetic_data], ignore_index=True)
print(f"Created new combined training set with {len(combined_train_df)} rows.")


# --- Step 4: Preprocessing (Encoding) ---
print("\nPreprocessing data (One-Hot & Label Encoding)...")
target_column = 'Dosha'
# Every column of the real table other than the target is a feature.
feature_cols = [col for col in real_data.columns if col != target_column]

# Target: the label encoder learns its classes from the real training rows
# only, and is then applied to both the combined training target and the
# held-out real test target.
le = LabelEncoder().fit(real_train_df[target_column])
y_train_combined = le.transform(combined_train_df[target_column])
y_test_final = le.transform(real_test_df[target_column])

# Features: the one-hot encoder learns its categories from the combined
# training rows (fit and transform in one step) and is then applied to the
# real test rows; categories it never saw are ignored rather than raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_combined = ohe.fit_transform(combined_train_df[feature_cols])
X_test_final = ohe.transform(real_test_df[feature_cols])

print(f"New training features shape: {X_train_combined.shape}")
print(f"Final test features shape: {X_test_final.shape}")


# --- Step 5: Train the *Regularized* XGBoost Model ---
print("\nInitializing *REGULARIZED* XGBoost model...")
# These parameters are "handcuffs" to prevent overfitting (memorizing).
# The deprecated `use_label_encoder` flag is no longer passed: the targets
# are already integer-encoded by `le`.
model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),  # one output per class known to the encoder
    random_state=42,
    eval_metric='mlogloss',

    # --- Regularization Parameters ---
    n_estimators=300,       # More trees (to compensate for low learning rate)
    learning_rate=0.05,     # Slows down learning
    max_depth=4,            # Shorter trees (less complex)
    subsample=0.8,          # Use 80% of data for each tree
    colsample_bytree=0.8    # Use 80% of features for each tree
)

print("Training new model on combined data...")
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
model.fit(X_train_combined, y_train_combined)
print(f"Model training complete in {time.perf_counter() - start_time:.2f} seconds.")


# --- Step 6: Test on the "Golden" *Real* Test Set ---
# The model is scored only on the real rows held out in Step 1.
print(f"\n--- 🏆 FINAL RESULTS (Tested on {len(real_test_df)} REAL rows) ---")

y_pred_final = model.predict(X_test_final)
final_accuracy = accuracy_score(y_test_final, y_pred_final)

print(f"\nFinal Model Accuracy: {final_accuracy * 100:.2f}%")

print("\nFinal Classification Report:")
# `labels` pins the report rows to every class the encoder knows, so the
# names in `target_names` stay aligned even if a class is absent from the
# test rows or the predictions; such a class is reported with zeros.
print(classification_report(
    y_test_final, y_pred_final,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
    zero_division=0,
))


# --- Step 7: Save Model + Encoders ---
# Persist the trained classifier together with both fitted encoders so the
# same preprocessing can be re-applied when the model is loaded later.
save_dir = "/content/drive/MyDrive/Final Year Project/prakriti_model_assets"
os.makedirs(save_dir, exist_ok=True)

# (display label, file name, object to persist)
assets_to_save = [
    ("XGBoost model", "xgb_prakriti_model.pkl", model),
    ("Label Encoder", "label_encoder.pkl", le),
    ("OneHot Encoder", "onehot_encoder.pkl", ohe),
]

for _, asset_file, asset_obj in assets_to_save:
    joblib.dump(asset_obj, f"{save_dir}/{asset_file}")

print("\nModels saved successfully:")
for asset_label, asset_file, _ in assets_to_save:
    # Labels are left-padded to a fixed width so the arrows line up.
    print(f"- {asset_label:<20} → {save_dir}/{asset_file}")


--- Starting Corrected ML Pipeline ---

Successfully loaded real data (1200 rows).
Real data split: 960 for training, 240 for final testing.

Found existing synthetic data: '/content/drive/MyDrive/Final Year Project/synthetic_prakriti_data_5000.csv'. Loading file...
Successfully loaded 5000 synthetic rows.
Created new combined training set with 5960 rows.

Preprocessing data (One-Hot & Label Encoding)...
New training features shape: (5960, 87)
Final test features shape: (240, 87)

Initializing *REGULARIZED* XGBoost model...
Training new model on combined data...
Model training complete in 2.09 seconds.

--- üèÜ FINAL RESULTS (Tested on 240 REAL rows) ---

Final Model Accuracy: 97.92%

Final Classification Report:
              precision    recall  f1-score   support

       Kapha       1.00      1.00      1.00        14
       Pitta       1.00      0.97      0.98        29
        Vata       1.00      0.98      0.99        53
 pitta+kapha       1.00      0.90      0.95        10
  vat