In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/input/doshas-for-vata-pitta-and-kapha/ayurvedic_dosha_dataset (1).csv")   # change to your actual file name

print(df.columns.tolist())

print("Total Columns:", len(df.columns))
print("Shape:", df.shape)

['Body Frame', 'Type of Hair', 'Color of Hair', 'Skin', 'Complexion', 'Body Weight', 'Nails', 'Size and Color of the Teeth', 'Pace of Performing Work', 'Mental Activity', 'Memory', 'Sleep Pattern', 'Weather Conditions', 'Reaction under Adverse Situations', 'Mood', 'Eating Habit', 'Hunger', 'Body Temperature', 'Joints', 'Nature', 'Body Energy', 'Quality of Voice', 'Dreams', 'Social Relations', 'Body Odor', 'Dosha']
Total Columns: 26
Shape: (5000, 26)


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pickle

# ✅ Load Dataset
file_path = "/kaggle/input/doshas-for-vata-pitta-and-kapha/ayurvedic_dosha_dataset (1).csv"
df = pd.read_csv(file_path)

print("✅ Data Loaded Successfully")
print(df.head())
print(df['Dosha'].value_counts())

# ✅ Encode Categorical Features
label_encoders = {}
for col in df.columns:
    if col != 'Dosha':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# ✅ Encode Target
target_le = LabelEncoder()
df['Dosha'] = target_le.fit_transform(df['Dosha'])

# ✅ Train-Test Split
X = df.drop('Dosha', axis=1)
y = df['Dosha']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ✅ Class Imbalance Handling
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = {i: w for i, w in enumerate(class_weights)}

# ✅ Hyperparameter Search - RandomForest
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42, class_weight=class_weights_dict)

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)

rf_random.fit(X_train, y_train)
best_rf = rf_random.best_estimator_

# ✅ Model Evaluation
y_pred = best_rf.predict(X_test)

print("\n✅ Best Parameters:", rf_random.best_params_)
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred))

# ✅ Save Model as .pkl
model_package = {
    "model": best_rf,
    "label_encoders": label_encoders,
    "target_encoder": target_le
}

save_path = "dosha_rf_model.pkl"
with open(save_path, "wb") as f:
    pickle.dump(model_package, f)

print("\n📌 Saved Model as:", save_path)
print("👉 Download from the file browser on the left in Kaggle")

# ✅ Prediction Function (for later use anywhere)
def predict_dosha(user_input: dict):
    input_df = pd.DataFrame([user_input])
    for col, le in label_encoders.items():
        input_df[col] = le.transform(input_df[col].astype(str))
    pred = best_rf.predict(input_df)[0]
    return target_le.inverse_transform([pred])[0]


✅ Data Loaded Successfully
      Body Frame Type of Hair Color of Hair           Skin Complexion  \
0     Well Built          Dry          Grey  Soft,Sweating    Pinkish   
1  Thin and Lean       Normal         Brown   Moist,Greasy       Dark   
2     Well Built          Dry         Brown      Dry,Rough       Dark   
3  Thin and Lean       Greasy         Brown  Soft,Sweating    Pinkish   
4     Well Built       Normal         Black  Soft,Sweating    Pinkish   

   Body Weight     Nails Size and Color of the Teeth Pace of Performing Work  \
0  Underweight    Redish                 Large,White                    Fast   
1   Overweight  Blackish            Medium,Yellowish                  Medium   
2  Underweight   Pinkish            Medium,Yellowish                    Fast   
3   Overweight   Pinkish          Irregular,Blackish                    Slow   
4       Normal  Blackish            Medium,Yellowish                    Fast   

  Mental Activity  ...            Hunger  Body Temper

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(



✅ Best Parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}

✅ Accuracy: 0.833

✅ Confusion Matrix:
 [[394  30  17]
 [ 27 340   0]
 [ 87   6  99]]

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.89      0.83       441
           1       0.90      0.93      0.92       367
           2       0.85      0.52      0.64       192

    accuracy                           0.83      1000
   macro avg       0.84      0.78      0.80      1000
weighted avg       0.84      0.83      0.83      1000


📌 Saved Model as: dosha_rf_model.pkl
👉 Download from the file browser on the left in Kaggle


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

df = pd.read_csv("/kaggle/input/doshas-for-vata-pitta-and-kapha/ayurvedic_dosha_dataset (1).csv")
df = df.drop(columns=['Body Energy', 'Quality of Voice', 'Dreams', 'Body Odor'])

# --- Encode categorical features ---
label_encoders = {}
for col in df.columns:
    if col != 'Dosha':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# Encode target
target_le = LabelEncoder()
df['Dosha'] = target_le.fit_transform(df['Dosha'])

# --- Split data ---
X = df.drop('Dosha', axis=1)
y = df['Dosha']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Handle class imbalance using class weights ---
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: w for i, w in enumerate(class_weights)}

# --- Hyperparameter tuning for RandomForest ---
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42, class_weight=class_weights_dict)

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)

rf_random.fit(X_train, y_train)

# --- Best model ---
best_rf = rf_random.best_estimator_

# --- Evaluate ---
y_pred = best_rf.predict(X_test)

print("Best Parameters:", rf_random.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ✅ Save Model as .pkl
model_package = {
    "model": best_rf,
    "label_encoders": label_encoders,
    "target_encoder": target_le
}

save_path = "dosha_rf_model2.pkl"
with open(save_path, "wb") as f:
    pickle.dump(model_package, f)

print("\n📌 Saved Model as:", save_path)
print("👉 Download from the file browser on the left in Kaggle")


# --- Predict Dosha from user input ---
def predict_dosha(user_input):
    input_df = pd.DataFrame([user_input])
    for col, le in label_encoders.items():
        input_df[col] = le.transform(input_df[col].astype(str))
    pred_encoded = best_rf.predict(input_df)[0]
    pred_dosha = target_le.inverse_transform([pred_encoded])[0]
    return pred_dosha


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20}

Accuracy: 0.623

Confusion Matrix:
[[310 104  27]
 [ 98 251  18]
 [ 89  41  62]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.70      0.66       441
           1       0.63      0.68      0.66       367
           2       0.58      0.32      0.41       192

    accuracy                           0.62      1000
   macro avg       0.61      0.57      0.58      1000
weighted avg       0.62      0.62      0.61      1000


📌 Saved Model as: dosha_rf_model2.pkl
👉 Download from the file browser on the left in Kaggle
[CV] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, max_features=log2, min_sa