In [1]:
pip install --upgrade scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [51]:
df = pd.read_csv("datasets/Training.csv")   # adjust path if needed
print("Shape:", df.shape)

Shape: (4920, 133)


In [52]:
X = df.drop(columns=["prognosis"])           # ALL 132 binary symptom columns
y = df["prognosis"]

In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(y)

print("n_features:", X.shape[1], "  n_classes:", len(le.classes_))


n_features: 132   n_classes: 41


In [54]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Hold‑out 20 % *before* any fitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.20, stratify=y_enc, random_state=42)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [55]:
aug_rows = []
for row, label in zip(X.values, y_enc):
    ones = np.where(row == 1)[0]
    for keep_frac in (0.6, 0.8):  # you can add more variants
        k = max(1, int(len(ones) * keep_frac))
        keep_idx = np.random.choice(ones, k, replace=False)
        new_row = np.zeros_like(row)
        new_row[keep_idx] = 1
        aug_rows.append((new_row, label))

# Stack originals + partials
X_aug = np.vstack([X.values] + [r for r, _ in aug_rows])
y_aug = np.hstack([y_enc] + [l for _, l in aug_rows])

print("Original size   :", X.shape)
print("Augmented size  :", X_aug.shape)

Original size   : (4920, 132)
Augmented size  : (14760, 132)


In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X_aug, y_aug, test_size=0.2, stratify=y_aug, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
rf=RandomForestClassifier(
        n_estimators=600, max_depth=None, class_weight="balanced",
        n_jobs=-1, random_state=42)
et=ExtraTreesClassifier(
        n_estimators=500, max_features="sqrt", class_weight="balanced",
        n_jobs=-1, random_state=42)
gb=GradientBoostingClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=3, random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [42]:
models={
    "randomforest":RandomForestClassifier(),
    "GradientBoosting":GradientBoostingClassifier(),
    "AdaBoost":AdaBoostClassifier()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    prediction_test=model.predict(X_test)
    prediction_train=model.predict(X_train)
    #train
    accuracy_train=accuracy_score(y_train,prediction_train)
    f1_score_train=f1_score(y_train,prediction_train,average="weighted")
    precision_train=precision_score(y_train,prediction_train,average="weighted")
    recall_train=recall_score(y_train,prediction_train,average="weighted")

    # test
    accuracy_test=accuracy_score(y_test,prediction_test)
    f1_score_test=f1_score(y_test,prediction_test,average="weighted")
    precision_test=precision_score(y_test,prediction_test,average="weighted")
    recall_test=recall_score(y_test,prediction_test,average="weighted")


    print(list(models.keys())[i])
    print("----------------------------")
    print("for Train Data set")
    print(f"accuracy:{accuracy_train:.4f} ")
    print(f"f1_score: {f1_score_train:.4f}")
    print(f"precision: {precision_train:.4f}")
    print(f"recall: {recall_train:.4f}")
    print("----------------------------")
    print("for Test Data set")
    print(f"accuracy:{accuracy_test:.4f} ")
    print(f"f1_score:{f1_score_test:.4f} ")
    print(f"precision:{precision_test:.4f}")
    print(f"recall: {recall_test:.4f}")


randomforest
----------------------------
for Train Data set
accuracy:0.9958 
f1_score: 0.9958
precision: 0.9958
recall: 0.9958
----------------------------
for Test Data set
accuracy:0.9942 
f1_score:0.9942 
precision:0.9944
recall: 0.9942
GradientBoosting
----------------------------
for Train Data set
accuracy:0.9956 
f1_score: 0.9956
precision: 0.9957
recall: 0.9956
----------------------------
for Test Data set
accuracy:0.9912 
f1_score:0.9912 
precision:0.9914
recall: 0.9912
AdaBoost
----------------------------
for Train Data set
accuracy:0.1069 
f1_score: 0.0903
precision: 0.0980
recall: 0.1069
----------------------------
for Test Data set
accuracy:0.1013 
f1_score:0.0865 
precision:0.0968
recall: 0.1013


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
gb_params={
    "loss":["log_loss","exponential"],
    "n_estimators":[1,10,100,1000],
    "criterion":["friedman_mse","squared_error"]
}

In [44]:
randomcv_models=[
    ("randomforest",RandomForestClassifier(),rf_params),
    ("gradient",GradientBoostingClassifier(),gb_params)
]

In [45]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=15, max_features=7, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV] END max_depth=15, max_features=7, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV] END max_depth=15, max_features=7, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=500; total time=   1.6s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=500; total time=   1.6s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=500; total time=   1.5s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=1000; total time=   3.1s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=1000; total time=   3.1s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=1000; total time=   3.3s
[CV] END max_depth=5, max_features=7, min_samples_split=20, n_estim

84 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/s

Fitting 3 folds for each of 16 candidates, totalling 48 fits




[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=1; total time=   1.0s
[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=1; total time=   1.1s
[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=1; total time=   1.1s
[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=10; total time=   9.4s
[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=10; total time=   9.8s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=1; total time=   0.0s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=1; total time=   0.0s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=1; total time=   0.0s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=10; total time=   0.0s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=10; total time=   0.0s
[CV] END criterion=friedman_mse, loss=log_loss, n_estimators=10; total time=   9.9s
[CV] END criterion=friedman_mse, loss=exponential, n_estimators=10;

24 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/sklearn/ensemble/_gb.py", line 431, in fit
    self._check_params()
  File "/Users/rishabhkapur/Desktop/python/venv/lib/python3.10/site-packages/s

---------------- Best Params for randomforest -------------------
{'n_estimators': 500, 'min_samples_split': 8, 'max_features': 5, 'max_depth': None}
---------------- Best Params for gradient -------------------
{'n_estimators': 100, 'loss': 'log_loss', 'criterion': 'squared_error'}


In [46]:
models={
    "randomforest":RandomForestClassifier(n_estimators= 500,min_samples_split=8,max_features=5, max_depth=None),
    "GradientBoosting":GradientBoostingClassifier(n_estimators=100, loss='log_loss',criterion='squared_error')
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    prediction_test=model.predict(X_test)
    prediction_train=model.predict(X_train)
    #train
    accuracy_train=accuracy_score(y_train,prediction_train)
    f1_score_train=f1_score(y_train,prediction_train,average="weighted")
    precision_train=precision_score(y_train,prediction_train,average="weighted")
    recall_train=recall_score(y_train,prediction_train,average="weighted")

    # test
    accuracy_test=accuracy_score(y_test,prediction_test)
    f1_score_test=f1_score(y_test,prediction_test,average="weighted")
    precision_test=precision_score(y_test,prediction_test,average="weighted")
    recall_test=recall_score(y_test,prediction_test,average="weighted")


    print(list(models.keys())[i])
    print("----------------------------")
    print("for Train Data set")
    print(f"accuracy:{accuracy_train:.4f} ")
    print(f"f1_score: {f1_score_train:.4f}")
    print(f"precision: {precision_train:.4f}")
    print(f"recall: {recall_train:.4f}")
    print("----------------------------")
    print("for Test Data set")
    print(f"accuracy:{accuracy_test:.4f} ")
    print(f"f1_score:{f1_score_test:.4f} ")
    print(f"precision:{precision_test:.4f}")
    print(f"recall: {recall_test:.4f}")


randomforest
----------------------------
for Train Data set
accuracy:0.9956 
f1_score: 0.9956
precision: 0.9957
recall: 0.9956
----------------------------
for Test Data set
accuracy:0.9946 
f1_score:0.9946 
precision:0.9947
recall: 0.9946
GradientBoosting
----------------------------
for Train Data set
accuracy:0.9956 
f1_score: 0.9956
precision: 0.9957
recall: 0.9956
----------------------------
for Test Data set
accuracy:0.9915 
f1_score:0.9915 
precision:0.9917
recall: 0.9915


In [58]:
randomforest=RandomForestClassifier(n_estimators= 500,min_samples_split=8,max_features=5, max_depth=None)
randomforest.fit(X_train,y_train)

In [13]:
rf.fit(X_train,y_train)

NameError: name 'rf' is not defined

In [25]:
cv_scoresrf=cross_val_score(rf, X_train, y_train,
                                   cv=skf, scoring="accuracy", n_jobs=-1).mean()
print(cv_scoresrf)

0.9944953648065283


In [21]:
cv_scoreset=cross_val_score(et, X_train, y_train,
                                   cv=skf, scoring="accuracy", n_jobs=-1).mean()
print(cv_scoreset)


0.9940719589175069


In [22]:
cv_scoresgb=cross_val_score(gb, X_train, y_train,
                                   cv=skf, scoring="accuracy", n_jobs=-1).mean()
print(cv_scoresgb)

KeyboardInterrupt: 

In [59]:
## we find that random forest is best 
import pickle 
pickle.dump(randomforest,open("randomforest_new.pkl",'wb'))