In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training set from the project-relative Data directory.
# A forward slash is used because it resolves on Windows as well as POSIX; the
# previous raw string r"Data\\Training.csv" contained two literal backslashes,
# which is only interpretable as a path separator on Windows.
data = pd.read_csv("Data/Training.csv")
# Preview the first rows as a quick sanity check of the load.
data.head()

In [3]:
# DataFrame.shape is a (rows, columns) pair, so it can be unpacked directly.
row, col = data.shape
print(f"Row: {row}, col: {col}")

In [4]:
# Per-column count of missing values (the dataset is expected to arrive pre-cleaned).
data.isna().sum()

In [5]:
# List the distinct target labels in order of first appearance.
print(pd.unique(data['prognosis']))

In [6]:
# Number of distinct target labels; dropna=False keeps the count identical to
# len(unique()), which would also count a missing value as its own entry.
prognosis_count = data['prognosis'].nunique(dropna=False)
print(f"There are {prognosis_count} unique diseases for classification.")

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
# Features: every column except the target.
X = data.drop(columns=['prognosis'])
# Target: the raw 'prognosis' labels.
y = data.loc[:, 'prognosis']

In [9]:
# Display the feature matrix (all columns of `data` except 'prognosis').
X

In [10]:
# Display the raw 'prognosis' target column as loaded from the CSV.
y # before label encoding

In [11]:
# Map each target label to an integer class code for the estimators.
# The fitted encoder is kept so the codes can be mapped back to labels later.
label_encoder = LabelEncoder().fit(y)
Y = label_encoder.transform(y)

In [12]:
# Display the integer class codes produced by the label encoder.
Y # after label encoding

In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=20,
)

In [14]:
# Shapes of the four split parts, in the order: X_train, X_test, Y_train, Y_test.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

### _Training using Top Models_

In [15]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
# Instantiate the candidate classifiers.
#
# This cell binds the name `SVC` to an estimator *instance*, and the later
# `models` dict relies on that binding. Constructing it through a separately
# aliased import keeps the cell re-runnable: with `SVC = SVC(...)` a second
# execution would call the instance and raise TypeError.
from sklearn.svm import SVC as SVCEstimator

SVC = SVCEstimator(kernel='linear')
RF = RandomForestClassifier(n_estimators=100, random_state=42)
GB = GradientBoostingClassifier(n_estimators=100, random_state=42)
NB = MultinomialNB()
KNN = KNeighborsClassifier(n_neighbors=5)

In [17]:
# Display name -> estimator instance. Every entry is fitted and evaluated on
# the same train/test split in the loop below.
models = {
    "Support Vector Classifier": SVC,
    "Random Forest Classifier": RF,
    "Gradient Boosting Classifier": GB,
    "Naive Bayes Classifier": NB,
    "K Nearest Neighbour": KNN
}

for model_name, model in models.items():
    # Fit on the training split.
    model.fit(X_train, Y_train)
    # Predict once on the held-out split and reuse the result for every metric.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    # For scikit-learn classifiers `score` is the mean accuracy on the given
    # data, so the value is reused instead of running a second prediction pass.
    score = accuracy
    # Rows are true classes, columns are predicted classes.
    cm = confusion_matrix(Y_test, y_pred)
    print(f"Model: {model_name}, Score: {score}, Accuracy: {accuracy}")
    # Single quotes inside the replacement field: reusing the enclosing double
    # quote is a SyntaxError on Python versions before 3.12.
    print(f"Confusion Matrix:\n{np.array2string(cm, separator=', ')}\n")

### _Prediction Test_

In [18]:
# Select the linear SVC, fit it on the training split and report its
# accuracy and confusion matrix on the held-out split.
model_svc = models["Support Vector Classifier"]
model_svc.fit(X_train, Y_train)
y_pred = model_svc.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
print(f"Model Support Vector Classifier: {model_svc}")
# Accuracy is shown as a whole-number percentage.
print(f"Accuracy: {round(accuracy*100)}%")
# Single quotes inside the replacement field: reusing the enclosing double
# quote is a SyntaxError on Python versions before 3.12.
print(f"Confusion Matrix:\n{np.array2string(cm, separator=', ')}")

### _Saving the Model_

In [19]:
import pickle
# Serialize the fitted SVC to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails; a bare open() left the handle
# dangling. The path literal is unchanged so it matches the load cell.
with open('Model\\model.pkl', 'wb') as model_file:
    pickle.dump(model_svc, model_file)

In [20]:
# test - 1
# Take the first held-out row and shape it as a single-sample 2-D array,
# which is the input layout predict() expects.
test_data_1 = np.reshape(X_test.iloc[0].to_numpy(), (1, -1))
print(test_data_1)

In [21]:
# Reload the pickled model and check it against the first held-out sample.
# The context manager closes the file handle, which a bare open() left open.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook or another trusted source.
with open('Model\\model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
pred = model.predict(test_data_1)
print("Model Prediction", pred[0]) # model predicted output
# Y_test[0] is the label at the same position as X_test.iloc[0]; plain integer
# indexing replaces the roundabout Y_test[[0]][0].
print("Actual Output", Y_test[0]) # actual output

In [22]:
# test - 2
# Take the held-out row at position 69 and shape it as a single-sample 2-D array.
test_data_2 = np.reshape(X_test.iloc[69].to_numpy(), (1, -1))
print(test_data_2)

In [23]:
# Predict the second probe sample with the reloaded model and print the encoded
# prediction next to the encoded ground truth at the same position.
pred = model.predict(test_data_2)
print("Model Prediction", pred[0])
print("Actual Output", Y_test[69])