In [1]:
import pandas as pd

# Read the CSV file


In [2]:
# Dataset location, resolved relative to the notebook's working directory.
file_path = 'Disease_Prediction.csv'

# Parse the CSV into a DataFrame; every later cell starts from `df`.
df = pd.read_csv(filepath_or_buffer=file_path)


In [3]:
# Show the first five rows as plain text to sanity-check the load.
print(df.head(5))

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling  silver_like_dusting  

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [8]:
# Remove the 'Unnamed: 133' column before imputation.
# NOTE(review): presumably an empty artefact column produced by the CSV
# layout -- confirm against the raw file.
# errors='ignore' keeps this cell safe to re-run: `df` is reassigned here, so
# a second execution would otherwise raise KeyError because the column has
# already been dropped.
df = df.drop(columns=['Unnamed: 133'], errors='ignore')

# Handle missing values


In [9]:
# Fill each missing value with the most frequent value of its column.
# NOTE(review): the imputer is fitted on the whole frame (target column
# included) before the train/test split; consider fitting it on the training
# features only to avoid leaking test-set statistics.
imputer = SimpleImputer(strategy='most_frequent')

# fit_transform returns a bare array, which is object-typed when the frame
# mixes numeric and string columns. Rebuild the frame with the original column
# and row labels, then let pandas infer proper dtypes again so numeric columns
# do not stay as generic objects.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
).infer_objects()

In [10]:
# Convert the 'prognosis' labels to integer class ids. The fitted encoder is
# kept so ids can be mapped back with label_encoder.inverse_transform().
label_encoder = LabelEncoder()
encoded_prognosis = label_encoder.fit_transform(df_imputed['prognosis'])
df_imputed['prognosis'] = encoded_prognosis

In [12]:
# Target vector: the encoded 'prognosis' column.
y = df_imputed['prognosis']

# Feature matrix: every column except the target.
X = df_imputed.drop(columns='prognosis')

# Split the data into training and testing sets


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression


In [14]:
# Fit a logistic-regression classifier (iteration cap of 1000) on the training
# split, then predict the held-out rows. fit() returns the estimator itself,
# so construction and training can be chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Decision Tree


In [15]:
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) because the tree
# breaks ties between equally good splits randomly; without a seed the fitted
# tree, and therefore the reported metrics, can change between runs.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)
y_pred_dec_tree = dec_tree.predict(X_test)

# Random Forest


In [16]:
# Fit a random forest on the training split and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and feature sub-sampling are random; without a
# seed the model and its metrics differ on every Restart & Run All.
rand_forest = RandomForestClassifier(random_state=42)
rand_forest.fit(X_train, y_train)
y_pred_rand_forest = rand_forest.predict(X_test)

# Evaluate the models' performance


In [17]:
def evaluate_model(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        computed with ``average='weighted'``.
    """
    # The three class-wise metrics share one call signature, so compute them
    # in a single pass over the scorer functions.
    weighted_scores = tuple(
        scorer(y_test, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return (accuracy_score(y_test, y_pred), *weighted_scores)

In [20]:
# Score each model's held-out predictions; each result is an
# (accuracy, precision, recall, f1) tuple.
log_reg_metrics = evaluate_model(y_test, y_pred_log_reg)
dec_tree_metrics = evaluate_model(y_test, y_pred_dec_tree)
rand_forest_metrics = evaluate_model(y_test, y_pred_rand_forest)

# Print one summary line per model from a single shared template.
for model_name, (accuracy, precision, recall, f1) in (
    ("Logistic Regression", log_reg_metrics),
    ("Decision Tree", dec_tree_metrics),
    ("Random Forest", rand_forest_metrics),
):
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Logistic Regression - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1-score: 1.0
Decision Tree - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1-score: 1.0
Random Forest - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1-score: 1.0
