In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

In [143]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/disease-symptoms-and-patient-profile-dataset/Disease_symptom_and_patient_profile_dataset.csv


In [144]:
df = pd.read_csv('/kaggle/input/disease-symptoms-and-patient-profile-dataset/Disease_symptom_and_patient_profile_dataset.csv')

In [None]:
df.head

In [None]:
df.duplicated().sum()

In [None]:
df.head()

In [None]:
# Bar plot for Gender
sns.countplot(x='Gender', data=df)
plt.title('Distribution of Gender')
plt.show()

In [None]:
sns.catplot(x = 'Outcome Variable' , y = 'Age' , data = df , kind = "swarm")  

In [None]:
sns.displot(df['Age'] , kde=True)

In [64]:
#dropping name column
df = df.iloc[:,1:]

In [145]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
X = df.drop(['Outcome Variable'], axis=1)
y = df['Outcome Variable']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [146]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Initialize StandardScaler

In [147]:

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the feature categories
binary_cols = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']
categorical_cols = ['Blood Pressure', 'Cholesterol Level', 'Gender']
numerical_cols = ['Age']

# Define transformers
ordinal_transformer = OrdinalEncoder(categories=[['No', 'Yes']] * len(binary_cols))
onehot_transformer = OneHotEncoder(drop='first', sparse=False)
scaler = StandardScaler()

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', ordinal_transformer, binary_cols),
        ('categorical', onehot_transformer, categorical_cols),
        ('scaler', scaler, numerical_cols)
    ]
)


In [148]:
onehot_encoder = OneHotEncoder(sparse=False)

In [149]:
X_train_transformed = preprocessor.fit_transform(X_train)



In [150]:
y_train_reshaped = np.array(y_train).reshape(-1, 1)
y_test_reshaped = np.array(y_test).reshape(-1, 1)

# Fit and transform the training data
y_train_encoded = onehot_encoder.fit_transform(y_train_reshaped)

# Transform the test data
y_test_encoded = onehot_encoder.transform(y_test_reshaped)



In [151]:
X = df.drop(['Outcome Variable'], axis= 1)
y = df['Outcome Variable']

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [93]:
def compare_algorithms(X_train, y_train, algorithms, cv=5, scoring='accuracy'):
    results = {}
    
    for name, algorithm in algorithms.items():
        # Perform cross-validation
        scores = cross_val_score(algorithm, X_train, y_train, cv=cv, scoring=scoring)
        
        # Store the mean and standard deviation of the scores
        results[name] = {
            'mean_accuracy': np.mean(scores),
            'std_accuracy': np.std(scores)
        }
        
        # Print the results for each algorithm
        print(f"{name} - Mean Accuracy: {results[name]['mean_accuracy']:.4f} | Std Accuracy: {results[name]['std_accuracy']:.4f}")
    
    return results

In [None]:
def plot_accuracies(accuracies):
    """
    Plots the mean accuracies and their standard deviations for different algorithms.
    
    Parameters:
    accuracies (dict): A dictionary where keys are algorithm names and values are dictionaries
                       containing 'mean_accuracy' and 'std_accuracy'.
    """
    # Extracting algorithm names, mean accuracies, and standard deviations
    names = list(accuracies.keys())
    mean_accuracies = [accuracies[name]['mean_accuracy'] for name in names]
    std_accuracies = [accuracies[name]['std_accuracy'] for name in names]
    
    # Creating the bar plot
    plt.figure(figsize=(10, 6))
    plt.bar(names, mean_accuracies, yerr=std_accuracies, capsize=5, color='skyblue', alpha=0.7)
    
    # Adding plot details
    plt.xlabel('Algorithms')
    plt.ylabel('Mean Accuracy')
    plt.title('Algorithm Comparison: Mean Accuracy with Standard Deviation')
    plt.ylim(0, 1)  # Accuracy ranges from 0 to 1
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Display the plot
    plt.tight_layout()
    plt.show()

In [71]:
df
X_test

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level
88,No,Yes,Yes,No,35,Male,High,High
83,No,No,Yes,No,35,Male,High,High
267,Yes,No,Yes,No,55,Female,Normal,Normal
102,Yes,Yes,No,No,38,Female,High,Normal
270,No,Yes,No,No,55,Male,Normal,Low
...,...,...,...,...,...,...,...,...
263,Yes,Yes,No,No,55,Female,High,High
210,Yes,Yes,Yes,No,50,Male,Normal,High
231,No,No,Yes,No,50,Female,Normal,Normal
154,No,Yes,Yes,Yes,42,Female,Normal,Low


In [154]:
# importing libraries
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression()
lreg.fit(x_train_transformed,y_train)

In [155]:
from sklearn.metrics import accuracy_score
# predicting values and checking accuracy
lpred = lreg.predict(x_test_transformed)
accuracy_score(lpred , y_test)

0.6285714285714286

In [156]:
#importing libraries
from sklearn.neighbors import KNeighborsClassifier

In [157]:
#implementing KNN
knn = KNeighborsClassifier()
knn.fit(x_train_transformed,y_train)

In [158]:
#predicting values and testing accuracy
kpred = knn.predict(x_test_transformed)
accuracy_score(kpred,y_test)

0.7

In [159]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [160]:
# Define the classifier model
classifier = RandomForestClassifier(n_estimators=100, random_state=0)

# Create the final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on test data
accuracy = pipeline.score(X_test, y_test)
print(f'Model accuracy: {accuracy:.2f}')

Model accuracy: 0.70


In [141]:

# Define the algorithms
algorithms = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0)
}

# Compare the algorithms
results = compare_algorithms(preprocessor.fit_transform(X_train), y_train, algorithms)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Mean Accuracy: 0.6272 | Std Accuracy: 0.0795
K-Nearest Neighbors - Mean Accuracy: 0.6562 | Std Accuracy: 0.0521
Random Forest - Mean Accuracy: 0.6919 | Std Accuracy: 0.0730
