In [None]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns





# Location of the raw CSV, relative to the notebook's working directory.
csv_path = '../input/healthcare-dataset/healthcare_dataset.csv'

# Read the whole file into the frame used by the rest of the notebook.
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

# Data analysis and preprocessing




### Dataset info

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of distinct (non-null) values in every column, one line per column.
for column_name, distinct_count in df.nunique().items():
    print(f"{column_name}: {distinct_count}")

In [None]:
# List the distinct values found in three selected columns.
columns_to_inspect = ('Blood Type', 'Medical Condition', 'Test Results')

for column in columns_to_inspect:
    print(f"Unique values in {column}: {df[column].unique()}")

### Distribution Count based on Medical Condition


In [None]:


# Frequency of each medical condition, most common first.
medical_condition_counts = df['Medical Condition'].value_counts()

print("\nDistribution Count based on Medical Condition:")
print(medical_condition_counts)

# One colour per category, shared by both panels, so a given condition has
# the same colour and the same order in the bar chart and in the pie chart.
condition_colors = sns.color_palette('viridis', n_colors=len(medical_condition_counts))

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

# The bars are drawn from the counts computed above rather than with
# sns.countplot(..., palette=...): passing `palette` without `hue` is
# deprecated in seaborn 0.13, and countplot would order the bars
# independently of the pie chart.
bar_ax.bar(
    medical_condition_counts.index.astype(str),
    medical_condition_counts.to_numpy(),
    color=condition_colors,
)
bar_ax.set_title('Distribution of Medical Conditions - Bar Chart')
bar_ax.set_xlabel('Medical Condition')
bar_ax.set_ylabel('Count')
# Slant the category labels so long names do not overlap.
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')

pie_ax.pie(
    medical_condition_counts,
    labels=medical_condition_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=condition_colors,
)
pie_ax.set_title('Distribution of Medical Conditions - Pie Chart')

fig.tight_layout()
plt.show()


### Distribution Count based on Test Results


In [None]:


# Frequency of each test result, most common first.
test_results_counts = df['Test Results'].value_counts()

print("\nDistribution Count based on Test Results:")
print(test_results_counts)

# One colour per category, shared by both panels, so a given result has the
# same colour and the same order in the bar chart and in the pie chart.
result_colors = sns.color_palette('viridis', n_colors=len(test_results_counts))

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

# The bars are drawn from the counts computed above rather than with
# sns.countplot(..., palette=...): passing `palette` without `hue` is
# deprecated in seaborn 0.13, and countplot would order the bars
# independently of the pie chart.
bar_ax.bar(
    test_results_counts.index.astype(str),
    test_results_counts.to_numpy(),
    color=result_colors,
)
bar_ax.set_title('Distribution of Test Results - Bar Chart')
bar_ax.set_xlabel('Test Results')
bar_ax.set_ylabel('Count')
# Slant the category labels so they do not overlap.
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')

pie_ax.pie(
    test_results_counts,
    labels=test_results_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=result_colors,
)
pie_ax.set_title('Distribution of Test Results - Pie Chart')

fig.tight_layout()
plt.show()


### Distribution Count based on Gender




In [None]:


# Frequency of each gender value.
gender_counts = df['Gender'].value_counts()

# The same palette feeds both panels.
gender_palette = sns.color_palette('Dark2')

# Two panels side by side: horizontal bars on the left, pie on the right.
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

gender_counts.plot(kind='barh', ax=bar_ax, color=gender_palette)
bar_ax.set_title('Distribution by Gender - Bar Chart')
bar_ax.set_xlabel('Count')
# Hide the top and right frame lines of the bar panel.
bar_ax.spines[['top', 'right']].set_visible(False)

pie_ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=gender_palette,
)
pie_ax.set_title('Distribution by Gender - Pie Chart')

fig.tight_layout()
plt.show()


### Distribution Count based on Blood Type




In [None]:
# Number of records per blood type.
blood_type_counts = df.groupby('Blood Type').size()

fig, ax = plt.subplots()

# Request as many Dark2 colours as there are bars.
blood_type_counts.plot(
    kind='barh',
    ax=ax,
    color=sns.color_palette('Dark2', n_colors=len(blood_type_counts)),
)

# Title and axis label, matching the other distribution charts in this notebook.
ax.set_title('Distribution by Blood Type - Bar Chart')
ax.set_xlabel('Count')

# Hide the top and right frame lines.
ax.spines[['top', 'right']].set_visible(False)

plt.show()

### Contingency table of Gender and Test Results


In [None]:


# Cross-tabulate gender (rows) against test result (columns).
contingency_table = pd.crosstab(index=df['Gender'], columns=df['Test Results'])

print("\nContingency Table (Gender vs. Test Results):")
print(contingency_table)

# Show the same counts as an annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(contingency_table, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set(title="Gender vs. Test Results", xlabel="Test Results", ylabel="Gender")
plt.show()

### Contingency table of Blood Type and Test Results


In [None]:


# Cross-tabulate blood type (rows) against test result (columns).
contingency_table = pd.crosstab(index=df['Blood Type'], columns=df['Test Results'])

print("\nContingency Table (Blood Type  vs. Test Results):")
print(contingency_table)

# Show the same counts as an annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(contingency_table, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set(title="Blood Type  vs. Test Results", xlabel="Test Results", ylabel="Blood Type")
plt.show()

### Contingency table of Medical Condition and Test Results


In [None]:


# Cross-tabulate medical condition (rows) against test result (columns).
contingency_table = pd.crosstab(index=df['Medical Condition'], columns=df['Test Results'])

print("\nContingency Table (Medical Condition vs. Test Results):")
print(contingency_table)

# Show the same counts as an annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(contingency_table, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set(
    title="Medical Condition vs. Test Results",
    xlabel="Test Results",
    ylabel="Medical Condition",
)
plt.show()

### Counts of Test Results by Age Group


In [None]:


# Width of each age bucket, in years.
age_bin_width = 10

# Upper edge of the last bucket. It is never below 90 (the span covered by the
# previous fixed bins) and always strictly above the largest age in the data.
# With the fixed range(0, 100, 10) the last edge was 90, so any age >= 90 fell
# outside every bin, became NaN and silently vanished from the counts.
age_upper_edge = max(
    90,
    (int(df['Age'].max()) // age_bin_width + 1) * age_bin_width,
)

# Left-closed buckets: [0, 10), [10, 20), ...
df['Age Group'] = pd.cut(
    df['Age'],
    bins=range(0, age_upper_edge + 1, age_bin_width),
    right=False,
)

# Count rows per (age group, test result); one column per test result.
# observed=False keeps empty age groups in the table.
grouped_data = df.groupby(['Age Group', 'Test Results'], observed=False)['Age'].count().unstack()

# Grouped bar chart: one cluster per age group, one bar per test result.
ax = grouped_data.plot(kind='bar', figsize=(10, 5))
ax.set_title('Number of Ages and Test Results by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Test Results')
plt.show()

# Can Machine Learning Classification Algorithms Help Predict Test Results?

### Test classification algorithms and compare their results

In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reload the raw data so columns added during the exploratory analysis are not
# carried into the model. The first cell of the notebook reads the file from
# the ../input/ location, so that location is tried first; the bare file name
# is kept as a fallback for runs where the CSV sits next to the notebook.
data_candidates = (
    Path('../input/healthcare-dataset/healthcare_dataset.csv'),
    Path('healthcare_dataset.csv'),
)
data_path = next((path for path in data_candidates if path.exists()), data_candidates[0])
df = pd.read_csv(data_path)

# Fill missing numeric values with the column mean. numeric_only=True is
# required because the frame also holds string columns, for which
# DataFrame.mean() raises a TypeError in pandas 2.x.
# NOTE(review): the means are computed on all rows, before the train/test
# split. That is harmless only if the feature columns have no missing
# values; confirm against df.info().
df = df.fillna(df.mean(numeric_only=True))

# Integer-encode the categorical features. The encoders are kept so the codes
# can be mapped back to labels with inverse_transform.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which distance-based and linear models will interpret as magnitude; one-hot
# encoding may be more appropriate.
le_gender = LabelEncoder()
le_medical_condition = LabelEncoder()
le_blood_type = LabelEncoder()

df['Gender'] = le_gender.fit_transform(df['Gender'])
df['Medical Condition'] = le_medical_condition.fit_transform(df['Medical Condition'])
df['Blood Type'] = le_blood_type.fit_transform(df['Blood Type'])

# astype() returns a new frame, so nothing below writes into a slice of df.
# Age is cast to float up front because it is replaced by standardized values.
X = df[['Age', 'Gender', 'Blood Type', 'Medical Condition']].astype({'Age': 'float64'})
y = df['Test Results']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Own the split frames before modifying them.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows; fitting on all rows would leak test-set information into training.
scaler = StandardScaler()
X_train['Age'] = scaler.fit_transform(X_train[['Age']]).ravel()
X_test['Age'] = scaler.transform(X_test[['Age']]).ravel()

# Candidate classifiers. The randomized estimators get a fixed seed so the
# reported numbers are reproducible across runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train every model on the same split and collect its test-set metrics.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

# Report the metrics model by model.
for model_name, result in results.items():
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print("Classification Report:")
    print(result['Classification Report'])
    print("Confusion Matrix:")
    print(result['Confusion Matrix'])

## As a result, we can see that none of the machine learning classification algorithms achieves good accuracy in predicting Test Results.