In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from scipy.stats import chi2_contingency

In [2]:
pip install --upgrade scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip show scikit-learn

Name: scikit-learn
Version: 1.4.1.post1
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /home/sai_abbhiram/.local/lib/python3.11/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: 
Note: you may need to restart the kernel to use updated packages.


## DATA UNDERSTANDING

In [4]:
# Location of the dataset. The original absolute path only exists on one
# machine (the recorded run failed with FileNotFoundError), so it can be
# overridden through the HEALTH_CARE_CSV environment variable; the original
# path is kept as the default for backward compatibility.
DATA_PATH = os.environ.get('HEALTH_CARE_CSV', 'D:/COAPPS INTERN/health_care.csv')
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/COAPPS INTERN/health_care.csv'

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
 # Dimensions of the dataset as a (rows, columns) tuple.
 df.shape

In [None]:
 # Call info(): a bare `df.info` only displays the bound-method repr
 # instead of the per-column dtype / non-null summary.
 df.info()

In [None]:
# List every column name on its own line under a heading.
print("Columns in the dataset:")
for col_name in df.columns:
    print(col_name)

In [None]:
 # Summary statistics of the dataset using describe()'s default settings.
 df.describe()

In [None]:
# Summary statistics restricted to the object-dtype ('O') columns.
df.describe(include='O')

In [None]:
# Show the distinct values found in each column.
for column, series in df.items():
    unique_values = series.unique()
    print(f"Unique values in {column}: {unique_values}")

## DATA CLEANING

In [None]:
# Remove the 'Name' column in place. errors='ignore' makes the cell safe to
# re-run: without it a second execution raises KeyError because the column
# is already gone.
df.drop(columns=['Name'], inplace=True, errors='ignore')

In [None]:
# Preview the data again after dropping the 'Name' column.
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Dimensions of the dataset after removing duplicate rows.
df.shape

In [None]:
def detect_outliers_iqr(column, multiplier=1.5):
    """Return the entries of ``column`` that lie outside the IQR fences.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to inspect.
    multiplier : float, default 1.5
        How many interquartile ranges beyond the first/third quartile a
        value may lie before it is reported. The default reproduces the
        previously hard-coded 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The values strictly below ``Q1 - multiplier * IQR`` or strictly
        above ``Q3 + multiplier * IQR``, keeping their original index
        labels. Empty when nothing falls outside the fences.
    """
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    # Comparisons are strict, so values sitting exactly on a fence are kept.
    outliers = column[(column < lower_bound) | (column > upper_bound)]
    return outliers

# Run the IQR check on the 'Age' column and report what it finds.
outliers_age = detect_outliers_iqr(df['Age'])

if outliers_age.empty:
    print("No outliers found in the 'Age' column.")
else:
    print("Outliers in the 'Age' column:")
    print(outliers_age)


## EXPLORATORY DATA ANALYSIS

In [None]:
# For each column, print its five most common values followed by a blank line.
for col in df.columns:
    most_frequent_values = df[col].value_counts().head(5)
    print(f"Most frequent values in {col}:", most_frequent_values, "", sep="\n")

In [None]:
from scipy.stats import chi2_contingency
import random

# Chi-square test of independence between every column and 'Medication'.
# Each column is cross-tabulated against the target and the resulting
# p-value is collected as a (column, p_value) pair.
p_values = []

for column in df.columns:
    if column == 'Medication':
        continue  # the target is not tested against itself
    contingency_table = pd.crosstab(df[column], df['Medication'])
    chi2_stat, p_val, _, _ = chi2_contingency(contingency_table)
    p_values.append((column, p_val))

# The p-values used to be computed but never shown; display them as a
# table sorted from the smallest p-value upwards.
pd.DataFrame(p_values, columns=['Column', 'p-value']).sort_values('p-value').reset_index(drop=True)


In [None]:
# Histogram of the 'Age' column using 30 bins with black bar outlines.
fig, ax = plt.subplots(figsize=(6, 2))
ax.hist(df['Age'], bins=30, edgecolor="black")
ax.set_xlabel('AGE')
ax.set_ylabel('FREQUENCY')
ax.set_title('VISUALIZATION OF THE AGE GROUPS MOST FREQUENTLY AFFECTED BY ILLNESS')
plt.show()

In [None]:
# Horizontal count plot of 'Gender', bars ordered by frequency.
custom_palette = {'Male': 'red', 'Female': 'purple'}
plt.figure(figsize=(6, 2))
# The plotted variable is also assigned to `hue`: passing `palette` without
# `hue` is deprecated in seaborn 0.13+, and this matches how the
# 'Test Result' count plot in this notebook is written.
sns.countplot(y='Gender', hue='Gender', data=df, order=df['Gender'].value_counts().index, palette=custom_palette)
plt.title('GENDER DISTRIBUTION')
plt.xlabel('COUNT')
plt.ylabel('GENDER')
plt.show()

In [None]:
# Pie chart of the 'Blood Type' value counts with one-decimal percentage labels.
slice_colors = ['red', 'blue', 'green', 'pink', 'purple', 'yellow', 'lightblue', 'lightgreen']
plt.figure(figsize=(6, 4))
blood_type_counts = df['Blood Type'].value_counts()
blood_type_counts.plot.pie(autopct='%1.1f%%', colors=slice_colors)
plt.title('BLOOD TYPE DISTRIBUTION')
plt.ylabel('')  # blank out the y-axis label
plt.show()


In [None]:
# Horizontal bar chart of how many rows fall under each disease.
plt.figure(figsize=(8, 7))
# Counts are sorted ascending before plotting as horizontal bars.
df['Disease'].value_counts().sort_values().plot(kind='barh', color='red')
plt.title('THE AGGREGATE COUNT OF INDIVIDUALS AFFLICTED BY A SPECIFIC ILLNESS.')
plt.xlabel('COUNT OF PEOPLE')  # typo fix: label previously read 'PEPOPLE'
plt.ylabel('LIST OF DISEASE')
plt.show()

In [None]:
# Count plot of the 'Test Result' column, one pastel colour per category.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(x='Test Result', hue='Test Result', data=df, palette='pastel', ax=ax)
ax.set_title(' COUNT OF TEST RESULT')
ax.set_xlabel('TEST RESULT')
ax.set_ylabel('COUNT')
plt.show()

## DATA PREPROCESSING

In [None]:
# Feature set: two categorical columns plus 'Age'; target: 'Medication'.
categorical_columns = ['Disease', 'Test Result']
numerical_columns = ['Age']

feature_columns = [*categorical_columns, *numerical_columns]
X = df[feature_columns]
y = df['Medication']

In [None]:
# One-hot encode the categorical columns (dropping the first level of each)
# and pass the remaining columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_columns),
    ],
    remainder='passthrough'
)

# Encode straight from `df` rather than from `X`: the old `X = fit_transform(X)`
# overwrote its own input, so re-running the cell fed the already-encoded
# matrix back into the transformer. The resulting `X` is the same on a
# first run.
# NOTE(review): the encoder is fitted on all rows before the train/test
# split — confirm this is acceptable for the evaluation below.
X = preprocessor.fit_transform(df[categorical_columns + numerical_columns])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=22,
)

## DATA MODELLING

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, trained on the training split.
classifier = DecisionTreeClassifier(random_state=2)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn import metrics

# Score the decision tree on the held-out test split, as a percentage.
y_pred = classifier.predict(X_test)
accuracy = 100 * metrics.accuracy_score(y_test, y_pred)
print("Decision Tree Classifier")
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
import warnings

# Suppress UserWarning messages from here on (the filter is process-wide),
# then run 5-fold cross-validation of the decision tree on the training split.
warnings.filterwarnings(action="ignore", category=UserWarning)
cv_scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)
print("CROSS VALIDATION FOR DECISION TREE CLASSIFIER")
print("CROSS VALIDATION SCORES:", cv_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, trained on the training split.
classifier2 = RandomForestClassifier(random_state=22)
classifier2.fit(X=X_train, y=y_train)

In [None]:
# Training-set predictions; kept from the original cell but not used for
# the reported accuracy.
y_pred_train = classifier2.predict(X_train)
# Bug fix: the accuracy used to be computed from a stale `y_pred` left over
# from the decision tree cell. Predict the test split with the random
# forest itself before scoring.
y_pred = classifier2.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)*100
print("Random Forest Classifier")
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
# 5-fold cross-validation of the random forest on the training split.
cv_scores = cross_val_score(estimator=classifier2, X=X_train, y=y_train, cv=5)
print("CROSS VALIDATION FOR RANDOM FOREST CLASSIFIER")
print("CROSS VALIDATION SCORES:", cv_scores)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Scale without centering (with_mean=False) so sparse matrices are supported.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-nearest-neighbours with default settings, trained on the scaled data.
classifier4 = KNeighborsClassifier()
classifier4.fit(X_train_scaled, y_train)


In [None]:
# Bug fix: KNN was trained on the scaled features, so it must also be
# evaluated on the scaled test split (it previously received raw X_test).
y_pred = classifier4.predict(X_test_scaled)
accuracy = metrics.accuracy_score(y_test, y_pred)*100
print("KNN")
print("Accuracy: {:.2f}".format(accuracy))  # label typo fixed: was "TAccuracy"

In [None]:
# 5-fold cross-validation of KNN on the scaled training split.
cv_scores = cross_val_score(estimator=classifier4, X=X_train_scaled, y=y_train, cv=5)
print("CROSS VALIDATION FOR KNN")
print("CROSS VALIDATION SCORES:", cv_scores)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a fixed seed, trained on the scaled data.
classifier6 = SVC(random_state=29)
classifier6.fit(X=X_train_scaled, y=y_train)


In [None]:
# Score the SVC on the scaled test split, as a percentage.
y_pred = classifier6.predict(X_test_scaled)
accuracy = 100 * metrics.accuracy_score(y_test, y_pred)
print("Support Vector Machines")
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
# 5-fold cross-validation of the SVC on the scaled training split.
cv_scores = cross_val_score(estimator=classifier6, X=X_train_scaled, y=y_train, cv=5)
print("CROSS VALIDATION FOR SUPPORT VECTOR MACHINES")
print("CROSS VALIDATION SCORES:", cv_scores)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, trained on the scaled data.
logreg = LogisticRegression(random_state=9)
logreg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score logistic regression on the scaled test split, as a percentage.
y_pred = logreg.predict(X_test_scaled)
accuracy = metrics.accuracy_score(y_test, y_pred)*100
# Label fix: this cell evaluates `logreg` but used to print
# "Support Vector Machines" (copied from the SVC cell).
print("Logistic Regression")
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
# 5-fold cross-validation of logistic regression on the scaled training split.
cv_scores = cross_val_score(estimator=logreg, X=X_train_scaled, y=y_train, cv=5)
print("CROSS VALIDATION FOR LOGISTIC REGRESSION")
print("CROSS VALIDATION SCORES:", cv_scores)

In [None]:
def prediction(age, disease, test_result):
    """Predict the medication for a single record.

    The three inputs are placed in a one-row DataFrame with the columns
    'Age', 'Disease' and 'Test Result', encoded with the module-level
    ``preprocessor`` and passed to the module-level ``classifier``.

    Returns the prediction as a one-element 1-D array.
    """
    sample = pd.DataFrame(
        {'Age': [age], 'Disease': [disease], 'Test Result': [test_result]}
    )
    encoded_sample = preprocessor.transform(sample)
    # reshape(1, -1)[0] turns the single prediction into a flat row.
    return classifier.predict(encoded_sample).reshape(1, -1)[0]

# Example query for a single record.
age, disease, test_results = 35, 'Tuberculosis', 'Inconclusive'
result = prediction(age=age, disease=disease, test_result=test_results)
print(result)

In [None]:
import joblib

# Persist the fitted decision tree to disk.
joblib.dump(value=classifier, filename='decision_tree_model.pkl')

In [None]:
# Persist the fitted preprocessing transformer to disk.
joblib.dump(value=preprocessor, filename='preprocessing_pipeline.pkl')