In [72]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, root_mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [74]:
df=pd.read_csv('C:\\Users\\start\\OneDrive\\Documents\\Machine Learning\\medical_conditions_dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10000 non-null  int64  
 1   full_name       10000 non-null  object 
 2   age             5445 non-null   float64
 3   gender          10000 non-null  object 
 4   smoking_status  10000 non-null  object 
 5   bmi             4652 non-null   float64
 6   blood_pressure  3766 non-null   float64
 7   glucose_levels  4756 non-null   float64
 8   condition       10000 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 703.3+ KB


In [76]:
df.drop(columns=["id","full_name"],inplace=True) #drop Unnecessary columns

In [78]:
df = df.drop_duplicates()

In [80]:
df.isnull().sum()

age               3776
gender               0
smoking_status       0
bmi               3953
blood_pressure    4839
glucose_levels    3849
condition            0
dtype: int64

#Handle Missing Values with Knn Imputer

In [110]:
imputer = KNNImputer(n_neighbors=5)
data=df.select_dtypes(include=[np.number])
imputer.fit(data)
X=imputer.transform(data)

In [111]:
X_df=pd.DataFrame(X,columns=data.columns,index=data.index)
df[data.columns]=X_df

In [86]:
df.head(10)

Unnamed: 0,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,53.540692,male,Non-Smoker,27.42342,135.209429,135.219608,Pneumonia
1,30.0,male,Non-Smoker,28.92477,105.315064,148.837937,Diabetic
2,18.0,male,Non-Smoker,35.612486,138.15331,153.485514,Pneumonia
3,54.0,male,Non-Smoker,25.621843,99.119829,110.798413,Pneumonia
4,76.0,male,Non-Smoker,26.551568,134.310935,155.19092,Diabetic
5,40.0,male,Non-Smoker,33.840723,135.082106,168.083928,Diabetic
6,49.0,male,Smoker,24.43015,131.365032,153.151126,Cancer
7,47.0,male,Non-Smoker,32.384141,115.826322,199.339699,Diabetic
8,64.4,male,Non-Smoker,39.649679,136.121826,111.475528,Diabetic
9,65.0,male,Smoker,25.640817,142.096645,149.056644,Diabetic


In [89]:
df["age"]=df["age"].astype(np.int8)
df["gender"]=df["gender"].astype("category")
df["smoking_status"]=df["smoking_status"].astype("category")
df["bmi"]=df["bmi"].astype(np.float16)
df["blood_pressure"]=df["blood_pressure"].astype(np.float16)
df["glucose_levels"]=df["glucose_levels"].astype(np.float16)
df["condition"]=df["condition"].astype("category")


In [91]:
df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
Index: 8605 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             8605 non-null   int8    
 1   gender          8605 non-null   category
 2   smoking_status  8605 non-null   category
 3   bmi             8605 non-null   float16 
 4   blood_pressure  8605 non-null   float16 
 5   glucose_levels  8605 non-null   float16 
 6   condition       8605 non-null   category
dtypes: category(3), float16(3), int8(1)
memory usage: 152.0 KB


In [93]:

Q1 = df[['age', 'bmi', 'blood_pressure', 'glucose_levels']].quantile(0.25)
Q3 = df[['age', 'bmi', 'blood_pressure', 'glucose_levels']].quantile(0.75)
IQR = Q3 - Q1

outlier_condition = ~((df[['age', 'bmi', 'blood_pressure', 'glucose_levels']] < (Q1 - 1.5 * IQR)) |
                      (df[['age', 'bmi', 'blood_pressure', 'glucose_levels']] > (Q3 + 1.5 * IQR))).any(axis=1)

df = df[outlier_condition]

Using: OneHotEncoder (for representing categorical variables as binary vectors) 
&
Label Encoder (for converting labels/words into numeric form)

In [96]:
one_hot=OneHotEncoder()
cat_columns = ['gender',"smoking_status"]
e_df = one_hot.fit_transform(df[cat_columns])
e_df=e_df.toarray()
e_df = pd.DataFrame(e_df, columns=one_hot.get_feature_names_out(cat_columns))
df = pd.concat([df.drop(columns=cat_columns), e_df], axis=1)

In [97]:
label_encoder = LabelEncoder()
df['condition'] = label_encoder.fit_transform(df['condition'])

In [100]:
df.head(10)

Unnamed: 0,age,bmi,blood_pressure,glucose_levels,condition,gender_female,gender_male,smoking_status_Non-Smoker,smoking_status_Smoker
0,53.0,27.421875,135.25,135.25,2,0.0,1.0,1.0,0.0
1,30.0,28.921875,105.3125,148.875,1,0.0,1.0,1.0,0.0
2,18.0,35.625,138.125,153.5,2,0.0,1.0,1.0,0.0
3,54.0,25.625,99.125,110.8125,2,0.0,1.0,1.0,0.0
4,76.0,26.546875,134.25,155.25,1,0.0,1.0,1.0,0.0
5,40.0,33.84375,135.125,168.125,1,0.0,1.0,1.0,0.0
6,49.0,24.4375,131.375,153.125,0,0.0,1.0,0.0,1.0
7,47.0,32.375,115.8125,199.375,1,0.0,1.0,1.0,0.0
8,64.0,39.65625,136.125,111.5,1,0.0,1.0,1.0,0.0
9,65.0,25.640625,142.125,149.0,1,0.0,1.0,0.0,1.0


In [102]:
df.drop(columns=["gender_female","smoking_status_Non-Smoker"],inplace=True)

In [104]:
df = df.rename(columns=({'gender_male': 'gender', 'smoking_status_Smoker': 'smoking_status'}) )

In [106]:
df.head(10)

Unnamed: 0,age,bmi,blood_pressure,glucose_levels,condition,gender,smoking_status
0,53.0,27.421875,135.25,135.25,2,1.0,0.0
1,30.0,28.921875,105.3125,148.875,1,1.0,0.0
2,18.0,35.625,138.125,153.5,2,1.0,0.0
3,54.0,25.625,99.125,110.8125,2,1.0,0.0
4,76.0,26.546875,134.25,155.25,1,1.0,0.0
5,40.0,33.84375,135.125,168.125,1,1.0,0.0
6,49.0,24.4375,131.375,153.125,0,1.0,1.0
7,47.0,32.375,115.8125,199.375,1,1.0,0.0
8,64.0,39.65625,136.125,111.5,1,1.0,0.0
9,65.0,25.640625,142.125,149.0,1,1.0,1.0


In [114]:
df.isnull().sum()

age               0
bmi               0
blood_pressure    0
glucose_levels    0
condition         0
gender            0
smoking_status    0
dtype: int64

Apply Standarization using StandartScaler

In [117]:
scaler = StandardScaler()
numeric_columns = ['age', 'bmi', 'blood_pressure', 'glucose_levels']
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [119]:
X = df.drop(['condition'], axis=1)
y = df['condition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  stratify=y, random_state=42)

Perform Linear Regression & KNN as regressor

In [122]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse_lr = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred) 

print("MSE:", mse)
print("MAE:", mae)
print("R Squared Score:", r2)
print("Root Mean Squared Error: ",rmse_lr)

MSE: 0.7176912131274827
MAE: 0.6940574147213712
R Squared Score: 0.00537763228779875
Root Mean Squared Error:  0.8471665793263345


In [124]:
knn_model = KNeighborsRegressor(n_neighbors=8)
knn_model.fit(X_train, y_train)


y_pred_knn = knn_model.predict(X_test)


mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)
rmse_knn = root_mean_squared_error(y_test, y_pred_knn)
mae = mean_absolute_error(y_test, y_pred) 


print("KNN Regressor - MSE:", mse_knn)
print("KNN Regressor - MAE:", mae)
print("KNN Regressor - R2 Score:", r2_knn)
print("Root Mean Squared Error For KNN: ",rmse_knn)

KNN Regressor - MSE: 0.4003265881147541
KNN Regressor - MAE: 0.6940574147213712
KNN Regressor - R2 Score: 0.44520181988334206
Root Mean Squared Error For KNN:  0.6327136699287871


In [126]:
###Compare the two models

print("\nComparison Between Linear Regression and KNN Regressor:")
print(f"Linear Regression - RMSE: {mse}, R²: {r2}")
print(f"KNN Regressor - RMSE: {mse_knn}, R²: {r2_knn}")

if mse < mse_knn and r2 > r2_knn:
    print("\nLinear Regression performs better based on lower MSE and higher R².")
elif mse_knn < mse and r2_knn > r2:
    print("\nKNN Regressor performs better based on lower MSE and higher R².")
else:
    print("\nThe performance of the two models is comparable, or trade-offs exist between MSE and R².")


Comparison Between Linear Regression and KNN Regressor:
Linear Regression - RMSE: 0.7176912131274827, R²: 0.00537763228779875
KNN Regressor - RMSE: 0.4003265881147541, R²: 0.44520181988334206

KNN Regressor performs better based on lower MSE and higher R².
