In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # Import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [72]:
url = "https://raw.githubusercontent.com/MitaliP001/patient-treatment-classification/main/training_set.csv"
data = pd.read_csv(url)

In [73]:
# Print the first few rows (default is 5 rows)
print(data.head())

   HAEMATOCRIT  HAEMOGLOBINS  ERYTHROCYTE  LEUCOCYTE  THROMBOCYTE   MCH  MCHC  \
0         33.8          11.1         4.18        4.6          150  26.6  32.8   
1         44.6          14.0         6.86        6.3          232  20.4  31.4   
2         42.9          14.0         4.57        6.2          336  30.6  32.6   
3         41.9          14.4         4.67        3.5          276  30.8  34.4   
4         40.6          13.3         4.85       14.9          711  27.4  32.8   

    MCV  AGE SEX  SOURCE  
0  80.9   33   F       1  
1  65.0   36   M       0  
2  93.9   70   F       0  
3  89.7   18   F       0  
4  83.7   36   M       0  


In [74]:
# Select the columns to use as features (independent variables)
# Feature selection and engineering
X = data[['HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 'THROMBOCYTE', 'MCH', 'MCHC']]
# Select the target variable
y = data['SOURCE']

In [75]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [76]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [77]:
# Define a range of K values to test
k_values = range(1, 10)  # Test K values from 1 to 20

In [78]:
best_k = None
best_score = float('inf')  # Initialize with a high value for MSE

In [79]:
# Iterate through K values and find the best K
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    if mse < best_score:
        best_score = mse
        best_k = k


In [80]:
print(f"Best K: {best_k}")
print(f"Lowest MSE: {best_score}")

Best K: 9
Lowest MSE: 0.20288314497780766


In [81]:
# Assuming you have already trained your KNN model with best K
best_k = 9


In [88]:
new_data = pd.DataFrame({
    'HAEMATOCRIT': [33.8],     # Actual value from your dataset
    'HAEMOGLOBINS': [11.1],    # Actual value from your dataset
    'ERYTHROCYTE': [4.18],     # Actual value from your dataset
    'LEUCOCYTE': [4.6],       # Actual value from your dataset
    'THROMBOCYTE': [150],     # Actual value from your dataset
    'MCH': [26.6],            # Actual value from your dataset
    'MCHC': [32.8]            # Actual value from your dataset
})

In [89]:
# Standardize the new data using statistics from your training data
new_data_mean = X_train.mean()
new_data_std = X_train.std()
new_data_scaled = (new_data - new_data_mean) / new_data_std

In [90]:
# Create a KNN regressor with the best K
knn = KNeighborsRegressor(n_neighbors=best_k)

In [91]:
# Fit the model with the training data (X_train, y_train)
knn.fit(X_train, y_train)

In [92]:
# Make predictions for the new data
predictions = knn.predict(new_data_scaled)



In [93]:
# The 'predictions' variable now contains the predicted 'SOURCE' values
print(predictions)

[0.77777778]


In [94]:
#In this KNN regression analysis, the model assesses the patient's probability of being under medical care or not, taking into account their medical indicators. The prediction suggests that there's roughly a 77.78% chance that the patient is receiving medical care, making it a valuable tool for healthcare risk evaluation.