
# Import the necessary libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data

In [4]:
# Read the training and test splits (in that order) from the Colab
# working directory; the generator keeps the loop variable out of the
# notebook namespace.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Train_data.csv', '/content/test_data.csv')
)


# Display data for verification

In [5]:
# Show the first rows of each split (train first, then test) as a quick
# sanity check that both files were parsed as expected.
print(train_data.head(), test_data.head(), sep='\n')

    Glucose  Cholesterol  Hemoglobin  Platelets  White Blood Cells  \
0  0.739597     0.650198    0.713631   0.868491           0.687433   
1  0.121786     0.023058    0.944893   0.905372           0.507711   
2  0.452539     0.116135    0.544560   0.400640           0.294538   
3  0.136609     0.015605    0.419957   0.191487           0.081168   
4  0.176737     0.752220    0.971779   0.785286           0.443880   

   Red Blood Cells  Hematocrit  Mean Corpuscular Volume  \
0         0.529895    0.290006                 0.631045   
1         0.403033    0.164216                 0.307553   
2         0.382021    0.625267                 0.295122   
3         0.166214    0.073293                 0.668719   
4         0.439851    0.894991                 0.442159   

   Mean Corpuscular Hemoglobin  Mean Corpuscular Hemoglobin Concentration  \
0                     0.001328                                   0.795829   
1                     0.207938                                   0.505

# Ensure there are no missing values

In [6]:
# Per-column count of null entries for each split; a column with a
# non-zero count would need imputation or row removal before training.
print("Missing values in train data:", train_data.isnull().sum(), sep="\n")
print("Missing values in test data:", test_data.isnull().sum(), sep="\n")

Missing values in train data:
Glucose                                      0
Cholesterol                                  0
Hemoglobin                                   0
Platelets                                    0
White Blood Cells                            0
Red Blood Cells                              0
Hematocrit                                   0
Mean Corpuscular Volume                      0
Mean Corpuscular Hemoglobin                  0
Mean Corpuscular Hemoglobin Concentration    0
Insulin                                      0
BMI                                          0
Systolic Blood Pressure                      0
Diastolic Blood Pressure                     0
Triglycerides                                0
HbA1c                                        0
LDL Cholesterol                              0
HDL Cholesterol                              0
ALT                                          0
AST                                          0
Heart Rate                    

# Check the unique values in the 'Disease' column of both the training and test sets

In [7]:
# Collect the distinct target labels of each split (train first, then test)
# so they can be compared before encoding.
train_disease_unique, test_disease_unique = (
    frame['Disease'].unique() for frame in (train_data, test_data)
)
print(
    "Unique values in 'Disease' column in train data:",
    train_disease_unique,
)
print(
    "Unique values in 'Disease' column in test data:",
    test_disease_unique,
)

Unique values in 'Disease' column in train data: ['Healthy' 'Diabetes' 'Thalasse' 'Anemia' 'Thromboc']
Unique values in 'Disease' column in test data: ['Thalasse' 'Diabetes' 'Heart Di' 'Anemia' 'Thromboc' 'Healthy']


# Check that every unique 'Disease' value in the test set also appears in the training set

In [8]:
# Labels that occur in the test split but were never seen during training.
new_disease_labels = set(test_disease_unique).difference(train_disease_unique)
if len(new_disease_labels) > 0:
    # The model cannot predict these classes, so surface them early.
    print("Warning: The following labels in the test data are not present in the training data:", new_disease_labels)



# Encode the 'Disease' column using LabelEncoder


In [9]:
# Learn the label -> integer mapping from the training targets only, then
# overwrite the training 'Disease' column with the integer codes.
label_encoder = LabelEncoder().fit(train_data['Disease'])
train_data['Disease'] = label_encoder.transform(train_data['Disease'])


# Handle labels in the test set that were not seen during training
# We can either drop rows with unknown labels or assign them a sentinel value (-1 is used here)

In [10]:
# Labels the encoder was fitted on (i.e. the classes seen in training).
known_labels = set(label_encoder.classes_)

# LabelEncoder assigns each class its index in `classes_`, so the whole
# label -> code mapping can be built once here instead of calling
# `label_encoder.transform([value])` for every single row.
_disease_codes = {label: code for code, label in enumerate(label_encoder.classes_)}


def encode_disease(value):
    """Return the integer code of a disease label, or -1 if it is unknown.

    Parameters
    ----------
    value : hashable
        A raw label from the 'Disease' column.

    Returns
    -------
    int
        The index of ``value`` in ``label_encoder.classes_`` (the same code
        ``label_encoder.transform`` would produce), or ``-1`` when the label
        was not present when the encoder was fitted.
    """
    return _disease_codes.get(value, -1)

In [11]:


# Encode the test labels element-wise; labels unseen in training become -1.
# NOTE: this overwrites the raw labels, so re-running the cell on the
# already-encoded column would turn every value into -1.
test_data['Disease'] = test_data['Disease'].map(encode_disease)


# Check the unique values after encoding

In [12]:
# Distinct integer codes now present in each split; a -1 in the test
# split marks labels that were unknown to the encoder.
print(
    "Unique values in 'Disease' column after encoding in train data:",
    train_data['Disease'].unique(),
)
print(
    "Unique values in 'Disease' column after encoding in test data:",
    test_data['Disease'].unique(),
)

Unique values in 'Disease' column after encoding in train data: [2 1 3 0 4]
Unique values in 'Disease' column after encoding in test data: [ 3  1 -1  0  4  2]



# Remove test rows whose label was unknown to the encoder (encoded as -1)

In [13]:
# Keep only the test rows whose label received a real code (anything but -1).
test_data = test_data.loc[test_data['Disease'].ne(-1)]

# Separate data into Features and Target

In [14]:
# Features are every column except 'Disease'; the target is the encoded
# 'Disease' column itself.
X_train, y_train = train_data.drop(columns='Disease'), train_data['Disease']
X_test, y_test = test_data.drop(columns='Disease'), test_data['Disease']


# Train the model

In [15]:
# Random forest with a fixed seed so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)


# Prediction using the trained model

In [19]:
# Predicted class codes for the filtered test features.
y_pred = model.predict(X=X_test)



# Calculate metrics

In [20]:
# Overall fraction of correctly classified test rows.
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 (in that order), each computed with
# average='weighted'.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Show results


In [21]:
# Report the four evaluation metrics, one per line.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

Accuracy: 0.5011185682326622
Precision: 0.5887324762491151
Recall: 0.5011185682326622
F1-score: 0.5288173422401611
