In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df = pd.read_csv("HAM10000_metadata_cleaned.csv")

# Encode categorical columns
df_encoded = df.copy()
categorical_cols = ['Patient_Gender', 'Body_Location', 'Diagnosis_Label']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    encoders[col] = le

# Encode the target 'dx'
from sklearn.preprocessing import LabelEncoder
target_le = LabelEncoder()
df_encoded['Diagnosis_Label'] = target_le.fit_transform(df_encoded['Diagnosis_Label'].astype(str))


In [11]:
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object' and col not in ['Image_ID', 'Lesion_ID']:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])

In [13]:
X = df_encoded.drop(columns=['Image_ID', 'Lesion_ID', 'Diagnosis_Label'])
y = df_encoded['Diagnosis_Label']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)  


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_le.classes_))

Accuracy: 0.7264103844233649
              precision    recall  f1-score   support

           0       0.24      0.09      0.13        69
           1       0.29      0.24      0.26        93
           2       0.48      0.41      0.44       228
           3       0.73      0.29      0.41        28
           4       0.84      0.94      0.89      1338
           5       0.34      0.27      0.30       226
           6       0.47      0.33      0.39        21

    accuracy                           0.73      2003
   macro avg       0.48      0.37      0.40      2003
weighted avg       0.69      0.73      0.70      2003



In [19]:
pip install imbalanced-learn xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/150.0 MB 2.2 MB/s eta 0:01:07
   ---------------------------------------- 1.3/150.0 MB 2.2 MB/s eta 0:01:07
   ---------------------------------------- 1.8/150.0 MB 2.2 MB/s eta 0:01:08
    --------------------------------------- 2.1/150.0 MB 2.1 MB/s eta 0:01:11
    --------------------------------------- 2.6/150.0 MB 2.1 MB/s eta 0:01:11
    --------------------------------------- 3.1/150.0 MB 2.1 MB/s eta 0:01:10
    --------------------------------------- 3.4/150.0 MB 2.1 MB/s eta 0:01:10
   - -------------------------------------- 3.9/150.0 MB 2.1 MB/s eta 0:01:09
   - -------------------------------------- 4.5/150.0 MB 2.1 MB/s eta 0:01:08
   - --

In [31]:
print(X.dtypes)

Diagnosis_Method      int32
Patient_Age         float64
Patient_Gender        int32
Body_Location         int32
dtype: object


In [41]:
from sklearn.preprocessing import LabelEncoder

# Make a new encoded DataFrame from original df
df_encoded = df.copy()
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object' and col not in ['Image_ID', 'Lesion_ID']:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))



In [43]:
df_encoded = df_encoded.dropna()

X = df_encoded.drop(columns=['Image_ID', 'Lesion_ID', 'Diagnosis_Label'])
y = df_encoded['Diagnosis_Label']

In [45]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [49]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

model = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [51]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_le.classes_))

Accuracy: 0.7254118821767349
              precision    recall  f1-score   support

           0       0.21      0.09      0.12        69
           1       0.30      0.18      0.23        93
           2       0.46      0.40      0.43       228
           3       0.70      0.25      0.37        28
           4       0.82      0.95      0.88      1338
           5       0.37      0.25      0.30       226
           6       0.54      0.33      0.41        21

    accuracy                           0.73      2003
   macro avg       0.49      0.35      0.39      2003
weighted avg       0.68      0.73      0.70      2003

