In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Read the raw dataset into a DataFrame.
raw_csv_path = '/content/diabetes_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first rows, then summarise the column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df.info()

   year  gender   age location  race:AfricanAmerican  race:Asian  \
0  2020  Female  32.0  Alabama                     0           0   
1  2015  Female  29.0  Alabama                     0           1   
2  2015    Male  18.0  Alabama                     0           0   
3  2015    Male  41.0  Alabama                     0           0   
4  2016  Female  52.0  Alabama                     1           0   

   race:Caucasian  race:Hispanic  race:Other  hypertension  heart_disease  \
0               0              0           1             0              0   
1               0              0           0             0              0   
2               0              0           1             0              0   
3               1              0           0             0              0   
4               0              0           0             0              0   

  smoking_history    bmi  hbA1c_level  blood_glucose_level  diabetes  
0           never  27.32          5.0                  10

In [None]:
# Per-column count of missing entries in the DataFrame.
missing_values = df.isnull().sum()

# Print only the columns that have at least one missing entry.
print(missing_values.loc[missing_values.gt(0)])

Series([], dtype: int64)


In [None]:
# One LabelEncoder per column. Each encoder keeps its own module-level name
# because the fitted encoders' classes_ are written to disk when the models
# are saved.
le_gender, le_location, le_smoking_history = LabelEncoder(), LabelEncoder(), LabelEncoder()
le_age, le_bmi, le_hbA1c_level = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Column name -> encoder that replaces that column, in place, with integer codes.
# NOTE(review): the df.head() preview shows numeric values for 'age', 'bmi' and
# 'hbA1c_level'; label-encoding swaps each value for its position among the
# sorted unique values, so the spacing between values is lost -- confirm this
# is intended.
column_encoders = {
    'gender': le_gender,
    'location': le_location,
    'smoking_history': le_smoking_history,
    'age': le_age,
    'bmi': le_bmi,
    'hbA1c_level': le_hbA1c_level,
}

for column_name, encoder in column_encoders.items():
    df[column_name] = encoder.fit_transform(df[column_name])

# Check the column dtypes after the conversion.
print(df.dtypes)

year                    int64
gender                  int64
age                     int64
location                int64
race:AfricanAmerican    int64
race:Asian              int64
race:Caucasian          int64
race:Hispanic           int64
race:Other              int64
hypertension            int64
heart_disease           int64
smoking_history         int64
bmi                     int64
hbA1c_level             int64
blood_glucose_level     int64
diabetes                int64
dtype: object


In [None]:
# The columns 'gender', 'location', 'smoking_history', 'age', 'bmi' and
# 'hbA1c_level' were already label-encoded in place by the previous cell, and
# that cell's encoders (le_gender, le_location, ...) hold the original labels
# in `classes_`.
#
# This cell used to create fresh encoders and fit them on the already-encoded
# columns. That replaced the label -> code mapping with a code -> code mapping
# (so the mapping saved with the models no longer contained the original
# labels) and added 'Gender', 'Location' and 'Smoking_History' columns that
# were exact copies of the lowercase columns. Both effects are removed; the
# cell now only previews the encoded frame.
df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,Gender,Location,Smoking_History
0,2020,0,53,0,0,0,0,0,1,0,0,4,1547,4,100,0,0,0,4
1,2015,0,50,0,0,1,0,0,0,0,0,4,810,4,90,0,0,0,4
2,2015,1,39,0,0,0,0,0,1,0,0,4,1191,3,160,0,1,0,4
3,2015,1,62,0,0,0,1,0,0,0,0,4,1547,1,159,0,1,0,4
4,2016,0,73,0,1,0,0,0,0,0,0,4,1190,10,90,0,0,0,4


In [None]:
# Feature matrix: every column of the frame except the 'heart_disease' target.
X = df.drop('heart_disease', axis=1)

In [None]:
# Target vector: the 'heart_disease' column.
y = df.loc[:, 'heart_disease']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# List any columns that are still stored as strings (object dtype).
print(df.select_dtypes(include=['object']).columns)

# One-hot encode whatever categorical columns remain, dropping the first level
# of each so the dummy columns are not redundant.
df_encoded = pd.get_dummies(df, drop_first=True)

# Rebuild the features and the target from the encoded frame.
X = df_encoded.drop(columns=['heart_disease'])
y = df_encoded['heart_disease']

# Split into train and test sets. The target is heavily imbalanced (the
# recorded classification report shows 813 positives among 20000 test rows), so
# the split is stratified on y to keep the class proportions the same in both
# parts instead of leaving them to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to the test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Index([], dtype='object')


In [None]:
# Metrics recorded for every grid-search candidate (support-weighted averages).
# NOTE(review): the winning candidate is refitted by accuracy; in the recorded
# run the selected models report recall 0.00 for class 1, so a class-sensitive
# selection metric may be more appropriate -- confirm with the analysis goals.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted'
}

# Hyperparameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}

# Hyperparameter grid for Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Naive Bayes is used with its defaults, so it is not grid-searched.
nb = GaussianNB()

# Base estimators. Both are seeded with the same value as the train/test split
# so that repeated runs select and save the same models; without a seed the
# decision tree may break ties between equally good splits differently per run.
logreg = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

# 5-fold grid searches; the best candidate by accuracy is refitted on all of
# the training data.
grid_search_lr = GridSearchCV(estimator=logreg, param_grid=param_grid_lr, cv=5, n_jobs=-1,
                              verbose=2, scoring=scoring, refit='accuracy')

grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1,
                              verbose=2, scoring=scoring, refit='accuracy')

# Fit the grid searches
grid_search_lr.fit(X_train, y_train)
grid_search_dt.fit(X_train, y_train)

# Fit Naive Bayes directly (no grid search)
nb.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = grid_search_lr.predict(X_test)
y_pred_dt = grid_search_dt.predict(X_test)
y_pred_nb = nb.predict(X_test)

# Show the best parameters and the classification report for each model.
# zero_division=0 reports a metric as 0 when it is undefined (for example the
# precision of a class that is never predicted); the previous value of 1 made
# such a class appear to have perfect precision.
print("_____________________________________________________________", '\n')
print('Logistic Regression Best Parameters:', grid_search_lr.best_params_)
print('Logistic Regression Classification Report:\n',
      classification_report(y_test, y_pred_lr, zero_division=0))
print("_____________________________________________________________", '\n')
print('Decision Tree Best Parameters:', grid_search_dt.best_params_)
print('Decision Tree Classification Report:\n',
      classification_report(y_test, y_pred_dt, zero_division=0))
print("_____________________________________________________________", '\n')
print('Naive Bayes Classification Report:\n',
      classification_report(y_test, y_pred_nb, zero_division=0))
print("_____________________________________________________________", '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
_____________________________________________________________ 

Logistic Regression Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     19187
           1       1.00      0.00      0.00       813

    accuracy                           0.96     20000
   macro avg       0.98      0.50      0.49     20000
weighted avg       0.96      0.96      0.94     20000

_____________________________________________________________ 

Decision Tree Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 10}
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     19187
           1       1.00      0.00      0.00  

In [None]:
import joblib
import os

# Create the 'models/' directory if it does not exist yet.
os.makedirs('models', exist_ok=True)

# Fitted models to persist, keyed by the name used in the output file.
models = {
    'logistic_regression': grid_search_lr,
    'decision_tree': grid_search_dt,
    'naive_bayes': nb
}

# The encoder mapping and the feature column names do not depend on the model,
# so each file is written once here rather than once per loop iteration.
joblib.dump({
    'gender': le_gender.classes_,
    'location': le_location.classes_,
    'smoking_history': le_smoking_history.classes_,
    'age': le_age.classes_,
    'bmi': le_bmi.classes_,
    'hbA1c_level': le_hbA1c_level.classes_
}, 'models/mapping.pkl')
joblib.dump(X.columns.tolist(), 'models/columns.pkl')

# Save one file per model.
for model_name, model in models.items():
    # Grid-search objects expose the refitted winner as best_estimator_;
    # the plain Naive Bayes estimator has no such attribute and is saved as is.
    best_model = getattr(model, 'best_estimator_', model)

    joblib.dump(best_model, f'models/best_{model_name}.pkl')

    print(f"Saved best {model_name} model to 'models/best_{model_name}.pkl'")

Saved best logistic_regression model to 'models/best_logistic_regression.pkl'
Saved best decision_tree model to 'models/best_decision_tree.pkl'
Saved best naive_bayes model to 'models/best_naive_bayes.pkl'


In [None]:
# Write the preprocessed frame to CSV without the index column.
preprocessed_csv_path = 'Preproses_diabetes_dataset.csv'
df.to_csv(preprocessed_csv_path, index=False)