In [5]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

class ObesityModelTrainer:
    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.label_encoder = None
        self.scaler = None
        self.feature_names = None
        self.model = None
        self.target = "NObeyesdad"
        self.numerical = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
        self.categorical = [
            'Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC',
            'CAEC', 'CALC', 'MTRANS'
        ]

    def load_data(self):
        df = pd.read_csv(self.data_path)
        # Anomali age: "44 years" -> 44
        def clean_age(x):
            if isinstance(x, str):
                if 'year' in x:
                    return int(x.split()[0])
                elif x.isdigit():
                    return int(x)
                else:
                    return np.nan
            elif pd.isna(x):
                return np.nan
            else:
                return int(x)
        df['Age'] = df['Age'].apply(clean_age)
        # Drop duplikat
        df = df.drop_duplicates().reset_index(drop=True)
        self.df = df
        print("Data loaded, age cleaned, duplicates dropped.")

    def handle_missing(self):
        df = self.df
        # Impute numerik
        for col in self.numerical:
            if df[col].isnull().sum() > 0:
                skew = df[col].skew()
                fill_value = df[col].median() if abs(skew) > 1 else df[col].mean()
                df[col] = df[col].fillna(fill_value)
        # Impute kategorikal
        for col in self.categorical + [self.target]:
            if df[col].isnull().sum() > 0:
                mode_val = df[col].mode().iloc[0]
                df[col] = df[col].fillna(mode_val)
        # Drop sisa missing (jika ada)
        df = df.dropna().reset_index(drop=True)
        self.df = df
        print("Missing values handled.")

    def handle_categorical(self):
        df = self.df
        # Gabung minor kategori
        df['CAEC'] = df['CAEC'].replace({'Always': 'Often', 'Frequently': 'Often', 'no': 'Never'})
        df['CALC'] = df['CALC'].replace({'Frequently': 'Sometimes'})
        df['MTRANS'] = df['MTRANS'].replace({'Walking': 'Other', 'Bike': 'Other', 'Motorbike': 'Other'})
        self.df = df
        print("Minor kategori digabung.")

    def encode_and_scale(self):
        df = self.df.copy()
        # One-hot untuk fitur kategorikal
        df = pd.get_dummies(df, columns=self.categorical, drop_first=True)
        # Label encoding target
        le = LabelEncoder()
        df[self.target] = le.fit_transform(df[self.target])
        self.label_encoder = le
        # Pisah X, y
        X = df.drop(columns=[self.target])
        y = df[self.target]
        # Train-test split stratified
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        # Scaling numerik saja
        scaler = StandardScaler()
        X_train.loc[:, self.numerical] = scaler.fit_transform(X_train[self.numerical])
        X_test.loc[:, self.numerical] = scaler.transform(X_test[self.numerical])
        self.scaler = scaler
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.feature_names = X_train.columns.tolist()
        print("Encoded & scaled.")

    def train(self):
        # Param terbaik hasil gridsearch
        self.model = RandomForestClassifier(
            class_weight='balanced',
            max_depth=None,
            min_samples_leaf=1,
            min_samples_split=2,
            n_estimators=200,
            random_state=42
        )
        self.model.fit(self.X_train, self.y_train)
        print("Model trained.")

    def evaluate(self):
        y_pred = self.model.predict(self.X_test)
        print("Accuracy:", accuracy_score(self.y_test, y_pred))
        print("Precision:", precision_score(self.y_test, y_pred, average='macro'))
        print("Recall:", recall_score(self.y_test, y_pred, average='macro'))
        print("F1-score:", f1_score(self.y_test, y_pred, average='macro'))
        print(classification_report(self.y_test, y_pred))

    def save(self, path="best_model.pkl"):
        with open(path, "wb") as f:
            pickle.dump({
                "model": self.model,
                "scaler": self.scaler,
                "label_encoder": self.label_encoder,
                "feature_names": self.feature_names,
                "numerical": self.numerical
            }, f)
        print(f"Model & pipeline saved to {path}")

# --- Training ---
if __name__ == "__main__":
    trainer = ObesityModelTrainer("ObesityDataSet1.csv")
    trainer.load_data()
    trainer.handle_missing()
    trainer.handle_categorical()
    trainer.encode_and_scale()
    trainer.train()
    trainer.evaluate()
    trainer.save()


Data loaded, age cleaned, duplicates dropped.
Missing values handled.
Minor kategori digabung.
Encoded & scaled.
Model trained.
Accuracy: 0.9523809523809523
Precision: 0.951204533716849
Recall: 0.9501993901008678
F1-score: 0.9501654138128434
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        27
           1       0.89      0.86      0.88        29
           2       0.95      1.00      0.97        35
           3       1.00      0.97      0.98        30
           4       1.00      1.00      1.00        32
           5       0.92      0.86      0.89        28
           6       0.97      0.97      0.97        29

    accuracy                           0.95       210
   macro avg       0.95      0.95      0.95       210
weighted avg       0.95      0.95      0.95       210

Model & pipeline saved to best_model.pkl
