In [3]:
# 1️⃣ Import Library
import os
import re
import pickle
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# 2️⃣ Load Dataset
DATA_PATH = "backend/data/Dataset.xlsx"

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"❌ Dataset tidak ditemukan di path: {DATA_PATH}")

df = pd.read_excel(DATA_PATH)

print("Jumlah data:", len(df))
print("Kolom:", list(df.columns))
print(df.head(3))


Jumlah data: 50
Kolom: ['ID', 'Nama_Penyakit', 'Kategori', 'Gejala', 'Bobot_Probabilitas', 'Tingkat_Keparahan', 'Obat_Cocok', 'Rekomendasi_Mandiri', 'Kapan_ke_Dokter', 'prevalence_score', 'Probabilitas_Penyakit']
     ID                Nama_Penyakit    Kategori  \
0  P001                    Flu Biasa  Pernapasan   
1  P002              Influenza (Flu)  Pernapasan   
2  P003  Demam Berdarah Dengue (DBD)     Infeksi   

                                              Gejala  \
0  demam ringan; pilek; batuk; sakit tenggorokan;...   
1  demam tinggi mendadak; nyeri otot hebat; sakit...   
2  demam tinggi; nyeri sendi; sakit kepala; nyeri...   

               Bobot_Probabilitas Tingkat_Keparahan  \
0     0.25, 0.25, 0.2, 0.15, 0.15            Ringan   
1       0.3, 0.2, 0.15, 0.2, 0.15            Sedang   
2  0.25, 0.2, 0.2, 0.15, 0.1, 0.1            Serius   

                                 Obat_Cocok  \
0                  Parasetamol, Dekongestan   
1  Oseltamivir (jika indikasi), Parase

In [5]:
# 3️⃣ Preprocessing Text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["Gejala"].apply(clean_text)

# gunakan kolom Gejala untuk fitur dan Nama_Penyakit sebagai label
X = df["clean_text"]
y = df["Nama_Penyakit"]

In [6]:
# 4️⃣ Split Dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Data train: {len(X_train)}, Data test: {len(X_test)}")

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# 5️⃣ TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# 6️⃣ Train Naive Bayes
print("\n=== Naive Bayes ===")
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

print("Akurasi Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

In [None]:
# 7️⃣ Train Random Forest
print("\n=== Random Forest ===")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)

print("Akurasi Random Forest:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

In [None]:
# 8️⃣ Simpan Model & Vectorizer
MODEL_DIR = "../backend/model"
os.makedirs(MODEL_DIR, exist_ok=True)

with open(f"{MODEL_DIR}/naive_bayes_model.pkl", "wb") as f:
    pickle.dump(nb_model, f)

with open(f"{MODEL_DIR}/random_forest_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

with open(f"{MODEL_DIR}/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("\n✅ Semua model berhasil disimpan ke folder backend/model/")