In [17]:
# Import library yang diperlukan
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# 1. Load data
data = pd.read_csv('datasets-jadi.csv') 


In [3]:
# 2. Praproses Data
# Hapus kolom yang tidak diperlukan
data_cleaned = data.drop(columns=["NO", "NAMA"]).dropna()

In [4]:
# Encode kolom kategori menjadi numerik
label_encoders = {}
for column in data_cleaned.columns:
    if data_cleaned[column].dtype == 'object':
        le = LabelEncoder()
        data_cleaned[column] = le.fit_transform(data_cleaned[column])
        label_encoders[column] = le

In [5]:
# Pisahkan fitur dan target
X = data_cleaned.drop(columns="PE/Non PE")
y = data_cleaned["PE/Non PE"]

In [6]:
# Bagi data menjadi train dan test (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
# 3. Modeling dengan Naive Bayes
# Melatih model Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

GaussianNB()

In [8]:
# Prediksi dan evaluasi Naive Bayes
y_pred_nb = nb_model.predict(X_test)
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
classification_report_nb = classification_report(y_test, y_pred_nb)

In [9]:
print("Naive Bayes Confusion Matrix:\n", conf_matrix_nb)
print("Naive Bayes Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:\n", classification_report_nb)


Naive Bayes Confusion Matrix:
 [[  1 152]
 [  3  13]]
Naive Bayes Accuracy: 0.08284023668639054
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.01      0.01       153
           1       0.08      0.81      0.14        16

    accuracy                           0.08       169
   macro avg       0.16      0.41      0.08       169
weighted avg       0.23      0.08      0.03       169



In [10]:
# 4. Seleksi Fitur Menggunakan Information Gain
# Hitung Information Gain untuk setiap fitur
info_gain = mutual_info_classif(X, y)
info_gain_series = pd.Series(info_gain, index=X.columns).sort_values(ascending=False)


In [11]:
# Pilih fitur terbaik berdasarkan Information Gain
top_features = info_gain_series.head(5).index  # Misalnya, pilih 5 fitur teratas
X_top_features = X[top_features]

In [12]:
# Bagi ulang data dengan fitur terpilih
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(X_top_features, y, test_size=0.3, random_state=42)


In [18]:
# 5. Modeling dan Evaluasi dengan KNN dan Decision Tree
# Model KNN
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_top, y_train_top)
y_pred_knn = knn_model.predict(X_test_top)
accuracy_knn = accuracy_score(y_test_top, y_pred_knn)


In [19]:
# Model Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_top, y_train_top)
y_pred_dt = dt_model.predict(X_test_top)
accuracy_dt = accuracy_score(y_test_top, y_pred_dt)


In [20]:
# 6. Hasil dan Perbandingan Akurasi
print("Naive Bayes Accuracy:", accuracy_nb)
print("KNN Accuracy:", accuracy_knn)
print("Decision Tree Accuracy:", accuracy_dt)
print("\nInformation Gain per Feature:\n", info_gain_series)

Naive Bayes Accuracy: 0.08284023668639054
KNN Accuracy: 0.9053254437869822
Decision Tree Accuracy: 0.9230769230769231

Information Gain per Feature:
 RIW HIPERTENSI                      0.105071
RIW PE                              0.086013
JARAK KELAHIRAN                     0.015420
RIW HIPERTENSI/PE DALAM KELUARGA    0.011479
USIA                                0.000000
PARITAS                             0.000000
OBESITAS                            0.000000
RIW DM                              0.000000
SOSEK RENDAH                        0.000000
dtype: float64
