In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from pymongo import MongoClient

In [2]:
# --- 1. Koneksi ke MongoDB ---
client = MongoClient("mongodb://localhost:27017/")
db = client["staff_db"]
df = pd.DataFrame(list(db["cleaned_data"].find()))

# --- 2. Bersihkan Data ---
if '_id' in df.columns:
    df.drop('_id', axis=1, inplace=True)

df.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,3,1,1,1,3,28,3,5,0,0,85,1
1,5,1,0,0,1,58,4,25,1,0,67,1
2,2,1,0,1,1,22,3,3,1,0,83,1
3,0,1,1,1,1,29,3,6,1,0,94,1
4,0,0,1,1,1,35,5,10,1,0,47,1


In [3]:
# --- 3. Pisahkan fitur dan target ---
X = df.drop("is_promoted", axis=1)
y = df["is_promoted"]

In [4]:
# --- 4. Split dan normalisasi ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# --- 5. Inisialisasi model ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# --- 6. Training dan evaluasi semua model ---
for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))


Model: Logistic Regression
              precision    recall  f1-score   support

           0     0.7451    0.7143    0.7294       847
           1     0.7253    0.7553    0.7400       846

    accuracy                         0.7348      1693
   macro avg     0.7352    0.7348    0.7347      1693
weighted avg     0.7352    0.7348    0.7347      1693


Model: Random Forest
              precision    recall  f1-score   support

           0     0.8345    0.7320    0.7799       847
           1     0.7611    0.8546    0.8051       846

    accuracy                         0.7933      1693
   macro avg     0.7978    0.7933    0.7925      1693
weighted avg     0.7978    0.7933    0.7925      1693


Model: XGBoost
              precision    recall  f1-score   support

           0     0.8526    0.7580    0.8025       847
           1     0.7819    0.8688    0.8231       846

    accuracy                         0.8133      1693
   macro avg     0.8173    0.8134    0.8128      1693
weighted

In [5]:
# --- 5. Inisialisasi dan latih XGBoost ---
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train_scaled, y_train)

# --- 6. Evaluasi model ---
y_pred = model.predict(X_test_scaled)
print("\n=== Hasil Evaluasi XGBoost ===")
print(classification_report(y_test, y_pred, digits=4))


=== Hasil Evaluasi XGBoost ===
              precision    recall  f1-score   support

           0     0.8526    0.7580    0.8025       847
           1     0.7819    0.8688    0.8231       846

    accuracy                         0.8133      1693
   macro avg     0.8173    0.8134    0.8128      1693
weighted avg     0.8173    0.8133    0.8128      1693



In [6]:
print(list(X.columns))

['department', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']


In [7]:
# --- 7. Prediksi data baru ---
sample = np.array([[1, 1, 0, 1, 2, 20, 2, 5, 1, 0, 90]])
sample_df = pd.DataFrame(sample, columns=X.columns)
sample_scaled = scaler.transform(sample_df)
pred = model.predict(sample_scaled)

print("\nPrediksi:", "Di Promosi" if pred[0] == 1 else "Tidak Di Promosi")


Prediksi: Di Promosi


In [9]:
import joblib

joblib.dump(model, "model_svm.pkl")

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']