In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from pymongo import MongoClient

In [2]:
# --- 1. Koneksi ke MongoDB ---
client = MongoClient("mongodb://localhost:27017/")
db = client["staff_db"]
df = pd.DataFrame(list(db["cleaned_data"].find()))

# --- 2. Bersihkan Data ---
if '_id' in df.columns:
    df.drop('_id', axis=1, inplace=True)

df.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,0,1,0,0,1,35,5,3,1,0,50,1
1,1,0,0,0,1,42,5,11,1,0,60,1
2,0,1,0,1,1,40,3,2,1,0,90,1
3,5,1,0,1,1,27,4,4,1,0,72,1
4,2,1,0,1,2,34,5,8,0,0,88,1


In [3]:
# --- 3. Pisahkan fitur dan target ---
X = df.drop("is_promoted", axis=1)
y = df["is_promoted"]

In [4]:
# --- 4. Split dan normalisasi ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# --- 5. Inisialisasi model ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# --- 6. Training dan evaluasi semua model ---
for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))


Model: Logistic Regression
              precision    recall  f1-score   support

           0     0.7230    0.6951    0.7088       492
           1     0.7414    0.7665    0.7537       561

    accuracy                         0.7331      1053
   macro avg     0.7322    0.7308    0.7313      1053
weighted avg     0.7328    0.7331    0.7327      1053


Model: Random Forest
              precision    recall  f1-score   support

           0     0.8140    0.7114    0.7592       492
           1     0.7721    0.8574    0.8125       561

    accuracy                         0.7892      1053
   macro avg     0.7930    0.7844    0.7859      1053
weighted avg     0.7916    0.7892    0.7876      1053


Model: XGBoost
              precision    recall  f1-score   support

           0     0.8194    0.7195    0.7662       492
           1     0.7778    0.8610    0.8173       561

    accuracy                         0.7949      1053
   macro avg     0.7986    0.7902    0.7917      1053
weighted

In [5]:
# --- 5. Inisialisasi dan latih XGBoost ---
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train_scaled, y_train)

# --- 6. Evaluasi model ---
y_pred = model.predict(X_test_scaled)
print("\n=== Hasil Evaluasi XGBoost ===")
print(classification_report(y_test, y_pred, digits=4))


=== Hasil Evaluasi XGBoost ===
              precision    recall  f1-score   support

           0     0.8194    0.7195    0.7662       492
           1     0.7778    0.8610    0.8173       561

    accuracy                         0.7949      1053
   macro avg     0.7986    0.7902    0.7917      1053
weighted avg     0.7972    0.7949    0.7934      1053



In [6]:
print(list(X.columns))

['department', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']


In [8]:
# --- 7. Prediksi data baru ---
sample = np.array([[1, 1, 0, 1, 2, 20, 2, 5, 1, 0, 90]])
sample_df = pd.DataFrame(sample, columns=X.columns)
sample_scaled = scaler.transform(sample_df)
pred = model.predict(sample_scaled)

print("\nPrediksi:", "Di Promosi" if pred[0] == 1 else "Tidak Di Promosi")


Prediksi: Di Promosi


In [9]:
import joblib

joblib.dump(model, "model_xgboost.pkl")

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']