In [1]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

print("Library berhasil diimpor.")

Library berhasil diimpor.


In [2]:
# Ganti dengan kredensial database PostgreSQL Anda
DB_USER = "postgres"
DB_PASSWORD = "DBmiko"
DB_NAME = "dbexam"
DB_HOST = "localhost"
DB_PORT = "5432"

db_engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Kueri untuk mengambil data dan menghitung rata-rata skor historis per mahasiswa
sql_query = """
SELECT
    e.enroll_id,
    e.stu_id,
    s.gender,
    d.dept_name,
    c.course_name,
    ass.assessment_type,
    ass.score,
    a.attendance_percentage,
    -- Menghitung rata-rata skor semua assessment sebelumnya untuk mahasiswa ini
    AVG(ass.score) OVER (PARTITION BY e.stu_id ORDER BY e.semester_id, e.course_id) as historical_avg_score
FROM enrollment e
JOIN student s ON e.stu_id = s.stu_id
JOIN department d ON s.dept_id = d.dept_id
JOIN course c ON e.course_id = c.course_id
JOIN assessment ass ON e.enroll_id = ass.enroll_id
JOIN attendance a ON e.enroll_id = a.enroll_id;
"""
df = pd.read_sql_query(sql_query, db_engine)

print(f"Data berhasil dimuat. Jumlah baris: {len(df)}")
df.head()

Data berhasil dimuat. Jumlah baris: 6315


Unnamed: 0,enroll_id,stu_id,gender,dept_name,course_name,assessment_type,score,attendance_percentage,historical_avg_score
0,1,1,Female,Information Technology,Course 1,Final,100,65,79.0
1,1,1,Female,Information Technology,Course 1,Midterm,59,65,79.0
2,1,1,Female,Information Technology,Course 1,Project,78,65,79.0
3,2,1,Female,Information Technology,Course 2,Midterm,71,66,72.833333
4,2,1,Female,Information Technology,Course 2,Final,64,66,72.833333


In [3]:
print("Memulai Feature Engineering...")

# 1. Skor Lompatan Drastis
# Jika historical_avg_score nol (mahasiswa baru), lompatannya dianggap 0
df['score_jump'] = df['score'] - df['historical_avg_score'].fillna(df['score'])

# 2. Penyimpangan dari Rata-rata Kelas
# Hitung rata-rata skor per assessment di setiap course
class_avg = df.groupby(['course_name', 'assessment_type'])['score'].transform('mean')
class_std = df.groupby(['course_name', 'assessment_type'])['score'].transform('std').fillna(1)
df['z_score_deviation'] = (df['score'] - class_avg) / class_std

# 3. Fitur Biner: Nilai Tinggi, Kehadiran Rendah
df['high_score_low_attendance'] = ((df['score'] >= 90) & (df['attendance_percentage'] < 60)).astype(int)

# Pilih fitur-fitur yang akan digunakan untuk model
features_for_model = [
    'score',
    'attendance_percentage',
    'historical_avg_score',
    'score_jump',
    'z_score_deviation',
    'high_score_low_attendance'
]

X = df[features_for_model].fillna(0) # Isi nilai NaN yang mungkin ada dengan 0

print("Feature Engineering selesai.")
X.head()

Memulai Feature Engineering...
Feature Engineering selesai.


Unnamed: 0,score,attendance_percentage,historical_avg_score,score_jump,z_score_deviation,high_score_low_attendance
0,100,65,79.0,21.0,1.546735,0
1,59,65,79.0,-20.0,-0.527182,0
2,78,65,79.0,-1.0,0.460883,0
3,71,66,72.833333,-1.833333,0.093918,0
4,64,66,72.833333,-8.833333,-0.276212,0


In [4]:
# Inisialisasi Scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Inisialisasi dan latih model Isolation Forest
# contamination='auto' adalah pilihan yang baik untuk memulai
model = IsolationForest(contamination='auto', random_state=42)
model.fit(X_scaled)

print("Model berhasil dilatih.")

Model berhasil dilatih.


In [5]:
# Definisikan path
MODEL_PATH = '../../ml_models/miko_fraud_detection_model.pkl'
SCALER_PATH = '../../ml_models/miko_fraud_detection_scaler.pkl'

# Simpan model
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)
print(f"Model berhasil disimpan di: {MODEL_PATH}")

# Simpan scaler
with open(SCALER_PATH, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler berhasil disimpan di: {SCALER_PATH}")

Model berhasil disimpan di: ../../ml_models/miko_fraud_detection_model.pkl
Scaler berhasil disimpan di: ../../ml_models/miko_fraud_detection_scaler.pkl


In [6]:
print("📌 Contoh mahasiswa yang terdeteksi anomali:")
anomali_mahasiswa = df[df['anomaly_flag'] == -1]
display(anomali_mahasiswa[['stu_id', 'score', 'attendance_percentage', 'score_jump', 'z_score_deviation', 'anomaly_score_normalized']].sort_values(by='anomaly_score_normalized', ascending=True).head(10))


📌 Contoh mahasiswa yang terdeteksi anomali:


KeyError: 'anomaly_flag'