In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import re
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, balanced_accuracy_score,
    roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


warnings.filterwarnings('ignore')

# Set style untuk plotting
plt.style.use('default')
sns.set_palette("husl")

In [2]:
# Fungsi helper untuk basic info
def basic_info(df, filename):
    print(f"\n📊 {filename}")
    print("-" * 40)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Missing values:\n{df.isnull().sum()}")
    print(f"Memory usage: {df.memory_usage().sum() / 1024:.2f} KB")
    return df.head()

In [3]:
datasets = {}
file_names = {
    'absensi': 'MineToday Dataset/train/train_absensi.csv',
    'mini_project': 'MineToday Dataset/train/train_mini_project.csv', 
    'pendaftaran': 'MineToday Dataset/train/train_pendaftaran.csv',
    'pretest_ml': 'MineToday Dataset/train/train_pretest_ml.csv',
    'pretest_py': 'MineToday Dataset/train/train_pretest_py.csv',
    'pretest_st': 'MineToday Dataset/train/train_pretest_st.csv',
    'weekly_quiz': 'MineToday Dataset/train/train_weekly_quiz.csv'
}

In [4]:
def load_datasets():
    datasets = {}
    for name, path in file_names.items():
        try:
            datasets[name] = pd.read_csv(path)
            print(f"✅ {name}: {datasets[name].shape}")
        except FileNotFoundError:
            print(f"❌ {name}: File not found at {path}")
        except Exception as e:
            print(f"❌ {name}: Error loading - {e}")
    return datasets

In [5]:
# Analisis detail setiap dataset
for file_name, df in datasets.items():
    print(f"\n{'='*60}")
    sample_data = basic_info(df, file_name)
    print(f"\nSample data (first 3 rows):")
    print(sample_data.head(3))
    
    # Cek unique values untuk kolom kategorikal
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\n🏷️  Categorical columns unique values:")
        for col in categorical_cols[:5]:  # Limit to first 5 columns
            unique_vals = df[col].nunique()
            print(f"  {col}: {unique_vals} unique values")
            if unique_vals <= 10:
                print(f"    Values: {df[col].unique()[:10]}")

In [6]:
# Analisis ID peserta untuk join datasets
print(f"\n{'='*60}")
print("🔗 PARTICIPANT ID ANALYSIS")
print("-" * 40)

# Cari kolom yang mungkin berisi ID peserta
id_candidates = []
for file_name, df in datasets.items():
    for col in df.columns:
        if any(keyword in col.lower() for keyword in ['id', 'email', 'nama', 'timestamp']):
            id_candidates.append((file_name, col, df[col].nunique()))

print("Potential ID columns:")
for file_name, col, unique_count in id_candidates:
    print(f"  {file_name}: {col} ({unique_count} unique values)")



🔗 PARTICIPANT ID ANALYSIS
----------------------------------------
Potential ID columns:


In [7]:
# Cek timestamp patterns
print(f"\n⏰ TIMESTAMP ANALYSIS")
print("-" * 40)
for file_name, df in datasets.items():
    timestamp_cols = [col for col in df.columns if 'timestamp' in col.lower() or 'tanggal' in col.lower()]
    if timestamp_cols:
        print(f"\n{file_name}:")
        for col in timestamp_cols:
            print(f"  {col}: {df[col].dtype}")
            print(f"    Sample: {df[col].dropna().head(2).tolist()}")



⏰ TIMESTAMP ANALYSIS
----------------------------------------


In [8]:
# Summary statistics untuk numerical columns
print(f"\n{'='*60}")
print("📊 NUMERICAL COLUMNS SUMMARY")
print("-" * 40)

for file_name, df in datasets.items():
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        print(f"\n{file_name}:")
        print(df[numerical_cols].describe().round(2))


📊 NUMERICAL COLUMNS SUMMARY
----------------------------------------


In [9]:
# Pattern analysis untuk potential target creation
print(f"\n{'='*60}")
print("🎯 POTENTIAL TARGET PATTERNS")
print("-" * 40)

# Analisis completion patterns
completion_indicators = []
for file_name, df in datasets.items():
    if 'absensi' in file_name:
        print(f"\n{file_name}:")
        if 'Pertemuan ke' in df.columns or 'Pertemuan ke-' in df.columns:
            pertemuan_col = 'Pertemuan ke' if 'Pertemuan ke' in df.columns else 'Pertemuan ke-'
            print(f"  Column: {pertemuan_col}")
            print(f"  Data type: {df[pertemuan_col].dtype}")
            print(f"  Unique values: {sorted(df[pertemuan_col].dropna().unique())}")
            print(f"  Missing values: {df[pertemuan_col].isnull().sum()}")
            
            # Convert to numeric untuk cari max
            try:
                numeric_values = pd.to_numeric(df[pertemuan_col], errors='coerce')
                max_pertemuan = numeric_values.max()
                print(f"  Max pertemuan: {max_pertemuan}")
                print(f"  Min pertemuan: {numeric_values.min()}")
                print(f"  Total attendances: {len(df)}")
            except Exception as e:
                print(f"  Error processing pertemuan: {e}")
            
    elif 'quiz' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total quiz records: {df.shape[0]}")
        # Cek jika ada kolom score/nilai
        score_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['score', 'nilai', 'point'])]
        if score_cols:
            for col in score_cols[:3]:  # Limit to first 3 score columns
                print(f"  {col}: mean={df[col].mean():.2f}, std={df[col].std():.2f}")
        
    elif 'mini_project' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total project submissions: {df.shape[0]}")
        # Cek jika ada link submissions
        link_cols = [col for col in df.columns if 'link' in col.lower() or 'url' in col.lower()]
        if link_cols:
            for col in link_cols:
                non_empty = df[col].dropna().shape[0]
                print(f"  {col}: {non_empty} non-empty submissions")
        
    elif 'pretest' in file_name:
        test_type = file_name.split('_')[-1].replace('.csv', '').upper()
        print(f"\n{file_name}:")
        print(f"  Pretest {test_type} participants: {df.shape[0]}")
        # Cek score columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            for col in numeric_cols[:3]:  # First 3 numeric columns
                print(f"  {col}: mean={df[col].mean():.2f}, range=[{df[col].min():.1f}, {df[col].max():.1f}]")
    
    elif 'pendaftaran' in file_name:
        print(f"\n{file_name}:")
        print(f"  Total registrations: {df.shape[0]}")
        if 'Status' in df.columns:
            status_counts = df['Status'].value_counts()
            print(f"  Status distribution: {dict(status_counts)}")


🎯 POTENTIAL TARGET PATTERNS
----------------------------------------


In [10]:
print(f"\n📈 CROSS-DATASET ANALYSIS")
print("-" * 40)

# Coba identifikasi common participants
participant_counts = {}
for file_name, df in datasets.items():
    # Cari kolom yang mungkin identifier
    for col in df.columns:
        if 'email' in col.lower() or 'nama' in col.lower():
            unique_participants = df[col].nunique()
            participant_counts[f"{file_name}_{col}"] = unique_participants
            print(f"{file_name} - {col}: {unique_participants} unique participants")

if participant_counts:
    print(f"\nParticipant overlap analysis needed for joining datasets")


📈 CROSS-DATASET ANALYSIS
----------------------------------------


In [11]:
print("🔄 DATA INTEGRATION")
print("="*50)

def parse_score(score_str):
    """Convert '80 / 100' to 0.8"""
    if pd.isna(score_str):
        return np.nan
    if isinstance(score_str, str) and '/' in score_str:
        try:
            num, den = score_str.split('/')
            return float(num.strip()) / float(den.strip())
        except:
            return np.nan
    return score_str

🔄 DATA INTEGRATION


In [12]:
def extract_meeting_number(meeting_str):
    """Extract number from 'Pertemuan 12'"""
    if pd.isna(meeting_str):
        return np.nan
    if isinstance(meeting_str, str):
        match = re.search(r'\d+', meeting_str)
        return int(match.group()) if match else np.nan
    return meeting_str

In [13]:
print("🛠️ FEATURE ENGINEERING")
print("="*50)


🛠️ FEATURE ENGINEERING


In [14]:
def create_attendance_features(df_absensi):
    print("\n📊 Processing Attendance Data...")
    
    # Parse meeting numbers dari 'Pertemuan ke' column
    df_absensi['meeting_num'] = df_absensi['Pertemuan ke'].apply(extract_meeting_number)
    
    # Aggregate per participant
    attendance_stats = df_absensi.groupby('id').agg({
        'meeting_num': ['count', 'max', 'min'],
        'Kualitas materi ': 'mean',
        'Kualitas trainer ': 'mean'
    }).round(3)
    
    # Flatten column names
    attendance_stats.columns = [
        'total_meetings_attended', 'highest_meeting', 'first_meeting',
        'avg_material_quality', 'avg_trainer_quality'
    ]
    
    # Calculate key metrics
    MAX_MEETINGS = 30  # Assuming 30 total meetings
    attendance_stats['attendance_rate'] = (
        attendance_stats['total_meetings_attended'] / MAX_MEETINGS
    ).clip(0, 1).round(3)
    
    # Engagement score (high attendance + good ratings)
    attendance_stats['engagement_score'] = (
        attendance_stats['attendance_rate'] * 0.6 + 
        (attendance_stats['avg_material_quality'] / 5) * 0.2 +
        (attendance_stats['avg_trainer_quality'] / 5) * 0.2
    ).round(3)
    
    return attendance_stats.reset_index()

In [15]:
def create_assessment_features(datasets):
    print("\n📊 Processing Assessment Scores...")
    
    assessment_features = []
    
    # Process each pretest
    for test_name in ['pretest_ml', 'pretest_py', 'pretest_st']:
        df = datasets[test_name].copy()
        
        # Parse scores
        df['score_normalized'] = df['Score'].apply(parse_score)
        
        # Get best attempt per participant
        best_scores = df.groupby('id')['score_normalized'].max().reset_index()
        best_scores.columns = ['id', f'{test_name}_best_score']
        
        assessment_features.append(best_scores)
    
    # Merge all assessment scores
    combined_assessments = assessment_features[0]
    for df in assessment_features[1:]:
        combined_assessments = combined_assessments.merge(df, on='id', how='outer')
    
    # Calculate aggregate metrics
    score_cols = ['pretest_ml_best_score', 'pretest_py_best_score', 'pretest_st_best_score']
    
    combined_assessments['avg_pretest_score'] = (
        combined_assessments[score_cols].mean(axis=1, skipna=True).round(3)
    )
    
    combined_assessments['assessment_completion_rate'] = (
        combined_assessments[score_cols].notna().sum(axis=1) / len(score_cols)
    ).round(3)
    
    # Performance category
    combined_assessments['performance_level'] = pd.cut(
        combined_assessments['avg_pretest_score'], 
        bins=[0, 0.5, 0.7, 0.85, 1.0], 
        labels=['Low', 'Medium', 'High', 'Excellent']
    )
    
    return combined_assessments

In [16]:
def create_submission_features(datasets):
    print("\n📊 Processing Submission Data...")
    
    submission_data = []
    
    # Mini project submissions
    mini_project = datasets['mini_project'].groupby('id').size().reset_index()
    mini_project.columns = ['id', 'mini_project_count']
    mini_project['has_mini_project'] = (mini_project['mini_project_count'] > 0).astype(int)
    submission_data.append(mini_project[['id', 'has_mini_project']])
    
    # Weekly quiz submissions
    weekly_quiz = datasets['weekly_quiz'].groupby('id').size().reset_index()
    weekly_quiz.columns = ['id', 'quiz_submissions']
    weekly_quiz['has_weekly_quiz'] = (weekly_quiz['quiz_submissions'] > 0).astype(int)
    submission_data.append(weekly_quiz[['id', 'has_weekly_quiz']])
    
    # Combine submission features
    combined_submissions = submission_data[0]
    for df in submission_data[1:]:
        combined_submissions = combined_submissions.merge(df, on='id', how='outer')
    
    # Fill missing values
    combined_submissions = combined_submissions.fillna(0)
    
    # Overall submission rate
    combined_submissions['total_submissions'] = (
        combined_submissions['has_mini_project'] + 
        combined_submissions['has_weekly_quiz']
    )
    
    return combined_submissions

In [17]:
def create_registration_features(df_registration):
    print("\n📊 Processing Registration Data...")
    
    reg_features = df_registration[['id', 'Status', 'Pilihan Jadwal Kelas']].copy()
    
    # Encode status
    status_mapping = {
        'Mahasiswa': 1, 'Fresh Graduates': 2, 
        'Pekerja aktif': 3, 'Umum': 4
    }
    reg_features['status_encoded'] = reg_features['Status'].map(status_mapping)
    
    # Extract batch info
    reg_features['batch'] = reg_features['Pilihan Jadwal Kelas'].str.extract(r'Batch (\d+)')
    reg_features['batch'] = pd.to_numeric(reg_features['batch'], errors='coerce')
    
    return reg_features[['id', 'status_encoded', 'batch']]

In [18]:
# Execute feature engineering
datasets = load_datasets()

# Get all unique participant IDs (missing from script 1)
print(f"\n📋 Unique participants per dataset:")
all_participant_ids = set()
for name, df in datasets.items():
    unique_ids = df['id'].nunique()
    print(f"  {name}: {unique_ids} unique IDs")
    all_participant_ids.update(df['id'].unique())

print(f"\n👥 Total unique participants across all datasets: {len(all_participant_ids)}")

# Create master participant table
master_df = pd.DataFrame({'id': list(all_participant_ids)})
print(f"✅ Master participant table created with {len(master_df)} participants")

attendance_features = create_attendance_features(datasets['absensi'])
assessment_features = create_assessment_features(datasets)
submission_features = create_submission_features(datasets)
registration_features = create_registration_features(datasets['pendaftaran'])

✅ absensi: (11714, 12)
✅ mini_project: (468, 5)
✅ pendaftaran: (492, 9)
✅ pretest_ml: (502, 14)
✅ pretest_py: (544, 14)
✅ pretest_st: (500, 19)
✅ weekly_quiz: (487, 5)

📋 Unique participants per dataset:
  absensi: 509 unique IDs
  mini_project: 468 unique IDs
  pendaftaran: 492 unique IDs
  pretest_ml: 494 unique IDs
  pretest_py: 526 unique IDs
  pretest_st: 497 unique IDs
  weekly_quiz: 483 unique IDs

👥 Total unique participants across all datasets: 549
✅ Master participant table created with 549 participants

📊 Processing Attendance Data...

📊 Processing Assessment Scores...

📊 Processing Submission Data...

📊 Processing Registration Data...


In [19]:
print(f"\n🔄 Merging all features...")
final_features = master_df.copy()

# Merge each feature set
feature_sets = [
    ('attendance', attendance_features),
    ('assessment', assessment_features), 
    ('submission', submission_features),
    ('registration', registration_features)
]

for name, features in feature_sets:
    before_count = len(final_features)
    final_features = final_features.merge(features, on='id', how='left')
    after_count = len(final_features)
    print(f"  ✅ Merged {name}: {before_count} → {after_count} rows")

print(f"\n🎯 Final feature matrix: {final_features.shape}")
print(f"Columns: {list(final_features.columns)}")

# Show sample of final features
print(f"\n📊 Sample of engineered features:")
print(final_features.head())


🔄 Merging all features...
  ✅ Merged attendance: 549 → 549 rows
  ✅ Merged assessment: 549 → 549 rows
  ✅ Merged submission: 549 → 549 rows
  ✅ Merged registration: 549 → 549 rows

🎯 Final feature matrix: (549, 19)
Columns: ['id', 'total_meetings_attended', 'highest_meeting', 'first_meeting', 'avg_material_quality', 'avg_trainer_quality', 'attendance_rate', 'engagement_score', 'pretest_ml_best_score', 'pretest_py_best_score', 'pretest_st_best_score', 'avg_pretest_score', 'assessment_completion_rate', 'performance_level', 'has_mini_project', 'has_weekly_quiz', 'total_submissions', 'status_encoded', 'batch']

📊 Sample of engineered features:
                                     id  total_meetings_attended  \
0  366c8261-243e-4eab-9695-6df81979597a                     18.0   
1  84d29b44-26c6-4b05-8a67-31831d35be32                     28.0   
2  e0766991-32fc-4294-b1e2-a0dff52c64f2                     23.0   
3  a59fa924-4733-471c-8754-f6595cd0e6f1                     27.0   
4  ab9d4c1f

In [20]:
print("🎯 TARGET CREATION")
print("="*50)

# Fill missing values untuk target creation
print("\n🔧 Preprocessing for target creation...")

# Select features for target creation
target_features = [
    'attendance_rate', 'highest_meeting', 'total_meetings_attended',
    'avg_pretest_score', 'assessment_completion_rate', 
    'total_submissions', 'engagement_score'
]

# Create a subset with target features
target_df = final_features[['id'] + target_features].copy()

# Impute missing values
imputer = SimpleImputer(strategy='median')
target_df[target_features] = imputer.fit_transform(target_df[target_features])

print(f"✅ Target features prepared for {len(target_df)} participants")

🎯 TARGET CREATION

🔧 Preprocessing for target creation...
✅ Target features prepared for 549 participants


In [21]:
# ============================================================================
# METHOD 1: RULE-BASED LABELING
# ============================================================================

def create_rule_based_labels(df):
    print("\n📋 Method 1: Rule-Based Labeling")
    print("-" * 40)
    
    # Define graduation criteria
    conditions = {
        'high_attendance': df['attendance_rate'] >= 0.75,  # Attended 75%+ meetings
        'reached_advanced': df['highest_meeting'] >= 20,   # Reached meeting 20+
        'good_performance': df['avg_pretest_score'] >= 0.6, # Average score 60%+
        'active_submission': df['total_submissions'] >= 1,  # Submitted at least 1 project
        'high_engagement': df['engagement_score'] >= 0.7   # Overall engagement 70%+
    }
    
    # Calculate completion score
    df['completion_score'] = (
        conditions['high_attendance'].astype(int) * 0.35 +      # Attendance weight
        conditions['reached_advanced'].astype(int) * 0.25 +     # Progress weight  
        conditions['good_performance'].astype(int) * 0.2 +      # Performance weight
        conditions['active_submission'].astype(int) * 0.1 +     # Submission weight
        conditions['high_engagement'].astype(int) * 0.1         # Engagement weight
    )
    
    # Create binary labels
    GRADUATION_THRESHOLD = 0.6  # Need 60% overall score to graduate
    df['graduated_rule'] = (df['completion_score'] >= GRADUATION_THRESHOLD).astype(int)
    
    # Show criteria breakdown
    print("Graduation Criteria:")
    for criterion, condition in conditions.items():
        count = condition.sum()
        pct = count / len(df) * 100
        print(f"  {criterion}: {count} participants ({pct:.1f}%)")
    
    graduate_count = df['graduated_rule'].sum()
    print(f"\n🎓 Rule-based graduation rate: {graduate_count}/{len(df)} ({graduate_count/len(df)*100:.1f}%)")
    
    return df

In [22]:
# ============================================================================
# METHOD 2: CLUSTERING-BASED LABELING  
# ============================================================================

def create_cluster_based_labels(df):
    print("\n🔍 Method 2: Clustering-Based Labeling")
    print("-" * 40)
    
    # Prepare features for clustering
    cluster_features = [
        'attendance_rate', 'avg_pretest_score', 'total_submissions', 
        'engagement_score', 'highest_meeting'
    ]
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[cluster_features])
    
    # Apply K-means clustering (k=3: low, medium, high performers)
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    df['cluster'] = kmeans.fit_predict(X_scaled)
    
    # Analyze clusters to identify high performers
    cluster_analysis = df.groupby('cluster')[cluster_features].mean().round(3)
    print("Cluster Analysis:")
    print(cluster_analysis)
    
    # Identify the "high performer" cluster (highest attendance + performance)
    cluster_scores = cluster_analysis['attendance_rate'] + cluster_analysis['avg_pretest_score']
    high_performer_cluster = cluster_scores.idxmax()
    
    # Create binary labels (high performer cluster = graduated)
    df['graduated_cluster'] = (df['cluster'] == high_performer_cluster).astype(int)
    
    cluster_counts = df['cluster'].value_counts().sort_index()
    print(f"\nCluster distribution:")
    for cluster, count in cluster_counts.items():
        pct = count / len(df) * 100
        status = "HIGH PERFORMER" if cluster == high_performer_cluster else "REGULAR"
        print(f"  Cluster {cluster}: {count} participants ({pct:.1f}%) - {status}")
    
    graduate_count = df['graduated_cluster'].sum()
    print(f"\n🎓 Cluster-based graduation rate: {graduate_count}/{len(df)} ({graduate_count/len(df)*100:.1f}%)")
    
    return df

In [23]:
# ============================================================================
# EXECUTE TARGET CREATION
# ============================================================================

# Create labels using both methods
target_df = create_rule_based_labels(target_df)
target_df = create_cluster_based_labels(target_df)


📋 Method 1: Rule-Based Labeling
----------------------------------------
Graduation Criteria:
  high_attendance: 320 participants (58.3%)
  reached_advanced: 465 participants (84.7%)
  good_performance: 527 participants (96.0%)
  active_submission: 549 participants (100.0%)
  high_engagement: 518 participants (94.4%)

🎓 Rule-based graduation rate: 436/549 (79.4%)

🔍 Method 2: Clustering-Based Labeling
----------------------------------------
Cluster Analysis:
         attendance_rate  avg_pretest_score  total_submissions  \
cluster                                                          
0                  0.909              0.725              1.945   
1                  0.698              0.781              1.975   
2                  0.000              0.784              1.828   

         engagement_score  highest_meeting  
cluster                                     
0                   0.906           27.326  
1                   0.779           21.042  
2                   0.36

In [24]:
# ============================================================================
# LABEL VALIDATION & COMPARISON
# ============================================================================

print("\n🔍 LABEL VALIDATION")
print("="*50)

# Compare the two labeling methods
agreement = (target_df['graduated_rule'] == target_df['graduated_cluster']).sum()
agreement_rate = agreement / len(target_df) * 100

print(f"Agreement between methods: {agreement}/{len(target_df)} ({agreement_rate:.1f}%)")

# Cross-tabulation
crosstab = pd.crosstab(
    target_df['graduated_rule'], 
    target_df['graduated_cluster'], 
    margins=True
)
print(f"\nCross-tabulation:")
print(crosstab)


🔍 LABEL VALIDATION
Agreement between methods: 349/549 (63.6%)

Cross-tabulation:
graduated_cluster    0    1  All
graduated_rule                  
0                  113    0  113
1                  200  236  436
All                313  236  549


In [25]:
# ============================================================================
# FINAL TARGET SELECTION
# ============================================================================

print(f"\n🎯 FINAL TARGET SELECTION")
print("-" * 30)

# Use ensemble approach: agree on both methods OR high completion score
target_df['graduated_final'] = (
    (target_df['graduated_rule'] == 1) & 
    (target_df['graduated_cluster'] == 1)
).astype(int)

# For cases where methods disagree, use completion score as tiebreaker
disagreement_mask = target_df['graduated_rule'] != target_df['graduated_cluster']
high_score_mask = target_df['completion_score'] >= 0.65

target_df.loc[disagreement_mask & high_score_mask, 'graduated_final'] = 1

final_graduate_count = target_df['graduated_final'].sum()
final_rate = final_graduate_count / len(target_df) * 100

print(f"Final graduation rate: {final_graduate_count}/{len(target_df)} ({final_rate:.1f}%)")

# Show final target distribution
print(f"\nFinal Target Distribution:")
print(f"  Graduated (1): {final_graduate_count} participants")
print(f"  Not Graduated (0): {len(target_df) - final_graduate_count} participants")

# Class balance check
if final_rate < 30 or final_rate > 70:
    print(f"⚠️  Class imbalance detected! Consider SMOTE for modeling.")
else:
    print(f"✅ Reasonable class balance for modeling.")

# Merge final target back to feature matrix
final_features_with_target = final_features.merge(
    target_df[['id', 'graduated_final', 'completion_score']], 
    on='id', 
    how='left'
)

print(f"\n🎉 FINAL DATASET READY!")
print(f"Shape: {final_features_with_target.shape}")
print(f"Features: {final_features_with_target.shape[1] - 1}")  # -1 for target
print(f"Target column: 'graduated_final'")

# Save the final dataset
# final_features_with_target.to_csv('bootcamp_final_dataset.csv', index=False)
# print(f"💾 Dataset saved as 'bootcamp_final_dataset.csv'")

# Show sample of final dataset
print(f"\n📊 Final Dataset Sample:")
sample_cols = ['id', 'attendance_rate', 'avg_pretest_score', 'total_submissions', 'graduated_final']
print(final_features_with_target[sample_cols].head(10))


🎯 FINAL TARGET SELECTION
------------------------------
Final graduation rate: 436/549 (79.4%)

Final Target Distribution:
  Graduated (1): 436 participants
  Not Graduated (0): 113 participants
⚠️  Class imbalance detected! Consider SMOTE for modeling.

🎉 FINAL DATASET READY!
Shape: (549, 21)
Features: 20
Target column: 'graduated_final'

📊 Final Dataset Sample:
                                     id  attendance_rate  avg_pretest_score  \
0  366c8261-243e-4eab-9695-6df81979597a            0.600              0.800   
1  84d29b44-26c6-4b05-8a67-31831d35be32            0.933              0.783   
2  e0766991-32fc-4294-b1e2-a0dff52c64f2            0.767              0.767   
3  a59fa924-4733-471c-8754-f6595cd0e6f1            0.900              0.683   
4  ab9d4c1f-1773-4f12-88e1-ecd78f51e309            1.000              0.867   
5  f29b1b86-ab96-4876-889d-70b5955a01c7            0.900              0.600   
6  e62e4b0c-e929-48f4-ad21-f172df57e747            0.767              0.833   
7

In [28]:
# Prepare features for modeling
feature_cols = [col for col in final_features_with_target.columns 
                if col not in ['id', 'graduated_final', 'completion_score', 'performance_level']]

X = final_features_with_target[feature_cols]
y = final_features_with_target['graduated_final']

# Handle missing values
X = X.fillna(X.median())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE untuk handle class imbalance
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_balanced, y_train_balanced)

# Predict & evaluate
y_pred = rf.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)

In [29]:
# Check which features are most important
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

Top 10 Most Important Features:
                       feature  importance
0      total_meetings_attended    0.341385
5              attendance_rate    0.325185
6             engagement_score    0.164568
1              highest_meeting    0.119839
4          avg_trainer_quality    0.014244
3         avg_material_quality    0.009448
7        pretest_ml_best_score    0.007874
11  assessment_completion_rate    0.007718
8        pretest_py_best_score    0.003823
10           avg_pretest_score    0.003274


In [30]:
# More conservative target (only if both methods agree)
final_features_with_target['graduated_conservative'] = (
    (target_df['graduated_rule'] == 1) & 
    (target_df['graduated_cluster'] == 1)
).astype(int)

# This would give ~43% graduation rate (more balanced)

In [32]:
print("🤖 MODEL BUILDING & EVALUATION")
print("="*70)

# ============================================================================
# 1. DATA PREPARATION
# ============================================================================
print("\n📊 Step 1: Data Preparation")
print("-" * 40)

# Prepare features (exclude non-predictive columns)
exclude_cols = ['id', 'graduated_final', 'completion_score', 'performance_level']
feature_cols = [col for col in final_features_with_target.columns if col not in exclude_cols]

print(f"Available features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Prepare X and y
X = final_features_with_target[feature_cols].copy()
y = final_features_with_target['graduated_final'].copy()

# Handle missing values
print(f"\nMissing values before imputation:")
print(X.isnull().sum().sum())

# Simple imputation strategy
for col in X.columns:
    if X[col].dtype in ['float64', 'int64']:
        X[col] = X[col].fillna(X[col].median())
    else:
        X[col] = X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else 'Unknown')

print(f"Missing values after imputation: {X.isnull().sum().sum()}")

# Check target distribution
target_dist = y.value_counts().sort_index()
print(f"\nTarget distribution:")
for label, count in target_dist.items():
    pct = count / len(y) * 100
    status = "Graduated" if label == 1 else "Not Graduated"
    print(f"  {status} ({label}): {count} ({pct:.1f}%)")

🤖 MODEL BUILDING & EVALUATION

📊 Step 1: Data Preparation
----------------------------------------
Available features: 18
Features: ['total_meetings_attended', 'highest_meeting', 'first_meeting', 'avg_material_quality', 'avg_trainer_quality', 'attendance_rate', 'engagement_score', 'pretest_ml_best_score', 'pretest_py_best_score', 'pretest_st_best_score', 'avg_pretest_score', 'assessment_completion_rate', 'has_mini_project', 'has_weekly_quiz', 'total_submissions', 'status_encoded', 'batch', 'graduated_conservative']

Missing values before imputation:
925
Missing values after imputation: 0

Target distribution:
  Not Graduated (0): 113 (20.6%)
  Graduated (1): 436 (79.4%)


In [33]:
print(f"\n🔄 Step 2: Train-Test Split")
print("-" * 40)

# Stratified split to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Check class distribution in splits
train_dist = y_train.value_counts(normalize=True).sort_index()
test_dist = y_test.value_counts(normalize=True).sort_index()

print(f"\nClass distribution:")
print(f"  Train: Not Graduated={train_dist[0]:.3f}, Graduated={train_dist[1]:.3f}")
print(f"  Test:  Not Graduated={test_dist[0]:.3f}, Graduated={test_dist[1]:.3f}")


🔄 Step 2: Train-Test Split
----------------------------------------
Training set: (439, 18)
Test set: (110, 18)

Class distribution:
  Train: Not Graduated=0.205, Graduated=0.795
  Test:  Not Graduated=0.209, Graduated=0.791


In [36]:
# ============================================================================
# 3. MODEL DEFINITIONS
# ============================================================================

print(f"\n🧠 Step 3: Model Definitions")
print("-" * 40)

# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

print(f"Models to evaluate: {list(models.keys())}")


🧠 Step 3: Model Definitions
----------------------------------------
Models to evaluate: ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'SVM']


In [40]:
# ============================================================================
# 4. CROSS-VALIDATION EVALUATION
# ============================================================================

print(f"\n📈 Step 4: Cross-Validation Evaluation")
print("-" * 40)

# Store results
cv_results = {}
cv_scores_balanced = {}

# Evaluate each model with and without SMOTE
for name, model in models.items():
    print(f"\n🔍 Evaluating {name}...")
    
    # Without SMOTE
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='balanced_accuracy')
    cv_scores_balanced[f"{name}"] = scores
    
    print(f"  Without SMOTE - Balanced Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    
    # With SMOTE (using pipeline to avoid data leakage)
    smote_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])
    
    scores_smote = cross_val_score(smote_pipeline, X_train, y_train, cv=5, scoring='balanced_accuracy')
    cv_scores_balanced[f"{name} + SMOTE"] = scores_smote
    
    print(f"  With SMOTE    - Balanced Accuracy: {scores_smote.mean():.3f} (+/- {scores_smote.std() * 2:.3f})")


📈 Step 4: Cross-Validation Evaluation
----------------------------------------

🔍 Evaluating Logistic Regression...
  Without SMOTE - Balanced Accuracy: 1.000 (+/- 0.000)
  With SMOTE    - Balanced Accuracy: 1.000 (+/- 0.000)

🔍 Evaluating Random Forest...
  Without SMOTE - Balanced Accuracy: 1.000 (+/- 0.000)
  With SMOTE    - Balanced Accuracy: 1.000 (+/- 0.000)

🔍 Evaluating Gradient Boosting...
  Without SMOTE - Balanced Accuracy: 1.000 (+/- 0.000)
  With SMOTE    - Balanced Accuracy: 1.000 (+/- 0.000)

🔍 Evaluating SVM...
  Without SMOTE - Balanced Accuracy: 0.872 (+/- 0.227)
  With SMOTE    - Balanced Accuracy: 0.947 (+/- 0.015)


In [41]:
# ============================================================================
# 5. BEST MODEL SELECTION & TRAINING
# ============================================================================

print(f"\n🏆 Step 5: Best Model Selection")
print("-" * 40)

# Find best performing combination
best_score = 0
best_config = None

for config_name, scores in cv_scores_balanced.items():
    mean_score = scores.mean()
    if mean_score > best_score:
        best_score = mean_score
        best_config = config_name

print(f"Best configuration: {best_config}")
print(f"Best CV balanced accuracy: {best_score:.3f}")

# Train the best model
use_smote = 'SMOTE' in best_config
model_name = best_config.replace(' + SMOTE', '')
best_model = models[model_name]

if use_smote:
    # Apply SMOTE to training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    print(f"\nSMOTE applied:")
    print(f"  Before: {X_train.shape[0]} samples")
    print(f"  After:  {X_train_resampled.shape[0]} samples")
    
    # Train on resampled data
    best_model.fit(X_train_resampled, y_train_resampled)
else:
    # Train on original data
    best_model.fit(X_train, y_train)


🏆 Step 5: Best Model Selection
----------------------------------------
Best configuration: Logistic Regression
Best CV balanced accuracy: 1.000


In [44]:
# ============================================================================
# 6. MODEL EVALUATION ON TEST SET
# ============================================================================

print(f"\n📊 Step 6: Test Set Evaluation")
print("-" * 40)

# Make predictions
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics
balanced_acc = balanced_accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Test Set Performance:")
print(f"  Balanced Accuracy: {balanced_acc:.3f}")
print(f"  ROC AUC: {roc_auc:.3f}")

# Detailed classification report
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Graduated', 'Graduated']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:")
print(f"                 Predicted")
print(f"               0     1")
print(f"Actual   0    {cm[0,0]:3d}   {cm[0,1]:3d}")
print(f"         1    {cm[1,0]:3d}   {cm[1,1]:3d}")


📊 Step 6: Test Set Evaluation
----------------------------------------
Test Set Performance:
  Balanced Accuracy: 1.000
  ROC AUC: 1.000

Classification Report:
               precision    recall  f1-score   support

Not Graduated       1.00      1.00      1.00        23
    Graduated       1.00      1.00      1.00        87

     accuracy                           1.00       110
    macro avg       1.00      1.00      1.00       110
 weighted avg       1.00      1.00      1.00       110


Confusion Matrix:
                 Predicted
               0     1
Actual   0     23     0
         1      0    87


In [45]:
# ============================================================================
# 7. FEATURE IMPORTANCE ANALYSIS
# ============================================================================

print(f"\n🎯 Step 7: Feature Importance Analysis")
print("-" * 40)

# Get feature importance (for tree-based models)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 Most Important Features:")
    for i, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']:<25}: {row['importance']:.3f}")
        
    # Store for visualization
    top_features = feature_importance.head(15)
    
elif hasattr(best_model, 'coef_'):
    # For linear models, use coefficient magnitude
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': abs(best_model.coef_[0])
    }).sort_values('importance', ascending=False)
    
    print("Top 10 Most Important Features (by coefficient magnitude):")
    for i, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']:<25}: {row['importance']:.3f}")
        
    top_features = feature_importance.head(15)



🎯 Step 7: Feature Importance Analysis
----------------------------------------
Top 10 Most Important Features (by coefficient magnitude):
  total_meetings_attended  : 2.549
  highest_meeting          : 2.549
  status_encoded           : 0.274
  pretest_ml_best_score    : 0.219
  attendance_rate          : 0.087
  avg_pretest_score        : 0.073
  has_mini_project         : 0.066
  total_submissions        : 0.062
  engagement_score         : 0.053
  batch                    : 0.051


In [46]:
# ============================================================================
# 8. MODEL INSIGHTS & RECOMMENDATIONS
# ============================================================================

print(f"\n💡 Step 8: Model Insights & Recommendations")
print("-" * 40)

print(f"Model Performance Summary:")
print(f"  ✅ Best Model: {best_config}")
print(f"  ✅ Balanced Accuracy: {balanced_acc:.3f}")
print(f"  ✅ ROC AUC: {roc_auc:.3f}")

# Performance interpretation
if balanced_acc >= 0.8:
    performance = "Excellent"
elif balanced_acc >= 0.7:
    performance = "Good"
elif balanced_acc >= 0.6:
    performance = "Fair"
else:
    performance = "Poor"

print(f"  📈 Performance Rating: {performance}")

# Business insights
print(f"\n🎯 Business Insights:")
if 'attendance_rate' in feature_importance.head(5)['feature'].values:
    print(f"  • Attendance rate is a key predictor of graduation")
if 'avg_pretest_score' in feature_importance.head(5)['feature'].values:
    print(f"  • Academic performance (pretest scores) strongly indicates success")
if 'engagement_score' in feature_importance.head(5)['feature'].values:
    print(f"  • Student engagement level is crucial for completion")

print(f"\n📋 Recommendations for Model Deployment:")
print(f"  1. Use {best_config} for production predictions")
print(f"  2. Monitor key features: {', '.join(feature_importance.head(3)['feature'].values)}")
print(f"  3. Set prediction threshold based on business needs")
print(f"  4. Regularly retrain model with new data")

if use_smote:
    print(f"  5. Apply SMOTE for handling class imbalance in future training")

print(f"\n✅ Model building completed successfully!")


💡 Step 8: Model Insights & Recommendations
----------------------------------------
Model Performance Summary:
  ✅ Best Model: Logistic Regression
  ✅ Balanced Accuracy: 1.000
  ✅ ROC AUC: 1.000
  📈 Performance Rating: Excellent

🎯 Business Insights:
  • Attendance rate is a key predictor of graduation

📋 Recommendations for Model Deployment:
  1. Use Logistic Regression for production predictions
  2. Monitor key features: total_meetings_attended, highest_meeting, status_encoded
  3. Set prediction threshold based on business needs
  4. Regularly retrain model with new data

✅ Model building completed successfully!


In [47]:
# ============================================================================
# 9. SAVE MODEL RESULTS
# ============================================================================

# Create results summary
model_results = {
    'best_model': best_config,
    'balanced_accuracy': balanced_acc,
    'roc_auc': roc_auc,
    'feature_importance': feature_importance.to_dict('records'),
    'confusion_matrix': cm.tolist(),
    'use_smote': use_smote
}

print(f"\n💾 Model results ready for saving...")
print(f"   Results stored in 'model_results' dictionary")
print(f"   Trained model available as 'best_model' object")


💾 Model results ready for saving...
   Results stored in 'model_results' dictionary
   Trained model available as 'best_model' object


In [48]:
print("🔮 TEST DATA PROCESSING & PREDICTION")
print("="*70)

# ============================================================================
# 1. LOAD TEST DATASETS
# ============================================================================

print("\n📂 Step 1: Loading Test Datasets")
print("-" * 40)

# Test file mapping
test_file_names = {
    'absensi': 'MineToday Dataset/test/test_absensi.csv',
    'mini_project': 'MineToday Dataset/test/test_mini_project.csv', 
    'pendaftaran': 'MineToday Dataset/test/test_pendaftaran.csv',
    'pretest_ml': 'MineToday Dataset/test/test_pretest_ml.csv',
    'pretest_py': 'MineToday Dataset/test/test_pretest_py.csv',
    'pretest_st': 'MineToday Dataset/test/test_pretest_st.csv',
    'weekly_quiz': 'MineToday Dataset/test/test_weekly_quiz.csv'
}

def load_test_datasets():
    test_datasets = {}
    for name, path in test_file_names.items():
        try:
            test_datasets[name] = pd.read_csv(path)
            print(f"✅ {name}: {test_datasets[name].shape}")
        except FileNotFoundError:
            print(f"❌ {name}: File not found at {path}")
        except Exception as e:
            print(f"❌ {name}: Error loading - {e}")
    return test_datasets

# Load test data
test_datasets = load_test_datasets()

# Get unique test participant IDs
print(f"\n📋 Unique participants per test dataset:")
test_participant_ids = set()
for name, df in test_datasets.items():
    if 'id' in df.columns:
        unique_ids = df['id'].nunique()
        print(f"  {name}: {unique_ids} unique IDs")
        test_participant_ids.update(df['id'].unique())

print(f"\n👥 Total unique test participants: {len(test_participant_ids)}")

# Create master test participant table
test_master_df = pd.DataFrame({'id': list(test_participant_ids)})
print(f"✅ Test master table created with {len(test_master_df)} participants")

🔮 TEST DATA PROCESSING & PREDICTION

📂 Step 1: Loading Test Datasets
----------------------------------------
✅ absensi: (760, 12)
✅ mini_project: (22, 5)
✅ pendaftaran: (21, 9)
✅ pretest_ml: (32, 14)
✅ pretest_py: (47, 14)
✅ pretest_st: (33, 19)
✅ weekly_quiz: (25, 5)

📋 Unique participants per test dataset:
  absensi: 48 unique IDs
  mini_project: 22 unique IDs
  pendaftaran: 21 unique IDs
  pretest_ml: 27 unique IDs
  pretest_py: 40 unique IDs
  pretest_st: 30 unique IDs
  weekly_quiz: 23 unique IDs

👥 Total unique test participants: 48
✅ Test master table created with 48 participants


In [50]:
#============================================================================
# 2. FEATURE ENGINEERING FOR TEST DATA (REUSE FUNCTIONS)
# ============================================================================

print(f"\n🛠️ Step 2: Feature Engineering for Test Data")
print("-" * 40)

# Reuse the same feature engineering functions from training
print("📊 Processing test attendance data...")
if 'absensi' in test_datasets:
    test_attendance_features = create_attendance_features(test_datasets['absensi'])
    print(f"  ✅ Test attendance features: {len(test_attendance_features)} participants")

print("📊 Processing test assessment data...")
test_assessment_features = create_assessment_features(test_datasets)
print(f"  ✅ Test assessment features: {len(test_assessment_features)} participants")

print("📊 Processing test submission data...")
test_submission_features = create_submission_features(test_datasets)
print(f"  ✅ Test submission features: {len(test_submission_features)} participants")

print("📊 Processing test registration data...")
if 'pendaftaran' in test_datasets:
    test_registration_features = create_registration_features(test_datasets['pendaftaran'])
    print(f"  ✅ Test registration features: {len(test_registration_features)} participants")


🛠️ Step 2: Feature Engineering for Test Data
----------------------------------------
📊 Processing test attendance data...

📊 Processing Attendance Data...
  ✅ Test attendance features: 48 participants
📊 Processing test assessment data...

📊 Processing Assessment Scores...
  ✅ Test assessment features: 42 participants
📊 Processing test submission data...

📊 Processing Submission Data...
  ✅ Test submission features: 26 participants
📊 Processing test registration data...

📊 Processing Registration Data...
  ✅ Test registration features: 21 participants


In [51]:
# ============================================================================
# 3. MERGE TEST FEATURES
# ============================================================================

print(f"\n🔄 Step 3: Merging Test Features")
print("-" * 40)

# Merge all test features
final_test_features = test_master_df.copy()

test_feature_sets = [
    ('attendance', test_attendance_features),
    ('assessment', test_assessment_features), 
    ('submission', test_submission_features),
    ('registration', test_registration_features)
]

for name, features in test_feature_sets:
    if features is not None and len(features) > 0:
        before_count = len(final_test_features)
        final_test_features = final_test_features.merge(features, on='id', how='left')
        after_count = len(final_test_features)
        print(f"  ✅ Merged test {name}: {before_count} → {after_count} rows")

print(f"\n🎯 Final test feature matrix: {final_test_features.shape}")
print(f"Test columns: {list(final_test_features.columns)}")


🔄 Step 3: Merging Test Features
----------------------------------------
  ✅ Merged test attendance: 48 → 48 rows
  ✅ Merged test assessment: 48 → 48 rows
  ✅ Merged test submission: 48 → 48 rows
  ✅ Merged test registration: 48 → 48 rows

🎯 Final test feature matrix: (48, 19)
Test columns: ['id', 'total_meetings_attended', 'highest_meeting', 'first_meeting', 'avg_material_quality', 'avg_trainer_quality', 'attendance_rate', 'engagement_score', 'pretest_ml_best_score', 'pretest_py_best_score', 'pretest_st_best_score', 'avg_pretest_score', 'assessment_completion_rate', 'performance_level', 'has_mini_project', 'has_weekly_quiz', 'total_submissions', 'status_encoded', 'batch']


In [52]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

print("🔮 TEST DATA PROCESSING & PREDICTION")
print("="*70)

# ============================================================================
# 1. LOAD TEST DATASETS
# ============================================================================

print("\n📂 Step 1: Loading Test Datasets")
print("-" * 40)

# Test file mapping
test_file_names = {
    'absensi': 'MineToday Dataset/test/test_absensi.csv',
    'mini_project': 'MineToday Dataset/test/test_mini_project.csv', 
    'pendaftaran': 'MineToday Dataset/test/test_pendaftaran.csv',
    'pretest_ml': 'MineToday Dataset/test/test_pretest_ml.csv',
    'pretest_py': 'MineToday Dataset/test/test_pretest_py.csv',
    'pretest_st': 'MineToday Dataset/test/test_pretest_st.csv',
    'weekly_quiz': 'MineToday Dataset/test/test_weekly_quiz.csv'
}

def load_test_datasets():
    test_datasets = {}
    for name, path in test_file_names.items():
        try:
            test_datasets[name] = pd.read_csv(path)
            print(f"✅ {name}: {test_datasets[name].shape}")
        except FileNotFoundError:
            print(f"❌ {name}: File not found at {path}")
        except Exception as e:
            print(f"❌ {name}: Error loading - {e}")
    return test_datasets

# Load test data
test_datasets = load_test_datasets()

# Get unique test participant IDs
print(f"\n📋 Unique participants per test dataset:")
test_participant_ids = set()
for name, df in test_datasets.items():
    if 'id' in df.columns:
        unique_ids = df['id'].nunique()
        print(f"  {name}: {unique_ids} unique IDs")
        test_participant_ids.update(df['id'].unique())

print(f"\n👥 Total unique test participants: {len(test_participant_ids)}")

# Create master test participant table
test_master_df = pd.DataFrame({'id': list(test_participant_ids)})
print(f"✅ Test master table created with {len(test_master_df)} participants")

# ============================================================================
# 2. FEATURE ENGINEERING FOR TEST DATA (REUSE FUNCTIONS)
# ============================================================================

print(f"\n🛠️ Step 2: Feature Engineering for Test Data")
print("-" * 40)

# Reuse the same feature engineering functions from training
print("📊 Processing test attendance data...")
if 'absensi' in test_datasets:
    test_attendance_features = create_attendance_features(test_datasets['absensi'])
    print(f"  ✅ Test attendance features: {len(test_attendance_features)} participants")

print("📊 Processing test assessment data...")
test_assessment_features = create_assessment_features(test_datasets)
print(f"  ✅ Test assessment features: {len(test_assessment_features)} participants")

print("📊 Processing test submission data...")
test_submission_features = create_submission_features(test_datasets)
print(f"  ✅ Test submission features: {len(test_submission_features)} participants")

print("📊 Processing test registration data...")
if 'pendaftaran' in test_datasets:
    test_registration_features = create_registration_features(test_datasets['pendaftaran'])
    print(f"  ✅ Test registration features: {len(test_registration_features)} participants")

# ============================================================================
# 3. MERGE TEST FEATURES
# ============================================================================

print(f"\n🔄 Step 3: Merging Test Features")
print("-" * 40)

# Merge all test features
final_test_features = test_master_df.copy()

test_feature_sets = [
    ('attendance', test_attendance_features),
    ('assessment', test_assessment_features), 
    ('submission', test_submission_features),
    ('registration', test_registration_features)
]

for name, features in test_feature_sets:
    if features is not None and len(features) > 0:
        before_count = len(final_test_features)
        final_test_features = final_test_features.merge(features, on='id', how='left')
        after_count = len(final_test_features)
        print(f"  ✅ Merged test {name}: {before_count} → {after_count} rows")

print(f"\n🎯 Final test feature matrix: {final_test_features.shape}")
print(f"Test columns: {list(final_test_features.columns)}")

# ============================================================================
# 4. PREPARE TEST DATA FOR PREDICTION
# ============================================================================

print(f"\n🔧 Step 4: Prepare Test Data for Prediction")
print("-" * 40)

# Use the same feature columns as training
# (Exclude target and non-predictive columns)
exclude_cols = ['id', 'graduated_final', 'completion_score', 'performance_level']
feature_cols_for_prediction = [col for col in final_features_with_target.columns 
                               if col not in exclude_cols]

print(f"Expected feature columns: {len(feature_cols_for_prediction)}")

# Select only the feature columns that exist in test data
available_features = []
missing_features = []

for col in feature_cols_for_prediction:
    if col in final_test_features.columns:
        available_features.append(col)
    else:
        missing_features.append(col)

print(f"Available features in test: {len(available_features)}")
if missing_features:
    print(f"Missing features in test: {missing_features}")
    # Fill missing features with default values
    for col in missing_features:
        if col in final_features_with_target.columns:
            # Use median/mode from training data
            if final_features_with_target[col].dtype in ['float64', 'int64']:
                default_value = final_features_with_target[col].median()
            else:
                default_value = final_features_with_target[col].mode()[0] if not final_features_with_target[col].mode().empty else 'Unknown'
            final_test_features[col] = default_value
            available_features.append(col)
            print(f"  ✅ Filled missing feature '{col}' with default value: {default_value}")

# Prepare test feature matrix
X_test_final = final_test_features[available_features].copy()

# Handle missing values in test data (same strategy as training)
print(f"\nHandling missing values in test data...")
print(f"Missing values before imputation: {X_test_final.isnull().sum().sum()}")

for col in X_test_final.columns:
    if X_test_final[col].dtype in ['float64', 'int64']:
        # Use training data statistics for imputation
        if col in final_features_with_target.columns:
            fill_value = final_features_with_target[col].median()
        else:
            fill_value = X_test_final[col].median()
        X_test_final[col] = X_test_final[col].fillna(fill_value)
    else:
        if col in final_features_with_target.columns:
            fill_value = final_features_with_target[col].mode()[0] if not final_features_with_target[col].mode().empty else 'Unknown'
        else:
            fill_value = X_test_final[col].mode()[0] if not X_test_final[col].mode().empty else 'Unknown'
        X_test_final[col] = X_test_final[col].fillna(fill_value)

print(f"Missing values after imputation: {X_test_final.isnull().sum().sum()}")

🔮 TEST DATA PROCESSING & PREDICTION

📂 Step 1: Loading Test Datasets
----------------------------------------
✅ absensi: (760, 12)
✅ mini_project: (22, 5)
✅ pendaftaran: (21, 9)
✅ pretest_ml: (32, 14)
✅ pretest_py: (47, 14)
✅ pretest_st: (33, 19)
✅ weekly_quiz: (25, 5)

📋 Unique participants per test dataset:
  absensi: 48 unique IDs
  mini_project: 22 unique IDs
  pendaftaran: 21 unique IDs
  pretest_ml: 27 unique IDs
  pretest_py: 40 unique IDs
  pretest_st: 30 unique IDs
  weekly_quiz: 23 unique IDs

👥 Total unique test participants: 48
✅ Test master table created with 48 participants

🛠️ Step 2: Feature Engineering for Test Data
----------------------------------------
📊 Processing test attendance data...

📊 Processing Attendance Data...
  ✅ Test attendance features: 48 participants
📊 Processing test assessment data...

📊 Processing Assessment Scores...
  ✅ Test assessment features: 42 participants
📊 Processing test submission data...

📊 Processing Submission Data...
  ✅ Test submi

In [53]:
# ============================================================================
# 5. GENERATE PREDICTIONS
# ============================================================================

print(f"\n🔮 Step 5: Generate Predictions")
print("-" * 40)

# Make predictions using the best trained model
print("Generating predictions with trained model...")

try:
    # Predict probabilities and binary labels
    test_predictions_proba = best_model.predict_proba(X_test_final)[:, 1]
    test_predictions_binary = best_model.predict(X_test_final)
    
    print(f"✅ Predictions generated for {len(test_predictions_binary)} test participants")
    
    # Create results dataframe
    test_results = pd.DataFrame({
        'id': final_test_features['id'],
        'graduated_probability': test_predictions_proba,
        'graduated_prediction': test_predictions_binary
    })
    
    # Show prediction distribution
    pred_dist = pd.Series(test_predictions_binary).value_counts().sort_index()
    print(f"\nTest Prediction Distribution:")
    for label, count in pred_dist.items():
        pct = count / len(test_predictions_binary) * 100
        status = "Graduated" if label == 1 else "Not Graduated"
        print(f"  {status} ({label}): {count} ({pct:.1f}%)")
    
    print(f"\nPrediction Statistics:")
    print(f"  Mean probability: {test_predictions_proba.mean():.3f}")
    print(f"  Std probability:  {test_predictions_proba.std():.3f}")
    print(f"  Min probability:  {test_predictions_proba.min():.3f}")
    print(f"  Max probability:  {test_predictions_proba.max():.3f}")
    
except Exception as e:
    print(f"❌ Error generating predictions: {e}")
    print("Make sure the trained model 'best_model' is available from previous steps")



🔮 Step 5: Generate Predictions
----------------------------------------
Generating predictions with trained model...
✅ Predictions generated for 48 test participants

Test Prediction Distribution:
  Not Graduated (0): 33 (68.8%)
  Graduated (1): 15 (31.2%)

Prediction Statistics:
  Mean probability: 0.314
  Std probability:  0.463
  Min probability:  0.000
  Max probability:  1.000


In [56]:
# ============================================================================
# 6. PREPARE SUBMISSION FILE
# ============================================================================

print(f"\n📄 Step 6: Prepare Submission File")
print("-" * 40)

# Debug: Check if test_results exists and its structure
try:
    print(f"Test results shape: {test_results.shape}")
    print(f"Test results columns: {list(test_results.columns)}")
    print(f"Test results sample:")
    print(test_results.head(3))
except NameError:
    print("❌ test_results not defined. Creating it now...")
    test_results = pd.DataFrame({
        'id': final_test_features['id'],
        'graduated_probability': test_predictions_proba,
        'graduated_prediction': test_predictions_binary
    })
    print(f"✅ Created test_results: {test_results.shape}")

# Load prediction_id.csv to get required format
try:
    prediction_ids = pd.read_csv('MineToday Dataset/prediction_id.csv')
    print(f"✅ Loaded prediction_id.csv: {prediction_ids.shape}")
    print(f"Columns: {list(prediction_ids.columns)}")
    print(f"Sample data:")
    print(prediction_ids.head(3))
    
    # Check if 'id' column exists in prediction_ids (handle both cases)
    id_col = None
    if 'id' in prediction_ids.columns:
        id_col = 'id'
    elif 'ID' in prediction_ids.columns:
        id_col = 'ID'
        prediction_ids = prediction_ids.rename(columns={'ID': 'id'})
        print(f"Renamed 'ID' column to 'id'")
    elif 'Id' in prediction_ids.columns:
        id_col = 'Id'
        prediction_ids = prediction_ids.rename(columns={'Id': 'id'})
        print(f"Renamed 'Id' column to 'id'")
    
    if id_col is None:
        # Try to find ID column with different name
        possible_id_cols = [col for col in prediction_ids.columns if 'id' in col.lower()]
        if possible_id_cols:
            id_col = possible_id_cols[0]
            print(f"Found ID column: '{id_col}', renaming to 'id'")
            prediction_ids = prediction_ids.rename(columns={id_col: 'id'})
        else:
            print(f"❌ No ID column found in prediction_id.csv")
            print(f"Available columns: {list(prediction_ids.columns)}")
            raise KeyError("No ID column found")
    
    # Debug: Check test_results structure before merge
    print(f"\nBefore merge:")
    print(f"  prediction_ids IDs: {prediction_ids['id'].nunique()}")
    print(f"  test_results IDs: {test_results['id'].nunique()}")
    
    # Check for overlap
    common_ids = set(prediction_ids['id']).intersection(set(test_results['id']))
    print(f"  Common IDs: {len(common_ids)}")
    
    if len(common_ids) == 0:
        print("⚠️  Warning: No common IDs found between prediction_id.csv and test data")
        print("Sample prediction_ids IDs:", prediction_ids['id'].head(3).tolist())
        print("Sample test_results IDs:", test_results['id'].head(3).tolist())
    
    # Merge with predictions
    submission_df = prediction_ids.merge(
        test_results[['id', 'graduated_prediction']], 
        on='id', 
        how='left'
    )
    
    print(f"✅ Merge completed: {submission_df.shape}")
    
    # Check for missing predictions
    missing_predictions = submission_df['graduated_prediction'].isnull().sum()
    if missing_predictions > 0:
        print(f"⚠️  Warning: {missing_predictions} participants in prediction_id.csv not found in test data")
        
        # Fill with default prediction (majority class from training)
        try:
            majority_class = final_features_with_target['graduated_final'].mode()[0]
        except (NameError, KeyError):
            # Fallback to most common prediction in test results
            majority_class = test_results['graduated_prediction'].mode()[0] if len(test_results) > 0 else 1
            
        submission_df['graduated_prediction'] = submission_df['graduated_prediction'].fillna(majority_class)
        print(f"   Filled missing predictions with majority class: {majority_class} ({'Lulus' if majority_class == 1 else 'Tidak Lulus'})")
    
    # Rename column to match submission format
    if 'graduated_prediction' in submission_df.columns:
        # Convert 0/1 to "Tidak Lulus"/"Lulus"
        submission_df['label'] = submission_df['graduated_prediction'].map({
            0: 'Tidak Lulus',
            1: 'Lulus'
        })
        # Drop the old column
        submission_df = submission_df.drop('graduated_prediction', axis=1)
    
    # Ensure ID column is uppercase (as per submission format)
    if 'id' in submission_df.columns:
        submission_df = submission_df.rename(columns={'id': 'ID'})
    
    print(f"\n📊 Final Submission Shape: {submission_df.shape}")
    print(f"Submission columns: {list(submission_df.columns)}")
    print(f"\nSubmission sample:")
    print(submission_df.head(10))
    
    # Validate submission format
    if 'ID' in submission_df.columns and 'label' in submission_df.columns:
        print("✅ Submission format validated: ID,label with Lulus/Tidak Lulus")
    else:
        print("⚠️  Warning: Submission format may not match expected format")
    
    # Save submission file
    submission_filename = 'bootcamp_graduation_predictions.csv'
    submission_df.to_csv(submission_filename, index=False)
    print(f"\n💾 Submission file saved as: {submission_filename}")
    
    # Final validation
    if 'label' in submission_df.columns:
        final_dist = submission_df['label'].value_counts()
        print(f"\nFinal Submission Distribution:")
        for label, count in final_dist.items():
            pct = count / len(submission_df) * 100
            print(f"  {label}: {count} ({pct:.1f}%)")
    
except FileNotFoundError:
    print(f"❌ prediction_id.csv not found. Creating submission with all test participants...")
    
    # Create submission with all test results
    submission_df = test_results[['id', 'graduated_prediction']].copy()
    
    # Convert to proper format
    submission_df['label'] = submission_df['graduated_prediction'].map({
        0: 'Tidak Lulus',
        1: 'Lulus'
    })
    submission_df = submission_df.rename(columns={'id': 'ID'})
    submission_df = submission_df[['ID', 'label']]  # Keep only required columns
    
    submission_filename = 'bootcamp_graduation_predictions.csv'
    submission_df.to_csv(submission_filename, index=False)
    print(f"💾 Submission file saved as: {submission_filename}")

except Exception as e:
    print(f"❌ Error in submission preparation: {e}")
    print("Creating basic submission file...")
    
    # Fallback: Create basic submission
    try:
        basic_submission = pd.DataFrame({
            'ID': final_test_features['id'],
            'label': pd.Series(test_predictions_binary).map({
                0: 'Tidak Lulus',
                1: 'Lulus'
            })
        })
        
        submission_filename = 'bootcamp_graduation_predictions.csv'
        basic_submission.to_csv(submission_filename, index=False)
        print(f"💾 Basic submission file saved as: {submission_filename}")
        
    except Exception as e2:
        print(f"❌ Failed to create basic submission: {e2}")

print(f"\n✅ Test data processing and prediction generation completed!")

# Debug final summary
try:
    print(f"📋 Summary:")
    print(f"   • Processed {len(test_participant_ids)} test participants")
    print(f"   • Generated predictions using {best_config}")
    print(f"   • Created submission file: {submission_filename}")
    print(f"   • Ready for Kaggle submission! 🚀")
except NameError as e:
    print(f"📋 Summary (some variables may not be defined): {e}")


📄 Step 6: Prepare Submission File
----------------------------------------
Test results shape: (48, 3)
Test results columns: ['id', 'graduated_probability', 'graduated_prediction']
Test results sample:
                                     id  graduated_probability  \
0  2e73eb02-d3c9-4507-bd18-74911ed44215           2.280991e-17   
1  1c33125d-19b3-44c5-b2f2-e8dcd0863df5           2.155581e-17   
2  04eacef3-adce-4ab6-b5ea-4cbb4499d86a           2.265716e-17   

   graduated_prediction  
0                     0  
1                     0  
2                     0  
✅ Loaded prediction_id.csv: (48, 1)
Columns: ['ID']
Sample data:
                                     ID
0  0028102b-576f-4819-b1df-8c0e7ae0247b
1  008479a4-622a-4a22-8e5a-81e671535445
2  009762eb-c062-41fd-9a2b-144b65f33c3b
Renamed 'ID' column to 'id'

Before merge:
  prediction_ids IDs: 48
  test_results IDs: 48
  Common IDs: 48
✅ Merge completed: (48, 2)

📊 Final Submission Shape: (48, 2)
Submission columns: ['ID', 'label