In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gradio as gr
from scipy.stats import linregress
import warnings

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the dropout workbook inside the mounted Drive folder.
DATA_PATH = r"/content/drive/MyDrive/proyek/data_putus_sekolah.xlsx"

# Read the workbook into a DataFrame and preview its first rows as the cell output.
df = pd.read_excel(DATA_PATH)
df.head()

Unnamed: 0,Tahun,Kecamatan,Laki-laki (L) - Tingkat - X,Laki-laki (L) - Tingkat - XI,Laki-laki (L) - Tingkat - XII,Laki-laki (L) - Subjml,Perempuan (P) - Tingkat - X,Perempuan (P) - Tingkat - XI,Perempuan (P) - Tingkat - XII,Perempuan (P) - Subjml,Laki-laki dan Perempuan - Tingkat - X,Laki-laki dan Perempuan - Tingkat - XI,Laki-laki dan Perempuan - Tingkat - XII,Laki-laki dan Perempuan - Jumlah
0,2021,Kec. Mariso,8,0,0,8,2,3,3,8,10,3,3,16
1,2021,Kec. Mamajang,4,0,0,4,1,1,0,2,5,1,0,6
2,2021,Kec. Tamalate,3,0,0,3,1,0,0,1,4,0,0,4
3,2021,Kec. Makasar,0,1,1,2,0,1,0,1,0,2,1,3
4,2021,Kec. Ujung Pandang,1,0,2,3,1,0,0,1,2,0,2,4


In [4]:
# Rows must carry the year, the district and both per-gender subtotals to be usable;
# exact duplicate rows are discarded afterwards. Both steps modify `df` in place.
required_columns = ['Tahun', 'Kecamatan', 'Laki-laki (L) - Subjml', 'Perempuan (P) - Subjml']
df.dropna(subset=required_columns, inplace=True)
df.drop_duplicates(inplace=True)

# Alphabetically ordered district names (later offered as dropdown choices in the UI).
kecamatan_list = df['Kecamatan'].unique().tolist()
kecamatan_list.sort()

def calculate_trend(series):
    """Return the least-squares slope of ``series`` against its positions 0..n-1.

    Args:
        series: pandas Series of numeric values (one rolling window).

    Returns:
        The fitted slope, or ``0.0`` when the window has fewer than two
        observations, fewer than two distinct values, or when the regression
        raises ``ValueError``.
    """
    window_size = len(series)
    if window_size < 2 or series.nunique() < 2:
        return 0.0

    positions = np.arange(window_size)
    try:
        fit = linregress(positions, series.values)
    except ValueError:
        return 0.0
    return fit.slope

def _rolling_trend(frame, value_col, out_name, window=4, min_periods=2):
    """Per-district rolling slope of ``value_col``, indexed like ``frame``.

    Rows are ordered by year (stable sort) before rolling so each window spans
    consecutive years even if the sheet itself is not in chronological order.
    The original row labels are kept, so the result aligns with ``frame``.
    """
    return (
        frame.sort_values('Tahun', kind='stable')
        .groupby('Kecamatan')[value_col]
        .rolling(window=window, min_periods=min_periods)
        .apply(calculate_trend, raw=False)
        .reset_index(level=0, drop=True)
        .rename(out_name)
    )


df_trend_ll = _rolling_trend(df, 'Laki-laki (L) - Subjml', 'Tren_LL')
df_trend_pp = _rolling_trend(df, 'Perempuan (P) - Subjml', 'Tren_PP')

# Index-aligned column assignment instead of DataFrame.join: re-running the cell
# overwrites the columns rather than failing on overlapping column names.
df['Tren_LL'] = df_trend_ll
df['Tren_PP'] = df_trend_pp

# Windows with fewer than `min_periods` observations yield NaN; treat those as "no trend".
# Only the trend columns are filled so missing values elsewhere are not turned into zeros.
df[['Tren_LL', 'Tren_PP']] = df[['Tren_LL', 'Tren_PP']].fillna(0)

# Binary risk label per gender: 1 when the district-year subtotal is at or above the
# column median, else 0. The median is computed once per column and compared in a single
# vectorised step, instead of being recomputed for every row inside a lambda.
median_ll = df['Laki-laki (L) - Subjml'].quantile(0.50)
median_pp = df['Perempuan (P) - Subjml'].quantile(0.50)

df['Risiko_LL'] = (df['Laki-laki (L) - Subjml'] >= median_ll).astype(int)
df['Risiko_PP'] = (df['Perempuan (P) - Subjml'] >= median_pp).astype(int)

# One-hot encode the district column; drop_first=True leaves the first district
# without a dummy column (it is represented by all dummies being zero).
df_cls_processed = pd.get_dummies(df, columns=['Kecamatan'], drop_first=True)

# Feature order: every district dummy column, then both trend columns, then the year.
district_dummy_cols = [name for name in df_cls_processed.columns if 'Kecamatan_' in name]
features_cls_list = district_dummy_cols + ['Tren_LL', 'Tren_PP', 'Tahun']

# Shared feature matrix and one binary target per gender.
X_cls = df_cls_processed[features_cls_list]
y_ll, y_pp = df_cls_processed['Risiko_LL'], df_cls_processed['Risiko_PP']

# 70/30 hold-out split per gender target, stratified on that target.
split_kwargs = {'test_size': 0.3, 'random_state': 42}
X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X_cls, y_ll, stratify=y_ll, **split_kwargs)
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_cls, y_pp, stratify=y_pp, **split_kwargs)

# Both gender models share the same random-forest configuration.
forest_kwargs = {
    'n_estimators': 250,
    'max_depth': 7,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,
}

# Male (LL) model: fit on the training split, predict the held-out split.
MODEL_LL_CLS = RandomForestClassifier(**forest_kwargs).fit(X_train_ll, y_train_ll)
y_pred_ll = MODEL_LL_CLS.predict(X_test_ll)

# Female (PP) model: same procedure on its own split.
MODEL_PP_CLS = RandomForestClassifier(**forest_kwargs).fit(X_train_pp, y_train_pp)
y_pred_pp = MODEL_PP_CLS.predict(X_test_pp)

print(" Modeling Gender-Specific Selesai.")

# Mean 5-fold cross-validated accuracy. Note that this is computed on the full
# feature matrix (X_cls), i.e. it also covers the rows of the hold-out split.
cv_kwargs = {'cv': 5, 'scoring': 'accuracy', 'n_jobs': -1}
CV_ACCURACY_LL = cross_val_score(MODEL_LL_CLS, X_cls, y_ll, **cv_kwargs).mean()
CV_ACCURACY_PP = cross_val_score(MODEL_PP_CLS, X_cls, y_pp, **cv_kwargs).mean()

# Hold-out metrics for the male (LL) model.
ACCURACY_FINAL_LL = accuracy_score(y_test_ll, y_pred_ll)
ACCURACY_REPORT_LL = classification_report(y_test_ll, y_pred_ll, output_dict=True)
CONFUSION_MATRIX_LL = confusion_matrix(y_test_ll, y_pred_ll)

# Hold-out metrics for the female (PP) model.
ACCURACY_FINAL_PP = accuracy_score(y_test_pp, y_pred_pp)
ACCURACY_REPORT_PP = classification_report(y_test_pp, y_pred_pp, output_dict=True)
CONFUSION_MATRIX_PP = confusion_matrix(y_test_pp, y_pred_pp)

# Names assigned at the top level of a cell are already module-level globals, so no
# `global` declaration is needed. (Declaring names `global` after they were assigned in
# the same module is also a SyntaxError when the cell is compiled as one unit.)

# Column order of the training feature matrix; prediction inputs must follow it.
X_CLS_COLUMNS = X_cls.columns.tolist()
print(f"Total Fitur: {len(X_CLS_COLUMNS)}")

 Modeling Gender-Specific Selesai.
Total Fitur: 17


In [5]:
# Progress banner printed before the Gradio helper functions below are defined.
print("\n2. Mendefinisikan Fungsi Logika Gradio...")

def plot_confusion_matrix(cm, title):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        cm: Confusion matrix of integer counts; the tick labels assume the two
            classes "low risk (0)" and "high risk (1)".
        title: Title shown above the heatmap.

    Returns:
        The matplotlib Figure containing the heatmap.
    """
    class_labels = ['Risiko Rendah (0)', 'Risiko Tinggi (1)']

    fig, ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel('Aktual')
    ax.set_xlabel('Prediksi')
    fig.tight_layout()

    # Unregister the figure from pyplot so repeated UI calls do not accumulate open
    # figures; the returned Figure object itself stays usable for rendering.
    plt.close(fig)
    return fig

def plot_feature_importance(model_ll, model_pp, feature_names):
    """Plot the five most important features of each gender model side by side.

    Args:
        model_ll: Fitted model for the male target exposing ``feature_importances_``.
        model_pp: Fitted model for the female target exposing ``feature_importances_``.
        feature_names: Feature names in the order the models were trained on.

    Returns:
        The matplotlib Figure with two horizontal bar charts (male left, female right).
    """
    dummy_prefix = 'Kecamatan_'

    def _display_name(feature):
        # Strip the one-hot prefix. District values in the data already start with
        # "Kec." (e.g. "Kec. Mariso"), so the prefix is only added when it is missing;
        # otherwise the label would read "Kec. Kec. Mariso".
        if not feature.startswith(dummy_prefix):
            return feature
        district = feature[len(dummy_prefix):]
        return district if district.startswith('Kec.') else f'Kec. {district}'

    panels = (
        (model_ll, 'Top 5 Feature Importance (Laki-laki)', 'Blues_d'),
        (model_pp, 'Top 5 Feature Importance (Perempuan)', 'Reds_d'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    for ax, (model, panel_title, palette) in zip(axes, panels):
        top_features = (
            pd.Series(model.feature_importances_, index=feature_names)
            .sort_values(ascending=False)
            .head(5)
        )
        # Plot with the display labels directly instead of overwriting tick labels later.
        labels = [_display_name(str(name)) for name in top_features.index]
        sns.barplot(x=top_features.values, y=labels, ax=ax, palette=palette)
        ax.set_title(panel_title)
        ax.tick_params(axis='y', labelsize=10)

    # Title is set on this figure explicitly rather than on pyplot's "current" figure.
    fig.suptitle('Visualisasi Algoritma: Feature Importance Berdasarkan Gender', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Unregister the figure from pyplot so repeated UI calls do not accumulate open figures.
    plt.close(fig)
    return fig

def create_combined_plot(proba_ll, proba_pp, selected_kecamatan, tahun_prediksi):
    """Bar chart comparing the high-risk probability of both genders.

    Args:
        proba_ll: Class probabilities from the male model; index 1 is read as the
            high-risk probability.
        proba_pp: Class probabilities from the female model; index 1 is read as the
            high-risk probability.
        selected_kecamatan: District name shown in the title.
        tahun_prediksi: Prediction year shown in the title.

    Returns:
        The matplotlib Figure with one bar per gender and a dashed 0.5 reference line.
    """
    df_plot = pd.DataFrame({
        'Gender': ['Laki-laki', 'Perempuan'],
        'Probabilitas Risiko Tinggi': [proba_ll[1], proba_pp[1]]
    })

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x='Gender', y='Probabilitas Risiko Tinggi', data=df_plot, palette=['#3498db', '#e74c3c'], ax=ax)
    ax.set_title(f'Probabilitas Risiko Tinggi di {selected_kecamatan} ({tahun_prediksi})')
    ax.set_ylim(0, 1)
    ax.axhline(0.5, color='gray', linestyle='--', linewidth=1, label='Batas Keputusan (0.5)')
    ax.legend(loc='upper right')
    fig.tight_layout()

    # Unregister the figure from pyplot so every button click does not leave another
    # open figure behind; the returned Figure object stays usable for rendering.
    plt.close(fig)
    return fig

def predict_risk_gendered(selected_kecamatan, tahun_input):
    """Predict the per-gender dropout-risk class for one district and target year.

    A single feature row is built (district one-hot, target year, extrapolated
    trends) and scored with both gender-specific classifiers.

    Args:
        selected_kecamatan: District name as it appears in the ``Kecamatan`` column.
        tahun_input: Target year; any value ``int()`` can convert.

    Returns:
        ``(summary_text, figure, table)`` on success. When the year is not
        convertible or the models fail, ``(error_message, None, None)``.
    """

    # An emptied input field can arrive as None (TypeError in int()); non-numeric text
    # and NaN raise ValueError, infinity raises OverflowError. All are reported the same way.
    try:
        tahun_prediksi = int(tahun_input)
    except (TypeError, ValueError, OverflowError):
        return "ERROR: Tahun harus berupa angka bulat.", None, None

    # NOTE(review): presumably the last year present in the training data — confirm.
    Y_LAST_TRAIN = 2024

    year_offset = tahun_prediksi - Y_LAST_TRAIN

    # Start from an all-zero row in the training column order.
    input_df_cls = pd.DataFrame([np.zeros(len(X_CLS_COLUMNS))], columns=X_CLS_COLUMNS)

    kec_col_name = f'Kecamatan_{selected_kecamatan}'
    if kec_col_name in input_df_cls.columns:
        input_df_cls[kec_col_name] = 1
        district_rows = X_cls[kec_col_name] == 1
    else:
        # No dummy column: this is the district dropped by drop_first=True, which is
        # encoded as "all district dummies are zero".
        dummy_cols = [col for col in X_CLS_COLUMNS if 'Kecamatan_' in col]
        district_rows = (X_cls[dummy_cols] == 0).all(axis=1)

    input_df_cls['Tahun'] = tahun_prediksi

    def _latest_nonzero(trend_values):
        # Last non-zero, non-missing trend value in row order; 0 when none exists.
        usable = trend_values[trend_values != 0].dropna()
        return usable.iloc[-1] if not usable.empty else 0

    # Use the trend history of the selected district, not the last row of the whole
    # dataset (which belongs to whichever district happens to be listed last).
    base_trend_ll = _latest_nonzero(X_cls.loc[district_rows, 'Tren_LL'])
    base_trend_pp = _latest_nonzero(X_cls.loc[district_rows, 'Tren_PP'])

    input_df_cls['Tren_LL'] = base_trend_ll * year_offset
    input_df_cls['Tren_PP'] = base_trend_pp * year_offset

    try:
        proba_ll = MODEL_LL_CLS.predict_proba(input_df_cls)[0]
        pred_ll = MODEL_LL_CLS.predict(input_df_cls)[0]

        proba_pp = MODEL_PP_CLS.predict_proba(input_df_cls)[0]
        pred_pp = MODEL_PP_CLS.predict(input_df_cls)[0]
    except Exception as e:
        # Surface any model failure in the UI instead of crashing the callback.
        return f"ERROR saat memprediksi: {e}", None, None

    ll_status = "TINGGI" if pred_ll == 1 else "Rendah"
    ll_warna = "darkred" if pred_ll == 1 else "darkgreen"
    pp_status = "TINGGI" if pred_pp == 1 else "Rendah"
    pp_warna = "darkred" if pred_pp == 1 else "darkgreen"

    if pred_ll == 1 and pred_pp == 1:
        kesimpulan = " Keduanya: Risiko sangat tinggi untuk **Laki-laki** dan **Perempuan**. Perlu intervensi total."
    elif pred_ll == 1:
        kesimpulan = " Perhatian pada **Laki-laki**: Risiko LL tinggi, PP rendah. Fokus intervensi pada program yang menargetkan siswa LL."
    elif pred_pp == 1:
        kesimpulan = " Perhatian pada **Perempuan**: Risiko PP tinggi, LL rendah. Fokus intervensi pada program yang menargetkan siswa PP."
    else:
        kesimpulan = " Aman: Risiko rendah untuk kedua gender. Lakukan monitoring rutin."

    summary_text = (
        f" Hasil Prediksi Risiko Putus Sekolah (Tahun {tahun_prediksi})\n"
        f" Lokasi: {selected_kecamatan} \n"
        f" Analisis Gender:\n"
        f" Laki-laki (LL): Status Risiko: <span style='color:{ll_warna}; font-weight: bold;'>{ll_status}</span> (Probabilitas Tinggi: {proba_ll[1]*100:.0f}%)\n"
        f" Perempuan (PP): Status Risiko: <span style='color:{pp_warna}; font-weight: bold;'>{pp_status}</span> (Probabilitas Tinggi: {proba_pp[1]*100:.0f}%)\n"
        f" Kesimpulan & Rekomendasi:\n"
        f" {kesimpulan} "
    )

    df_proba = pd.DataFrame({
        'Gender': ['Laki-laki', 'Perempuan'],
        'Status Risiko': [ll_status, pp_status],
        'Probabilitas Risiko Tinggi (%)': [f"{proba_ll[1]*100:.2f}%", f"{proba_pp[1]*100:.2f}%"]
    })

    plot_fig = create_combined_plot(proba_ll, proba_pp, selected_kecamatan, tahun_prediksi)

    return summary_text, plot_fig, df_proba

def show_evaluation_results_gendered():
    """Assemble the accuracy report, feature-importance plot and confusion matrices.

    Reads the module-level metrics and fitted models of both gender-specific
    classifiers.

    Returns:
        ``(report_markdown, feature_importance_figure, cm_figure_ll, cm_figure_pp)``.
    """

    # The "Precision" labels carry an opening and a closing `**`; a closing marker
    # without an opener is rendered as literal asterisks by Markdown.
    accuracy_report_md = (
        f" Laporan Akurasi Model Klasifikasi (Gender-Specific)\n"
        f"Model: Random Forest Classifier\n"
        f" Model Laki-laki (LL):\n"
        f" Rata-rata Akurasi CV (5 Folds): `{CV_ACCURACY_LL:.4f}`\n"
        f" Akurasi Test Set: `{ACCURACY_FINAL_LL:.4f}`\n"
        f" **Precision (Risiko Tinggi):** `{ACCURACY_REPORT_LL['1']['precision']:.4f}`\n"
        f"\n Model Perempuan (PP):\n"
        f" Rata-rata Akurasi CV (5 Folds): `{CV_ACCURACY_PP:.4f}`\n"
        f" Akurasi Test Set: `{ACCURACY_FINAL_PP:.4f}`\n"
        f" **Precision (Risiko Tinggi):** `{ACCURACY_REPORT_PP['1']['precision']:.4f}`\n"
    )

    feature_plot_fig = plot_feature_importance(MODEL_LL_CLS, MODEL_PP_CLS, X_CLS_COLUMNS)

    cm_plot_ll = plot_confusion_matrix(CONFUSION_MATRIX_LL, 'Confusion Matrix Laki-laki')
    cm_plot_pp = plot_confusion_matrix(CONFUSION_MATRIX_PP, 'Confusion Matrix Perempuan')

    return accuracy_report_md, feature_plot_fig, cm_plot_ll, cm_plot_pp


2. Mendefinisikan Fungsi Logika Gradio...


In [6]:
# Progress banner printed before the interface is built.
print("\n3. Meluncurkan Antarmuka Gradio (Tata Letak Evaluasi Diperbaiki)...")

# Gradio "Soft" theme with a sky primary hue and a light grey page background.
theme = gr.themes.Soft(primary_hue="sky").set(body_background_fill="#f7f7f7")

# Two-tab Gradio app: tab 1 runs a prediction for a district/year, tab 2 shows the
# evaluation report and plots of both gender-specific models.
with gr.Blocks(theme=theme, title="Deployment Analisis Gender dan Prediksi Risiko Putus Sekolah") as demo:

    # Page header banner.
    gr.HTML(
        """
        <div style="text-align: center; padding: 20px; background-color: #0ea5e9; color: white; border-radius: 10px;">
        <h1 style="margin: 0; font-size: 2.2em;"> Analisis Gender dan Prediksi Risiko Putus Sekolah Siswa SMA/MA</h1>
        <p style="margin: 5px 0 0; font-style: italic;">Sistem Identifikasi Risiko Berbasis Gender dan Tren</p>
        </div>
        """
    )

    with gr.Tab(" Prediksi Risiko Tahun Berikutnya"):
        with gr.Row(variant="panel"):
            # Left column: prediction inputs.
            with gr.Column(scale=1, variant="panel"):
                gr.Markdown(" Pilih Parameter Prediksi")
                Tahun_Input = gr.Number(label="Tahun Prediksi yang Diinginkan", value=2026, minimum=2025, step=1, interactive=True)
                kecamatan_input = gr.Dropdown(label="Pilih Kecamatan", choices=kecamatan_list, value=kecamatan_list[0], interactive=True)
                predict_btn = gr.Button("LIHAT ANALISIS GENDER & PREDIKSI", variant="primary")

            # Right column: summary text, probability chart and probability table.
            with gr.Column(scale=2, variant="panel"):
                gr.Markdown(" Hasil Prediksi dan Analisis Gender")
                output_summary = gr.Markdown(label="Ringkasan Prediksi", value="Pilih tahun dan kecamatan, lalu klik tombol.", visible=True)
                output_plot = gr.Plot(label="Perbandingan Probabilitas Risiko Tinggi (Laki-laki vs Perempuan)", show_label=True)
                output_table = gr.Dataframe(label="Probabilitas Risiko (%)", interactive=False, visible=True)

        # Input order matches predict_risk_gendered(selected_kecamatan, tahun_input).
        predict_btn.click(
            fn=predict_risk_gendered,
            inputs=[kecamatan_input, Tahun_Input],
            outputs=[output_summary, output_plot, output_table]
        )

    # Tab label: the microscope emoji was stored as mis-decoded UTF-8 ("ðŸ”¬"); restored here.
    with gr.Tab("🔬 Evaluasi & Algoritma"):
        gr.Markdown(" Laporan Akurasi Model & Visualisasi Algoritma")

        with gr.Row():

            with gr.Column(scale=1):
                evaluation_btn = gr.Button("TAMPILKAN EVALUASI GENDER-SPECIFIC", variant="secondary", min_width=250)
                gr.Markdown("")

            with gr.Column(scale=3):
                accuracy_text_output = gr.Markdown(label="Laporan Akurasi Klasifikasi")

                feature_plot_output = gr.Plot(label="Feature Importance (Per Gender)")

                gr.Markdown(" Confusion Matrix per Gender")
                with gr.Row():
                    cm_plot_ll = gr.Plot(label="Confusion Matrix Laki-laki")
                    cm_plot_pp = gr.Plot(label="Confusion Matrix Perempuan")


        # Output order matches the tuple returned by show_evaluation_results_gendered().
        evaluation_btn.click(
            fn=show_evaluation_results_gendered,
            inputs=[],
            outputs=[accuracy_text_output, feature_plot_output, cm_plot_ll, cm_plot_pp]
        )

# Start the app. share=True additionally requests a public URL (see the
# "Running on public URL" line in the cell output).
# NOTE(review): presumably anyone holding that link can use the app while it is live — confirm
# this exposure is intended.
demo.launch(share=True)


3. Meluncurkan Antarmuka Gradio (Tata Letak Evaluasi Diperbaiki)...


  with gr.Blocks(theme=theme, title="Deployment Analisis Gender dan Prediksi Risiko Putus Sekolah") as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://615040a1a9f5a445e7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


