# Proyek Akhir: Menyelesaikan Permasalahan Perusahaan Edutech

- Nama : Nisa Agni Afifah
- Email : agniafifah21@gmail.com
- Id Dicoding : ichaa_agni

## Persiapan

### Menyiapkan library yang dibutuhkan

In [None]:
# Import semua library yang diperlukan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import xgboost as xgb
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_curve, roc_auc_score, classification_report, f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

: 

### Menyiapkan data yang akan digunakan

In [None]:
# Load the raw student dataset; the file uses ';' as its field delimiter.
students_df = pd.read_csv("data.csv", delimiter=";")

## Data Understanding

In [None]:
# Show the first 5 rows of the dataframe
students_df.head()

In [None]:
# Show the number of rows and columns of the dataframe
students_df.shape

In [None]:
# Show the data type of every dataframe column
students_df.dtypes

In [None]:
# Show descriptive statistics restricted to the object-dtype columns
students_df.describe(include='object')

In [None]:
# Show descriptive statistics using describe()'s default column selection
students_df.describe()

In [None]:
# Count the fully duplicated rows in the dataframe
students_df.duplicated().sum()

In [None]:
# Count the missing values in each column of the dataframe
students_df.isna().sum()

## Data Preparation / Preprocessing

### Exploratory Data Analysis

In [None]:
# Show how many rows fall into each distinct "Status" value
students_df['Status'].value_counts()

In [None]:
# Keep only the rows whose Status is not 'Enrolled'.
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignments performed later on students_df (for example
# students_df['Status'] = ...) do not act on a slice of the original frame and
# do not raise pandas' SettingWithCopyWarning.
students_df = students_df[students_df.Status != 'Enrolled'].copy()

In [None]:
# Pie chart of the "Status" distribution
status_counts = students_df['Status'].value_counts()
labels = status_counts.index
sizes = status_counts.values
# One pastel colour per slice
colors = sns.color_palette('pastel', len(labels))

# Draw the chart on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    sizes,
    labels=labels,
    colors=colors,
    startangle=90,
    autopct='%1.1f%%',
    textprops={'fontsize': 14},
)
ax.set_title('Distribution of Status')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Encode the target as integers: Dropout -> 0, Graduate -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
status_to_int = {'Dropout': 0, 'Graduate': 1}
students_df['Status'] = students_df['Status'].map(status_to_int)

In [None]:
# Numeric columns (int64 / float64 dtypes)
numerical_cols = students_df.select_dtypes(include=['int64', 'float64']).columns

# Histogram with a KDE overlay for the first six numeric columns, on a 2x3 grid
plt.figure(figsize=(15, 10))
for position, col in enumerate(numerical_cols[:6], start=1):
    ax = plt.subplot(2, 3, position)
    sns.histplot(students_df[col], kde=True, ax=ax)
    ax.set_title(f'Distribusi {col}')
plt.tight_layout()
plt.show()

In [None]:
# Categorical columns (object dtype)
# NOTE(review): if students_df has no object-dtype columns at this point, the
# loop below never runs and an empty figure is shown - confirm this is intended.
categorical_cols = students_df.select_dtypes(include='object').columns

# Frequency bar chart for the first six categorical columns, on a 2x3 grid
plt.figure(figsize=(15, 10))
for position, col in enumerate(categorical_cols[:6], start=1):
    ax = plt.subplot(2, 3, position)
    value_frequencies = students_df[col].value_counts()
    value_frequencies.plot.bar(ax=ax)
    ax.set_title(f'Frekuensi {col}')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix of the numeric columns, rendered as an annotated heatmap
corr_matrix = students_df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Matriks Korelasi")
plt.show()

In [None]:
# Show the correlation between "Status" and every other variable
students_df.corr()['Status']

In [None]:
# Bar chart of the correlation between "Status" and the other variables,
# sorted in descending order of absolute value.
# numeric_only=True matches the correlation-matrix cell above and keeps the
# call working even if a non-numeric column is present.
correlations = students_df.corr(numeric_only=True)['Status']

# Drop the target's correlation with itself: it is always 1.0, carries no
# information, and would otherwise be the tallest bar of the chart.
# The plotted heights are absolute values, so the sign of each correlation is
# not visible in this chart.
sorted_corr = correlations.drop('Status').abs().sort_values(ascending=False)

plt.figure(figsize=(15, 10))
plt.bar(sorted_corr.index, sorted_corr.values)
plt.xlabel('Features')
plt.ylabel('Correlation with Status')
plt.title('Features Sorted by Correlation with Status')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Convert several numerically coded variables into readable strings

# Course code -> course name
course_mapping = {
        33: "Biofuel Production Technologies",
        171: "Animation and Multimedia Design",
        8014: "Social Service (evening attendance)",
        9003: "Agronomy",
        9070: "Communication Design",
        9085: "Veterinary Nursing",
        9119: "Informatics Engineering",
        9130: "Equinculture",
        9147: "Management",
        9238: "Social Service",
        9254: "Tourism",
        9500: "Nursing",
        9556: "Oral Hygiene",
        9670: "Advertising and Marketing Management",
        9773: "Journalism and Communication",
        9853: "Basic Education",
        9991: "Management (evening attendance)"
}

# Binary 0/1 columns and the label for each code. The keys are strings because
# every column is cast with astype(str) before the replacement.
binary_label_maps = {
    # FIX: the original mapping was inverted ('0' -> Male, '1' -> Female). The
    # published description of the source dataset codes Gender as 1 = male,
    # 0 = female. NOTE(review): confirm against the documentation shipped with
    # data.csv.
    'Gender': {'0': 'Female', '1': 'Male'},
    'Displaced': {'0': 'No', '1': 'Yes'},
    'Debtor': {'0': 'No', '1': 'Yes'},
    'Scholarship_holder': {'0': 'No', '1': 'Yes'},
    'Tuition_fees_up_to_date': {'0': 'No', '1': 'Yes'},
    'Daytime_evening_attendance': {'0': 'Evening', '1': 'Daytime'},
    'Status': {'0': 'Dropout', '1': 'Graduate'},
}
for column, code_labels in binary_label_maps.items():
    students_df[column] = students_df[column].astype(str).replace(code_labels)

# Course: replace the codes with names, then label-encode the names back to
# integers. le_course holds the name <-> integer mapping.
# NOTE(review): the 'Course' column therefore stays numeric in students_df, and
# le_course is needed to translate a course name into the encoded value.
course_names = students_df['Course'].replace(course_mapping).astype(str)
le_course = LabelEncoder()
students_df['Course'] = le_course.fit_transform(course_names)

In [None]:
# Write the prepared dataframe to data_clean.csv without the index column
students_df.to_csv('data_clean.csv', index=False)

## Modeling

In [None]:
# Define the target column and the features used for modelling

target_col = 'Status'

# Enrollment, financial and demographic attributes
profile_features = [
    'Course',
    'Daytime_evening_attendance',
    'Displaced',
    'Debtor',
    'Tuition_fees_up_to_date',
    'Gender',
    'Scholarship_holder',
    'Age_at_enrollment',
]

# Curricular-unit counts and grade for the first semester
first_semester_features = [
    'Curricular_units_1st_sem_credited',
    'Curricular_units_1st_sem_enrolled',
    'Curricular_units_1st_sem_approved',
    'Curricular_units_1st_sem_grade',
]

# Curricular-unit counts and grade for the second semester
second_semester_features = [
    'Curricular_units_2nd_sem_credited',
    'Curricular_units_2nd_sem_enrolled',
    'Curricular_units_2nd_sem_approved',
    'Curricular_units_2nd_sem_grade',
]

selected_features = (
    profile_features + first_semester_features + second_semester_features
)

# Independent working copy holding the selected features plus the target
df_selected = students_df[[*selected_features, target_col]].copy()

In [None]:
# Split the selected FEATURES into categorical and numeric columns.
# The dtype inspection is restricted to selected_features so that the target
# column never ends up in either list. Otherwise the object-typed target would
# be label-encoded together with the features and then encoded a second time by
# the dedicated target encoder, whose classes would become the integers 0/1
# instead of the original labels. The target would also appear in the
# categorical_cols list that is persisted with the model, although it is not a
# model input.
feature_frame = df_selected[selected_features]
categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Kolom kategorikal:", categorical_cols)
print("Kolom numerik:", numerical_cols)

In [None]:
# Label-encode the categorical columns

# One fitted LabelEncoder per column, keyed by column name
encoders = {
    col: LabelEncoder().fit(df_selected[col]) for col in categorical_cols
}

# Replace each column with its integer codes
for col, fitted_encoder in encoders.items():
    df_selected[col] = fitted_encoder.transform(df_selected[col])

In [None]:
# Label-encode the target column and register its encoder next to the others
le_status = LabelEncoder().fit(df_selected[target_col])
encoders[target_col] = le_status
df_selected[target_col] = le_status.transform(df_selected[target_col])

In [None]:
# Correlation between the numeric features and the target
corr_matrix = df_selected[[*numerical_cols, target_col]].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".2f", ax=ax)
ax.set_title("Korelasi Fitur Numerik dengan Target")
plt.show()

In [None]:
# Multicollinearity check: flag every pair of columns in corr_matrix whose
# absolute correlation exceeds 0.8.
# Only the strict upper triangle (k=1) is inspected, so that:
#   * the diagonal is skipped by position instead of by comparing floats with
#     1.0, a test that also hid genuinely perfectly correlated pairs;
#   * each pair is reported once rather than twice (a-b and b-a).
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
high_corr_pairs = (corr_matrix.abs() > 0.8) & upper_triangle
if high_corr_pairs.any().any():
    print("\n⚠️ Terdeteksi korelasi tinggi antar fitur:")
    # Masked-out cells become NaN; stack() turns the remaining cells into a
    # (row, column) -> correlation Series.
    print(corr_matrix[high_corr_pairs].stack())
else:
    print("\n✅ Tidak ada multikolinearitas parah (korelasi > 0.8).")

In [None]:
# Standardize the numeric columns of df_selected in place
scaler = StandardScaler().fit(df_selected[numerical_cols])
df_selected[numerical_cols] = scaler.transform(df_selected[numerical_cols])

In [None]:
# Build the feature matrix and the target vector
X = df_selected[selected_features].copy()
y = df_selected[target_col].copy()

# The numeric columns of df_selected were already standardized when `scaler`
# was fitted, so they are deliberately NOT scaled a second time here.
# Fitting a new StandardScaler on already standardized data would overwrite
# `scaler` with a near-identity transform (mean ~ 0, std ~ 1). That object is
# the one persisted for inference, where it would leave raw inputs effectively
# unscaled although the model was trained on standardized features.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split; fitting it on the training portion only would avoid leaking test-set
# statistics into training.

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each partition
for caption, partition in (
    ("Jumlah data latih:", X_train),
    ("Jumlah data uji:", X_test),
):
    print(caption, partition.shape)


In [None]:
# Candidate models to train.
# The estimators that draw random numbers during fitting get a fixed
# random_state (the same seed as the train/test split), so the metrics and the
# resulting model ranking are reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'XGBoost': xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )
}

In [None]:
# Train every candidate model and collect its test-set metrics
results = []
trained_models = {}

for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    trained_models[name] = model

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    # Support-weighted averages across the classes
    weighted_avg = report['weighted avg']

    results.append({
        "Model": name,
        "Akurasi": acc,
        "Precision": weighted_avg['precision'],
        "Recall": weighted_avg['recall'],
        "F1-Score": weighted_avg['f1-score'],
    })

## Evaluation

In [None]:
# Show the evaluation table and pick the model with the highest accuracy
results_df = (
    pd.DataFrame(results)
    .sort_values(by="Akurasi", ascending=False)
    .reset_index(drop=True)
)

print("📊 Hasil Evaluasi Model:\n")
print(results_df.to_string(index=False))

# After the descending sort, the first row is the top-scoring model
top_row = results_df.iloc[0]
best_model_name = top_row['Model']
best_model = trained_models[best_model_name]
print(f"\n✅ Model terbaik adalah: {best_model_name} dengan akurasi: {top_row['Akurasi']:.4f}")

## Deployment


In [None]:
import os
import joblib

# Create the model folder if it does not exist yet
os.makedirs("model", exist_ok=True)

# Bundle the model with the preprocessing objects needed at prediction time
model_data = {
    # FIX: persist the selected best model. `model` is only the loop variable
    # left over from training and always refers to the last entry of `models`,
    # regardless of which model scored best.
    "model": best_model,
    "encoders": encoders,
    "scaler": scaler,
    "categorical_cols": categorical_cols,
    "numerical_cols": numerical_cols,
    "feature_names": X.columns.tolist()  # preserves the column order for prediction
}

# Save the bundle to a .joblib file
joblib.dump(model_data, "model/best_model.joblib")

print("\n✅ Model, encoder, scaler berhasil disimpan ke model/best_model.joblib")
