menghubungkan ke drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths read later in this
# notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

memanggil dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE

In [None]:

# Load the training and test spreadsheets from the mounted Drive folder.
train_path = '/content/drive/MyDrive/praktikum_ml/praktikum10/data/kelulusan_train.xls'
test_path = '/content/drive/MyDrive/praktikum_ml/praktikum10/data/kelulusan_test.xls'

df_train = pd.read_excel(train_path)
df_test = pd.read_excel(test_path)

# Show the test set as the cell output.
df_test

In [None]:
# Show the training set as the cell output.
df_train

mengecek struktur dataset

In [None]:
# Print the column summary (names, non-null counts, dtypes) of the training set.
df_train.info()

In [None]:
# Print the column summary (names, non-null counts, dtypes) of the test set.
df_test.info()

# Data Cleaning

pemeriksaan nilai unik pada kolom kategorikal

In [None]:
# Print the distinct values found in each categorical column of the training set.
categorical_columns = (
    'JENIS KELAMIN',
    'STATUS MAHASISWA',
    'STATUS NIKAH',
    'STATUS KELULUSAN',
)
for column in categorical_columns:
    print(df_train[column].unique())


menghapus kolom yang tidak digunakan

In [None]:
# Remove the 'STATUS NIKAH' column from both splits, modifying them in place.
for frame in (df_train, df_test):
    frame.drop(columns=['STATUS NIKAH'], inplace=True)

mengubah nilai kategorikal menjadi numerik

In [None]:
# Per-column mapping from category labels to integer codes.
replacements = {
    'JENIS KELAMIN': {'PEREMPUAN': 0, 'LAKI - LAKI': 1},
    'STATUS MAHASISWA': {'MAHASISWA': 0, 'BEKERJA': 1},
    'STATUS KELULUSAN': {'TEPAT': 0, 'TERLAMBAT': 1},
}

# Apply the same mapping to both splits so their encodings agree.
df_train, df_test = (frame.replace(replacements) for frame in (df_train, df_test))

# Preview the first encoded training rows.
df_train.head()

melakukan encoding pada dataset uji

In [None]:
# Encode the categorical columns of the test set with the `replacements`
# mapping. The mapping only matches the original text labels, so running it on
# data that is already encoded leaves the values unchanged.
df_test = df_test.replace(replacements)

# Preview the last rows of the test set to confirm the encoding.
df_test.tail()

cek missing value

In [None]:
# Count missing values per column in the training set.
df_train.isnull().sum()

In [None]:
# Count missing values per column in the test set.
df_test.isnull().sum()

menangani missing value

In [None]:
# Discard training rows where 'IPS 8' or 'IPK ' is missing.
# NOTE: the column is referenced as 'IPK ' with a trailing space; presumably
# this matches the spreadsheet header exactly.
df_train = df_train.dropna(subset=['IPS 8', 'IPK '])

In [None]:
# Discard test rows where 'IPS 8' or 'IPK ' is missing.
# NOTE: the column is referenced as 'IPK ' with a trailing space; presumably
# this matches the spreadsheet header exactly.
df_test = df_test.dropna(subset=['IPS 8', 'IPK '])

mengecek kembali missing value

In [None]:
# Re-count missing values per column in the test set after dropping rows.
df_test.isnull().sum()

In [None]:
# Re-count missing values per column in the training set after dropping rows.
df_train.isnull().sum()

menghapus kolom yang tidak digunakan dalam analisis

In [None]:
# Remove the 'NAMA' column from the training set and preview the result.
df_train = df_train.drop(labels='NAMA', axis='columns')
df_train.head()

In [None]:
# Remove the 'NAMA' column from the test set and preview the result.
df_test = df_test.drop(labels='NAMA', axis='columns')
df_test.head()

# Analisis korelasi fitur

menghitung matriks korelasi menggunakan spearman

In [None]:
# Spearman rank correlation between every pair of columns in the training set.
correlation_matrix = df_train.corr(method='spearman')

# Render the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=ax,
)
ax.set_title("Heatmap Korelasi (Spearman)")
plt.show()

# Visualisasi Distribusi Status Kelulusan

In [None]:
# Bar chart of the number of training rows in each 'STATUS KELULUSAN' class.
plt.figure(figsize=(8, 6))
sns.countplot(x='STATUS KELULUSAN', data=df_train, palette='Set2')

# Title and axis labels.
plt.title('Distribusi Status Kelulusan', fontsize=16)
plt.xlabel('Status Kelulusan (1 = Terlambat, 0 = Tepat Waktu)', fontsize=12)
plt.ylabel('Jumlah Mahasiswa', fontsize=12)

# Legend with one entry per class. The labels must be two separate strings;
# a single combined string yields only one legend entry. Labels are matched to
# the drawn bars in order, so this assumes the bar for class 0 is drawn first.
plt.legend(
    title='Status Kelulusan',
    labels=['Tepat waktu (0)', 'Terlambat (1)'],
    loc='upper right',
)

# Show the figure.
plt.show()

# Tahapan Modeling

pemisahan data dan normalisasi



In [None]:
# Separate the target column from the features in both splits.
target_column = 'STATUS KELULUSAN'

# Training data.
y_train = df_train[target_column]
x_train = df_train.drop(columns=[target_column])

# Held-out data used for evaluation.
y_test = df_test[target_column]
x_test = df_test.drop(columns=[target_column])

# Standardise the features: the scaler is fitted on the training features only
# and the same fitted scaler is then applied to both splits.
scaller = StandardScaler().fit(x_train)
x_train_scaled = scaller.transform(x_train)
x_test_scaled = scaller.transform(x_test)


# menangani ketidakseimbangan kelas dengan SMOTE

In [None]:
# Report the class counts of the training labels before oversampling.
print("Before SMOTE")
counts_before = y_train.value_counts()
print(counts_before)

# Oversample the scaled training data with SMOTE (fixed seed, one neighbour).
sm = SMOTE(k_neighbors=1, random_state=42)
x_train_smote, y_train_smote = sm.fit_resample(x_train_scaled, y_train)

# Report the class counts again after oversampling.
print("After SMOTE")
counts_after = y_train_smote.value_counts()
print(counts_after)

# menentukan parameter terbaik (hyperparameter tuning)

In [None]:
# Search space for KNN: neighbour count 1..30, two weighting schemes and two
# distance metrics.
param_grid = dict(
    n_neighbors=list(range(1, 31)),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Exhaustive 5-fold search on the SMOTE-resampled training data.
# NOTE(review): the folds are drawn from data that was resampled beforehand, so
# validation folds can contain synthetic samples and the score may be optimistic.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=1,
)
grid.fit(x_train_smote, y_train_smote)

print("grid.best_params:", grid.best_params_)
print("best score:", grid.best_score_)

# melatih model KNN terbaik dan melakukan prediksi

In [None]:
# Take the best estimator found by the grid search and fit it on the resampled
# training data (fit returns the estimator itself, so the call can be chained).
best_knn = grid.best_estimator_.fit(x_train_smote, y_train_smote)

# Predict labels for the scaled test features.
y_pred = best_knn.predict(x_test_scaled)

# evaluasi model dengan classification report

In [None]:
# Print per-class precision, recall and F1 for the test-set predictions.
# The header starts with a newline; "\C" is not a valid escape sequence and
# would print a literal backslash.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# evaluasi dengan confusion matrix

In [None]:
# Confusion matrix of true versus predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap with integer cell labels.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', ax=ax)
ax.set_title("Confussion Matrix KNN (Best Params)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

# validasi model dengan cross validation

In [None]:
# 5-fold cross-validation of the selected model on the resampled training data.
# NOTE(review): the data was resampled before splitting into folds, so these
# scores may be optimistic compared with the held-out test set.
cv_scores = cross_val_score(
    estimator=best_knn,
    X=x_train_smote,
    y=y_train_smote,
    cv=5,
)
mean_accuracy = cv_scores.mean()
accuracy_std = cv_scores.std()
print("Cross Validation Accuracy:", mean_accuracy)
print("Std Dev:", accuracy_std)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!git config --global user.email "jamilatunnisa996@gmail.com"
!git config --global user.name "Jamilatun Khoerunnisa"

!git clone https://github.com/Jamilatun/ti03_Mila_01101222254.git
%cd ti03_Mila_01101222254

!cp -r /content/drive/MyDrive/praktikum_ml/praktikum10/ ./praktikum10

!ls -lah

!git add .
!git commit -m "Upload praktikum10 dari Colab"

from getpass import getpass
import os

token = getpass("Masukkan GitHub Token: ")

os.system(f'git remote set-url origin https://{token}@github.com/Jamilatun/ti03_Mila_01101222254.git')

!git push origin main