In [None]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_curve, auc

# Global plotting defaults: seaborn grid theme plus figure size / font size
# applied to every figure created later in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

print(">>> The libraries are successfully loaded.")

In [None]:
# Load the semicolon-delimited dataset into `df`.
# If the file is missing, print the alert and then re-raise: every later cell
# needs `df`, so swallowing the error here would only postpone the failure to
# a confusing NameError further down the notebook.
try:
    df = pd.read_csv('data.csv', sep=';')
except FileNotFoundError:
    print("HATA: 'data.csv' dosyası bulunamadı.Tekrar deneyin.")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(f">>> Veri Seti Yüklendi. Boyut: {df.shape}")

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first 20 rows of the loaded data.
df.head(20)

In [None]:
# Overview of the dataset: per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Distribution of the original multi-class target, shown as a pie chart
# (percentages) and a bar chart (absolute counts).
#
# value_counts() orders classes by descending frequency, so a hard-coded label
# list can attach the wrong name to a slice/bar. The counts are therefore
# sorted by class name and every label is read from the Series index, which
# keeps each label tied to its own count.
data_pie = df['Target'].value_counts().sort_index()
class_labels = data_pie.index.tolist()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# PIE CHART: share of students per class
ax1.pie(
    data_pie,
    autopct="%.2f%%",
    labels=class_labels,
    colors=sns.color_palette('Set2')
)
ax1.set_title('Percentage of students')

# BAR CHART: number of students per class
sns.barplot(
    x=class_labels,
    y=data_pie.values,
    palette="Set2",
    ax=ax2
)

ax2.set_xlabel('Target Categories')
ax2.set_ylabel('Number of students')
ax2.set_title('Total number of students')

# LABELS: write the exact count just above each bar
for i, value in enumerate(data_pie.values):
    ax2.text(i, value, value, ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# Derive extra features from the existing columns to give the models more signal.

print(">>> Feature Engineering Process")

# A) Pass rate per semester: approved units / enrolled units.
#    A zero in the enrolled column is replaced by 1 so the division is defined.
for sem in ('1st', '2nd'):
    enrolled_units = df[f'Curricular units {sem} sem (enrolled)'].replace(0, 1)
    approved_units = df[f'Curricular units {sem} sem (approved)']
    df[f'App_Rate_{sem}'] = approved_units / enrolled_units

grade_1st = df['Curricular units 1st sem (grade)']
grade_2nd = df['Curricular units 2nd sem (grade)']

# B) Mean of the two semester grades
df['Grade_Avg'] = (grade_1st + grade_2nd) / 2

# C) Performance change: second-semester grade minus first-semester grade
df['Grade_Change'] = grade_2nd - grade_1st

# D) Economic pressure: unemployment rate multiplied by inflation rate
df['Eco_Stress'] = df['Unemployment rate'] * df['Inflation rate']

print(f">>> To increase performance our model new features added. Current Column Number: {df.shape[1]}")

In [None]:
print(">>> Data Preprocessing Steps!!!")

# 1. Collapse the target into two classes: 'Dropout' vs. everything else.
is_dropout = df['Target'].eq('Dropout')
df['Target_Binary'] = is_dropout.map({True: 'Dropout', False: 'Not Dropout'})

# 2. Separate the features from the (binary) label; both target columns are
#    removed from the feature matrix.
X = df.drop(columns=['Target', 'Target_Binary'])
y = df['Target_Binary']

# 3. Encode the string labels as integers. The mapping actually learned by the
#    encoder is printed below.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_codes = le.transform(le.classes_)
print(f">>> Encoding: {dict(zip(le.classes_, class_codes))}")

# 4. Stratified 80% / 20% train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# 5. Standardize the features. The scaler is fitted on the training part only
#    and then applied unchanged to the test part.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(">>> Data has been succesfully splitted and scaled.")

In [None]:
# Shape of `df` after the engineered feature columns and `Target_Binary` were added.
df.shape

In [None]:
# Preview the first 20 rows again, now including the derived columns.
df.head(20)

In [None]:
# Distribution of the binary target (Dropout vs. Not Dropout):
# pie chart with percentages on the left, bar chart with counts on the right.

target_counts = df['Target_Binary'].value_counts()
colors = sns.color_palette("Set2")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: percentage share of each class
ax1.pie(
    target_counts.values,
    labels=target_counts.index,
    autopct="%.2f%%",
    startangle=90,
    colors=colors,
)
ax1.set_title("Target Distribution With Percentages")

# Right panel: absolute number of students per class
sns.barplot(x=target_counts.index, y=target_counts.values, palette=colors, ax=ax2)
ax2.set_title("Target Distribution")
ax2.set_xlabel("Target Class")
ax2.set_ylabel("Number of Students")

# Annotate every bar with its count, placed just above the bar top
for position, count in enumerate(target_counts.values):
    ax2.text(position, count, count, ha='center', va='bottom', fontsize=13)

plt.tight_layout()
plt.show()


In [None]:
print(">>> Baseline Model (Dummy) is training...")

# Reference point for the real models: the 'stratified' strategy draws random
# labels following the class distribution seen in the training set.
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
dummy_clf.fit(X_train_scaled, y_train)

dummy_pred = dummy_clf.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_test, dummy_pred)

print(f"Baseline Accuracy: %{baseline_accuracy*100:.2f}")

In [None]:
print(">>> Initiating Hyperparameter Tuning for the Random Forest model using GridSearchCV...")

# Hyperparameter values to try (2 x 3 x 2 = 12 combinations)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}

# Base estimator; class_weight='balanced' compensates for the class imbalance
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Exhaustive search scored by weighted F1 with 3-fold cross-validation,
# using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

# Best configuration, already refitted on the full training set
best_rf = grid_search.best_estimator_
print(f"\n>>> Best Parameters: {grid_search.best_params_}")

In [None]:
print(">>> Voting Classifier (RF + HGB + GB) ...")

# The two boosting members of the ensemble
hgb_clf = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=200,
    random_state=42,
)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Soft voting over the tuned Random Forest and both boosting models
ensemble_members = [
    ('rf', best_rf),
    ('hgb', hgb_clf),
    ('gb', gb_clf),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

voting_clf.fit(X_train_scaled, y_train)
print(">>> Voting Classifier training completed.")

In [None]:
print(">>> Evaluation is being performed on the test set...")

# Hard predictions feed the metrics and confusion matrix; the probability of
# the class encoded as 1 feeds the ROC curve.
test_pred = voting_clf.predict(X_test_scaled)
test_probs = voting_clf.predict_proba(X_test_scaled)[:, 1]

# Metrics
test_accuracy = accuracy_score(y_test, test_pred)
test_f1 = f1_score(y_test, test_pred, average='weighted')

print(f"\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
print(f" TEST ACCURACY: %{test_accuracy*100:.2f}")
print(f" TEST F1-SCORE: %{test_f1*100:.2f}")
print(f"\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
print(classification_report(y_test, test_pred, target_names=le.classes_))

# One figure, two panels: confusion matrix (left) and ROC curve (right)
eval_fig, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(14, 6))

# Panel 1: confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, test_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax_cm,
)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Gerçek Durum')
ax_cm.set_xlabel('Tahmin Edilen Durum')

# Panel 2: ROC curve with its area, plus the diagonal chance line
fpr, tpr, thresholds = roc_curve(y_test, test_probs)
roc_auc = auc(fpr, tpr)

ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")

plt.tight_layout()
plt.show()

# Separate figure: the 15 largest feature importances of the tuned Random
# Forest, drawn as horizontal bars with the largest at the top
importance_fig, ax_imp = plt.subplots(figsize=(10, 6))
importances = pd.Series(best_rf.feature_importances_, index=X.columns)
top_importances = importances.nlargest(15).sort_values(ascending=True)
top_importances.plot(kind='barh', color='#2ecc71', ax=ax_imp)
ax_imp.set_title('Feature Importance (Tuned Random Forest)')
ax_imp.set_xlabel('Önem Derecesi')
plt.show()

In [None]:
# Importance of every feature in the tuned Random Forest, in column order.
feature_importances = best_rf.feature_importances_

all_imp_fig, all_imp_ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=feature_importances, y=X.columns, ax=all_imp_ax)
all_imp_ax.set_title("Feature Importance")
plt.show()