In [6]:
import pandas as pd

df = pd.read_csv('data.csv',delimiter=';')

df['Target'].value_counts()

df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [10]:
X = df.drop(columns=['Target'])
y = df['Target']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Define which columns are numerical and which is the course column
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
course_column = ['Course']

# Remove 'Course' from numerical_columns if it's there
if 'Course' in numerical_columns:
    numerical_columns.remove('Course')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('course', OneHotEncoder(handle_unknown='ignore'), course_column)
    ],
    remainder='passthrough'
)

X_preprocessed = preprocessor.fit_transform(X)

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"X_preprocessed shape: {X_preprocessed.shape}")
print(f"y_encoded shape: {y_encoded.shape}")
print(f"Classes: {le.classes_}")

[0 2 0 ... 0 2 2]
X_preprocessed shape: (4424, 52)
y_encoded shape: (4424,)
Classes: ['Dropout' 'Enrolled' 'Graduate']


In [20]:
from sklearn.model_selection import train_test_split

# Data Splitting - using common MLOps best practices
RANDOM_STATE = 42
TEST_SIZE = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, 
    y_encoded, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE,
    stratify=y_encoded  # Maintains class distribution in train/test split
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Training set class distribution: {pd.Series(y_train).value_counts()}")
print(f"Test set class distribution: {pd.Series(y_test).value_counts()}")

Training set size: 3539
Test set size: 885
Training set class distribution: 2    1767
0    1137
1     635
Name: count, dtype: int64
Test set class distribution: 2    442
0    284
1    159
Name: count, dtype: int64


In [24]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score,recall_score,f1_score
import time

# Define hyperparameters for each model
# You can easily tune these parameters here
params = {
    'random_forest': {
        'n_estimators': 100,
        'max_depth': 10,
        'random_state': RANDOM_STATE,
        'n_jobs': -1
    },
    'logistic_regression': {
        'max_iter': 1000,
        'random_state': RANDOM_STATE,
        'n_jobs': -1
    },
    'svc': {
        'kernel': 'rbf',
        'random_state': RANDOM_STATE
    },
    'decision_tree': {
        'max_depth': 10,
        'random_state': RANDOM_STATE
    },
    'gradient_boosting': {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 5,
        'random_state': RANDOM_STATE
    },
    'adaboost': {
        'n_estimators': 50,
        'learning_rate': 1.0,
        'random_state': RANDOM_STATE
    },
    'knn': {
        'n_neighbors': 5,
        'n_jobs': -1
    },
    'gaussian_nb': {}
}

# Initialize models with specified parameters
models = {
    'RandomForest': RandomForestClassifier(**params['random_forest']),
    'LogisticRegression': LogisticRegression(**params['logistic_regression']),
    'SVC': SVC(**params['svc']),
    'DecisionTree': DecisionTreeClassifier(**params['decision_tree']),
    'GradientBoosting': GradientBoostingClassifier(**params['gradient_boosting']),
    'AdaBoost': AdaBoostClassifier(**params['adaboost']),
    'KNN': KNeighborsClassifier(**params['knn']),
    'GaussianNB': GaussianNB(**params['gaussian_nb'])
}

print("Models initialized successfully!")
print(f"Number of models: {len(models)}")

Models initialized successfully!
Number of models: 8


In [29]:
# Train all models and store results
results = {}

print("Training models...\n")
print("="*70)

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # Track training time
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate training time
    training_time = time.time() - start_time
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test,y_pred,average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results[model_name] = {
        'model': model,
        'accuracy': accuracy,
        'training_time': training_time,
        'predictions': y_pred
    }
    
    print(f"  ‚úì Accuracy: {accuracy:.4f}")
    print(f"  ‚úì Precision: {precision:.4f}")
    print(f"  ‚úì Recall: {recall:.4f}")
    print(f"  ‚úì f1: {f1:.4f}")
    print(f"  ‚úì Training time: {training_time:.2f} seconds")

print("\n" + "="*70)
print("All models trained successfully!")

Training models...


Training RandomForest...
  ‚úì Accuracy: 0.7695
  ‚úì Precision: 0.7510
  ‚úì Recall: 0.7695
  ‚úì f1: 0.7455
  ‚úì Training time: 0.22 seconds

Training LogisticRegression...
  ‚úì Accuracy: 0.7718
  ‚úì Precision: 0.7542
  ‚úì Recall: 0.7718
  ‚úì f1: 0.7561
  ‚úì Training time: 2.17 seconds

Training SVC...
  ‚úì Accuracy: 0.7582
  ‚úì Precision: 0.7450
  ‚úì Recall: 0.7582
  ‚úì f1: 0.7449
  ‚úì Training time: 0.55 seconds

Training DecisionTree...
  ‚úì Accuracy: 0.7062
  ‚úì Precision: 0.7101
  ‚úì Recall: 0.7062
  ‚úì f1: 0.7061
  ‚úì Training time: 0.03 seconds

Training GradientBoosting...
  ‚úì Accuracy: 0.7605
  ‚úì Precision: 0.7541
  ‚úì Recall: 0.7605
  ‚úì f1: 0.7552
  ‚úì Training time: 5.16 seconds

Training AdaBoost...
  ‚úì Accuracy: 0.7503
  ‚úì Precision: 0.7338
  ‚úì Recall: 0.7503
  ‚úì f1: 0.7382
  ‚úì Training time: 0.23 seconds

Training KNN...
  ‚úì Accuracy: 0.6678
  ‚úì Precision: 0.6474
  ‚úì Recall: 0.6678
  ‚úì f1: 0.6551
  ‚úì Train

In [34]:
# Install MLflow if not already installed
# !pip install mlflow

import mlflow
import mlflow.sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import os

def setup_mlflow():
    """Setup MLflow tracking with local file storage"""
    # Use local file-based tracking (no server needed!)
    mlflow_tracking_uri = "./mlruns"
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    
    experiment_name = "Hyperparameter_Tuning"
    
    # Create or get experiment
    try:
        experiment_id = mlflow.create_experiment(experiment_name)
        print(f"‚úì Created new experiment: {experiment_name}")
    except:
        experiment = mlflow.get_experiment_by_name(experiment_name)
        experiment_id = experiment.experiment_id
        print(f"‚úì Using existing experiment: {experiment_name}")
    
    mlflow.set_experiment(experiment_name)
    print(f"‚úì MLflow tracking URI: {mlflow_tracking_uri}")
    print(f"‚úì Experiment ID: {experiment_id}")
    print(f"\nTo view results later, run in terminal:")
    print(f"  mlflow ui --backend-store-uri {mlflow_tracking_uri}")
    
    return experiment_name

# Setup MLflow
experiment_name = setup_mlflow()

‚úì Created new experiment: Hyperparameter_Tuning
‚úì MLflow tracking URI: ./mlruns
‚úì Experiment ID: 938850411266971545

To view results later, run in terminal:
  mlflow ui --backend-store-uri ./mlruns


  return FileStore(store_uri, store_uri)


In [35]:
# Define models and their hyperparameter search spaces
models_to_tune = {
    'RandomForest': (RandomForestClassifier(random_state=RANDOM_STATE), {
        'n_estimators': [500],
        'max_depth': [10, None],
        'min_samples_split': [10],
        'min_samples_leaf': [1],
        'max_features': ['sqrt']
    }),
    'GradientBoosting': (GradientBoostingClassifier(random_state=RANDOM_STATE), {
        'n_estimators': [400],
        'learning_rate': [0.1],
        'max_depth': [4],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'max_features': ['sqrt']
    })
}

print("Models configured for hyperparameter tuning:")
for model_name in models_to_tune.keys():
    print(f"  ‚Ä¢ {model_name}")

Models configured for hyperparameter tuning:
  ‚Ä¢ RandomForest
  ‚Ä¢ GradientBoosting


In [36]:
def hyperparameter_tuning(model_name, model, param_dist, X_train, y_train, X_val, y_val, n_iter=10, cv=5):
    """
    Perform hyperparameter tuning with RandomizedSearchCV and log results to MLflow
    
    Parameters:
    - model_name: Name of the model
    - model: Sklearn model instance
    - param_dist: Dictionary of hyperparameter distributions
    - X_train, y_train: Training data
    - X_val, y_val: Validation data
    - n_iter: Number of parameter settings sampled
    - cv: Number of cross-validation folds
    
    Returns:
    - best_model: The best trained model
    - best_params: The best hyperparameters
    - best_score: The best validation score
    """
    
    print(f"\n{'='*70}")
    print(f"Tuning {model_name}...")
    print(f"{'='*70}")
    
    # Start MLflow run
    with mlflow.start_run(run_name=f"{model_name}_tuning"):
        
        # Perform RandomizedSearchCV
        random_search = RandomizedSearchCV(
            model, 
            param_distributions=param_dist, 
            n_iter=n_iter, 
            cv=cv, 
            scoring='f1_weighted',  # Good for multi-class imbalanced data
            n_jobs=-1, 
            verbose=2, 
            random_state=RANDOM_STATE
        )
        
        # Fit the model
        start_time = time.time()
        random_search.fit(X_train, y_train)
        training_time = time.time() - start_time
        
        # Get best model and parameters
        best_model = random_search.best_estimator_
        best_params = random_search.best_params_
        best_cv_score = random_search.best_score_
        
        # Evaluate on validation set
        y_val_pred = best_model.predict(X_val)
        
        val_accuracy = accuracy_score(y_val, y_val_pred)
        val_precision = precision_score(y_val, y_val_pred, average='weighted')
        val_recall = recall_score(y_val, y_val_pred, average='weighted')
        val_f1 = f1_score(y_val, y_val_pred, average='weighted')
        
        # Log parameters to MLflow
        mlflow.log_params(best_params)
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("n_iter", n_iter)
        mlflow.log_param("cv_folds", cv)
        
        # Log metrics to MLflow
        mlflow.log_metric("best_cv_f1_score", best_cv_score)
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.log_metric("val_precision", val_precision)
        mlflow.log_metric("val_recall", val_recall)
        mlflow.log_metric("val_f1_score", val_f1)
        mlflow.log_metric("training_time_seconds", training_time)
        
        # Log the model to MLflow
        mlflow.sklearn.log_model(best_model, f"{model_name}_model")
        
        # Print results
        print(f"\n‚úì Best CV F1 Score: {best_cv_score:.4f}")
        print(f"‚úì Validation Accuracy: {val_accuracy:.4f}")
        print(f"‚úì Validation F1 Score: {val_f1:.4f}")
        print(f"‚úì Training Time: {training_time:.2f} seconds")
        print(f"\nBest Parameters:")
        for param, value in best_params.items():
            print(f"  ‚Ä¢ {param}: {value}")
        
        return best_model, best_params, val_f1

print("‚úì Hyperparameter tuning function ready")

‚úì Hyperparameter tuning function ready


In [37]:
# Run hyperparameter tuning for all models
tuned_results = {}

print("\n" + "="*70)
print("STARTING HYPERPARAMETER TUNING WITH MLFLOW TRACKING")
print("="*70)
print(f"\nView live results at: http://localhost:5000")

for model_name, (model, param_dist) in models_to_tune.items():
    
    # Run hyperparameter tuning
    best_model, best_params, val_f1 = hyperparameter_tuning(
        model_name=model_name,
        model=model,
        param_dist=param_dist,
        X_train=X_train,
        y_train=y_train,
        X_val=X_test,  # Using test set as validation
        y_val=y_test,
        n_iter=10,  # Number of random combinations to try
        cv=5  # 5-fold cross-validation
    )
    
    # Store results
    tuned_results[model_name] = {
        'model': best_model,
        'params': best_params,
        'val_f1_score': val_f1
    }

print("\n" + "="*70)
print("HYPERPARAMETER TUNING COMPLETED!")
print("="*70)

# Display summary
print("\nSummary of Tuned Models:")
for model_name, result in tuned_results.items():
    print(f"\n{model_name}:")
    print(f"  Validation F1 Score: {result['val_f1_score']:.4f}")

# Find the best overall model
best_overall_model = max(tuned_results.items(), key=lambda x: x[1]['val_f1_score'])
print(f"\nüèÜ Best Overall Model: {best_overall_model[0]}")
print(f"   F1 Score: {best_overall_model[1]['val_f1_score']:.4f}")


STARTING HYPERPARAMETER TUNING WITH MLFLOW TRACKING

View live results at: http://localhost:5000

Tuning RandomForest...
Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   5.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   6.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   6.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   6.1s
[CV] END max_depth=None, m




‚úì Best CV F1 Score: 0.7580
‚úì Validation Accuracy: 0.7774
‚úì Validation F1 Score: 0.7596
‚úì Training Time: 15.02 seconds

Best Parameters:
  ‚Ä¢ n_estimators: 500
  ‚Ä¢ min_samples_split: 10
  ‚Ä¢ min_samples_leaf: 1
  ‚Ä¢ max_features: sqrt
  ‚Ä¢ max_depth: None

Tuning GradientBoosting...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.5s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.5s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.5s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.5s
[CV] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples




‚úì Best CV F1 Score: 0.7720
‚úì Validation Accuracy: 0.7627
‚úì Validation F1 Score: 0.7567
‚úì Training Time: 7.99 seconds

Best Parameters:
  ‚Ä¢ n_estimators: 400
  ‚Ä¢ min_samples_split: 2
  ‚Ä¢ min_samples_leaf: 1
  ‚Ä¢ max_features: sqrt
  ‚Ä¢ max_depth: 4
  ‚Ä¢ learning_rate: 0.1

HYPERPARAMETER TUNING COMPLETED!

Summary of Tuned Models:

RandomForest:
  Validation F1 Score: 0.7596

GradientBoosting:
  Validation F1 Score: 0.7567

üèÜ Best Overall Model: RandomForest
   F1 Score: 0.7596


In [38]:
import joblib
import os
from datetime import datetime

# Create a directory to save models
models_dir = 'saved_models'
os.makedirs(models_dir, exist_ok=True)

# Optional: Create a timestamped subdirectory for this training run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
run_dir = os.path.join(models_dir, f'run_{timestamp}')
os.makedirs(run_dir, exist_ok=True)

print("Saving models...\n")
print("="*70)

# Save each trained model
for model_name, model_data in results.items():
    model = model_data['model']
    accuracy = model_data['accuracy']
    
    # Create filename with accuracy for easy reference
    model_filename = f"{model_name}_acc{accuracy:.4f}.pkl"
    model_path = os.path.join(run_dir, model_filename)
    
    # Save the model
    joblib.dump(model, model_path)
    
    print(f"‚úì Saved {model_name} to {model_path}")

# Also save the preprocessor (very important for deployment!)
preprocessor_path = os.path.join(run_dir, 'preprocessor.pkl')
joblib.dump(preprocessor, preprocessor_path)
print(f"\n‚úì Saved preprocessor to {preprocessor_path}")

# Save the label encoder (needed to decode predictions back to class names)
label_encoder_path = os.path.join(run_dir, 'label_encoder.pkl')
joblib.dump(le, label_encoder_path)
print(f"‚úì Saved label encoder to {label_encoder_path}")

print("\n" + "="*70)
print(f"All models saved successfully in: {run_dir}")
print("="*70)

Saving models...

‚úì Saved RandomForest to saved_models/run_20251111_035749/RandomForest_acc0.7695.pkl
‚úì Saved LogisticRegression to saved_models/run_20251111_035749/LogisticRegression_acc0.7718.pkl
‚úì Saved SVC to saved_models/run_20251111_035749/SVC_acc0.7582.pkl
‚úì Saved DecisionTree to saved_models/run_20251111_035749/DecisionTree_acc0.7062.pkl
‚úì Saved GradientBoosting to saved_models/run_20251111_035749/GradientBoosting_acc0.7605.pkl
‚úì Saved AdaBoost to saved_models/run_20251111_035749/AdaBoost_acc0.7503.pkl
‚úì Saved KNN to saved_models/run_20251111_035749/KNN_acc0.6678.pkl
‚úì Saved GaussianNB to saved_models/run_20251111_035749/GaussianNB_acc0.6599.pkl

‚úì Saved preprocessor to saved_models/run_20251111_035749/preprocessor.pkl
‚úì Saved label encoder to saved_models/run_20251111_035749/label_encoder.pkl

All models saved successfully in: saved_models/run_20251111_035749
