# Academic Risk Prediction - Model Training

This notebook implements the machine learning pipeline for predicting academic risk using student data from the UMBC Neo4j graph database.

## Steps:
1. Load and explore the ML dataset
2. Split data into training and testing sets
3. Train baseline Logistic Regression model
4. Train LightGBM model
5. Evaluate and compare models
6. Save the trained LightGBM model


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
import lightgbm as lgb
import joblib
import warnings
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

# Confirm the imports worked and report the library versions in use
print(
    "All libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    f"LightGBM version: {lgb.__version__}",
    sep="\n",
)


## Step 1: Load the ML Dataset

Load the `ml_data.csv` file that was created from the Neo4j graph database.


In [None]:
# Load the ML dataset
try:
    # Keep the guarded region to the read itself so the handlers below only
    # ever report problems that come from loading the file
    df = pd.read_csv('ml_data.csv')
except FileNotFoundError:
    print("ml_data.csv not found!")
    print("Please make sure the file exists in the current directory.")
    print("You may need to run the data extraction notebook first.")
except Exception as e:
    # Best-effort: report any other loading problem instead of aborting the cell
    print(f"Error loading data: {str(e)}")
else:
    # Success path: summarise what was loaded
    print(" Successfully loaded ml_data.csv")
    print(f" Dataset shape: {df.shape}")
    print(f" Columns: {list(df.columns)}")
    print("\n First few rows:")
    print(df.head())

    print("\n Dataset info:")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would append a stray "None" line to the output
    df.info()

    print("\n Target variable distribution:")
    if 'academic_risk' in df.columns:
        print(df['academic_risk'].value_counts())
    else:
        print("'academic_risk' column not found. Available columns:")
        print(df.columns.tolist())


## Step 2: Data Preprocessing and Train-Test Split

Prepare the data for machine learning by separating the features from the target and label-encoding any categorical feature columns, then split the result into 80% training and 20% testing sets.

In [None]:
# Data preprocessing and train-test split
def prepare_data(df):
    """
    Split a DataFrame into train/test feature and target sets.

    The target is the first of 'academic_risk', 'risk_level', 'at_risk' or
    'target' that appears among the columns. Feature columns of object or
    category dtype are replaced by integer codes via label encoding.

    Args:
        df: pandas DataFrame holding the features plus the target column,
            or None when no dataset could be loaded.

    Returns:
        (X_train, X_test, y_train, y_test) from an 80/20 split with
        random_state=42, or (None, None, None, None) when df is None or
        empty, or when no recognised target column is present.
    """
    # Fail the same way as the "no target" case so later cells can rely on
    # their `X_train is not None` checks
    if df is None or df.empty:
        print("No data available. Please load the dataset first.")
        return None, None, None, None

    # Identify target variable (assuming it's 'academic_risk' or similar)
    target_columns = ['academic_risk', 'risk_level', 'at_risk', 'target']
    target_col = next((col for col in target_columns if col in df.columns), None)

    if target_col is None:
        print("No target column found. Please specify the target column name.")
        return None, None, None, None

    # Separate features and target; drop() returns a new frame, so the
    # encoding below does not modify the caller's DataFrame
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Encode non-numeric feature columns (object and pandas categorical)
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        print(f" Found categorical columns: {list(categorical_columns)}")
        # For now, we'll use label encoding (you can improve this later)
        from sklearn.preprocessing import LabelEncoder
        for col in categorical_columns:
            # A fresh encoder per column keeps each column's mapping separate
            X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # A stratified split needs at least 2 samples in every class; fall back
    # to a plain random split rather than crashing on a rare class
    can_stratify = y.value_counts().min() >= 2
    if not can_stratify:
        print(" Some classes have fewer than 2 samples; splitting without stratification.")

    # Split the data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42,
        stratify=y if can_stratify else None
    )

    print(" Data split completed:")
    print(f"   Training set: {X_train.shape[0]} samples")
    print(f"   Testing set: {X_test.shape[0]} samples")
    print(f"   Features: {X_train.shape[1]}")
    print("   Target distribution in training set:")
    print(f"   {y_train.value_counts()}")

    return X_train, X_test, y_train, y_test

# Prepare the data; pass None when the loading cell did not define `df`, so
# the failure is reported instead of raising NameError
X_train, X_test, y_train, y_test = prepare_data(df if 'df' in locals() else None)


## Step 3: Train Baseline Logistic Regression Model

Train a baseline Logistic Regression model and evaluate its F1-score on the test set.


In [None]:
# Train baseline Logistic Regression model
def train_baseline_model(X_train, X_test, y_train, y_test):
    """
    Fit a Logistic Regression baseline and report how it does on the test set.

    Prints the weighted F1, precision and recall, followed by the
    classification report and the confusion matrix for the X_test predictions.

    Returns:
        (fitted LogisticRegression model, weighted F1-score on the test set)
    """
    print("🚀 Training baseline Logistic Regression model...")

    # Fit the classifier and predict the held-out samples
    baseline = LogisticRegression(random_state=42, max_iter=1000)
    baseline.fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    # Collect the weighted-average metrics, keyed by their display label
    scores = {
        'F1-Score': f1_score(y_test, predictions, average='weighted'),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'Recall': recall_score(y_test, predictions, average='weighted'),
    }

    print(" Baseline Logistic Regression Results:")
    for metric_label, value in scores.items():
        print(f"   {metric_label}: {value:.4f}")

    # Per-class breakdown
    print("\n Classification Report:")
    print(classification_report(y_test, predictions))

    # Counts of true vs. predicted classes
    print("\n Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    return baseline, scores['F1-Score']

# Run the baseline only when data preparation produced a training set
if X_train is None:
    print(" Cannot train model - data preparation failed")
else:
    lr_model, baseline_f1 = train_baseline_model(X_train, X_test, y_train, y_test)


## Step 4: Train LightGBM Model

Train the main LightGBM model on the same training data and evaluate its performance.


In [None]:
# Train LightGBM model
def train_lightgbm_model(X_train, X_test, y_train, y_test):
    """
    Train a multiclass LightGBM model and report its test-set metrics.

    Class labels are mapped to contiguous indices 0..k-1 before training and
    predictions are mapped back to the original labels before scoring, so the
    metrics are correct for any label encoding. Early stopping is driven by a
    validation split carved out of the training data; the test set is used
    only for the final evaluation.

    Args:
        X_train: training features.
        X_test: test features, used for evaluation only.
        y_train: training labels.
        y_test: test labels, used for evaluation only.

    Returns:
        (booster, weighted F1, weighted precision, weighted recall), all
        measured on the test set. The booster predicts one probability per
        class, with columns ordered by the sorted unique labels of y_train.
    """
    print(" Training LightGBM model...")

    # Sorted unique labels, each sample's index into them, and class sizes
    classes, y_train_idx, class_counts = np.unique(
        np.asarray(y_train), return_inverse=True, return_counts=True
    )

    # LightGBM parameters
    lgb_params = {
        'objective': 'multiclass',
        'num_class': len(classes),
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'random_state': 42
    }

    # Hold out 20% of the training data for early stopping. Choosing the
    # number of boosting rounds on the test set would leak it into model
    # selection and make the reported test metrics optimistic.
    # Stratify only when every class has at least 2 samples.
    stratify_on = y_train_idx if class_counts.min() >= 2 else None
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train_idx, test_size=0.2, random_state=42, stratify=stratify_on
    )

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_fit, label=y_fit)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train the model, stopping once the validation loss stalls for 50 rounds
    lgb_model = lgb.train(
        lgb_params,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(0)]
    )

    # Predict class probabilities, then translate the most likely class index
    # back to the original label so it is comparable with y_test
    class_proba = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    y_pred_lgb = classes[np.argmax(class_proba, axis=1)]

    # Calculate metrics
    f1_lgb = f1_score(y_test, y_pred_lgb, average='weighted')
    precision_lgb = precision_score(y_test, y_pred_lgb, average='weighted')
    recall_lgb = recall_score(y_test, y_pred_lgb, average='weighted')

    print(" LightGBM Results:")
    print(f"   F1-Score: {f1_lgb:.4f}")
    print(f"   Precision: {precision_lgb:.4f}")
    print(f"   Recall: {recall_lgb:.4f}")
    print(f"   Best iteration: {lgb_model.best_iteration}")

    # Display classification report
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred_lgb))

    # Display confusion matrix
    print("\n Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred_lgb)
    print(cm)

    return lgb_model, f1_lgb, precision_lgb, recall_lgb

# Run LightGBM training only when data preparation produced a training set
if X_train is None:
    print("Cannot train model - data preparation failed")
else:
    lgb_model, lgb_f1, lgb_precision, lgb_recall = train_lightgbm_model(X_train, X_test, y_train, y_test)


## Step 5: Model Comparison and Feature Importance

Compare the performance of both models and analyze feature importance.


In [None]:
# Model comparison and feature importance
def compare_models(baseline_f1, lgb_f1, lgb_precision, lgb_recall):
    """
    Print a side-by-side summary of the baseline and LightGBM scores.

    Args:
        baseline_f1: F1-score of the Logistic Regression baseline.
        lgb_f1: F1-score of the LightGBM model.
        lgb_precision: precision of the LightGBM model.
        lgb_recall: recall of the LightGBM model.

    Returns:
        None. The summary and a one-line verdict on the F1 difference are
        written to standard output.
    """
    print("Model Comparison Results:")
    print("=" * 50)
    print(f"Baseline Logistic Regression F1-Score: {baseline_f1:.4f}")
    print(f"LightGBM F1-Score:                    {lgb_f1:.4f}")
    print(f"LightGBM Precision:                   {lgb_precision:.4f}")
    print(f"LightGBM Recall:                      {lgb_recall:.4f}")
    print("=" * 50)

    improvement = lgb_f1 - baseline_f1
    if improvement > 0:
        print(f"LightGBM improves F1-score by {improvement:.4f}")
    elif improvement < 0:
        print(f"LightGBM F1-score is {abs(improvement):.4f} lower than baseline")
    else:
        # An exact tie is neither an improvement nor a regression
        print("LightGBM F1-score matches the baseline")

def plot_feature_importance(lgb_model, feature_names, top_n=20):
    """
    Plot the highest gain-based feature importances of a LightGBM model.

    Args:
        lgb_model: trained model exposing feature_importance(importance_type=...).
        feature_names: feature names, in the same order as the importances
            returned by the model.
        top_n: maximum number of features to show in the bar chart.

    Returns:
        DataFrame with 'feature' and 'importance' columns for every feature,
        sorted by importance in descending order.
    """
    # Get feature importance
    importance = lgb_model.feature_importance(importance_type='gain')

    # Create DataFrame, most important feature first
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False)

    # head() returns fewer than top_n rows when the model has fewer features,
    # so label the chart with the number of bars actually drawn
    top_features = feature_importance_df.head(top_n)
    shown = len(top_features)

    # Plot top features
    plt.figure(figsize=(10, 8))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title(f'Top {shown} Feature Importance (LightGBM)')
    plt.xlabel('Importance (Gain)')
    plt.tight_layout()
    plt.show()

    return feature_importance_df

# Compare the two models only when both training cells produced an F1-score
if 'baseline_f1' not in locals() or 'lgb_f1' not in locals():
    print(" Cannot compare models - training failed")
else:
    compare_models(baseline_f1, lgb_f1, lgb_precision, lgb_recall)

    # Feature names come from the training frame, so it must be available
    if X_train is not None:
        feature_importance_df = plot_feature_importance(lgb_model, X_train.columns)
        print("\n🔍 Top 10 Most Important Features:")
        print(feature_importance_df.head(10)[['feature', 'importance']])


## Step 6: Save the Trained LightGBM Model

Save the trained LightGBM model to a file named `academic_risk_model.joblib` using the joblib library.


In [None]:
# Save the trained LightGBM model using joblib
def save_model(lgb_model, filename='academic_risk_model.joblib'):
    """
    Write the trained LightGBM model to disk with joblib and reload it as a check.

    Returns:
        True when both the dump and the reload succeed; False when either
        raises, in which case the error message is printed.
    """
    try:
        joblib.dump(lgb_model, filename)
        print(f" Model successfully saved to {filename}")

        # Reload the file to confirm it can be read back
        joblib.load(filename)
        print(f" Model verification successful - can be loaded from {filename}")
    except Exception as err:
        print(f" Error saving model: {err!s}")
        return False
    else:
        return True

# Persist the model only if the LightGBM training cell produced one
if 'lgb_model' not in locals():
    print(" No trained model found to save")
else:
    save_model(lgb_model, 'academic_risk_model.joblib')
