<a href="https://colab.research.google.com/github/Junie254/Student-Enrollment-prediction/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
import pickle

# Step 1: Generate synthetic dataset (simulate student data)
# Seed NumPy's global random state so every run reproduces the same rows.
np.random.seed(42)

# Number of simulated students. Defined once so that every column below is
# guaranteed to have the same length when the sample size is changed.
N_SAMPLES = 1000

# NOTE: every column, including the 'Enrolled' target, is drawn independently
# of the others, so the features carry no information about the target and any
# model trained on this data should score close to chance.
# The columns are drawn in the order listed; reordering them changes the values
# obtained from the seeded generator.
data = pd.DataFrame({
    'GPA': np.random.uniform(2.0, 4.0, N_SAMPLES),  # GPA range between 2.0 and 4.0
    'Attendance_Rate': np.random.uniform(50, 100, N_SAMPLES),  # Attendance rate in percentage
    'Parental_Education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], N_SAMPLES),
    'Socioeconomic_Status': np.random.choice(['Low', 'Middle', 'High'], N_SAMPLES),
    'Test_Scores': np.random.uniform(60, 100, N_SAMPLES),  # Test scores out of 100
    'Enrolled': np.random.choice([0, 1], N_SAMPLES, p=[0.4, 0.6]),  # Binary target: 0 with p=0.4, 1 with p=0.6
})

# Step 2: Preprocess the data
# Fit one LabelEncoder per categorical column and keep each fitted encoder in
# `label_encoders`, keyed by column name.
categorical_columns = ['Parental_Education', 'Socioeconomic_Status']
label_encoders = {
    column: LabelEncoder().fit(data[column]) for column in categorical_columns
}

# Overwrite the string categories in `data` with their integer codes.
for column, encoder in label_encoders.items():
    data[column] = encoder.transform(data[column])

# Separate features and target
y = data['Enrolled']
X = data.drop(columns=['Enrolled'])

# Hold out 30% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Step 3: Create and train the model pipeline
# Candidate hyperparameters. The 'classifier__' prefix routes each setting to
# the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Two-step pipeline: standardise the features, then fit a random forest.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Hyperparameter tuning: exhaustive search over the grid above
# (3 * 4 * 3 * 3 = 108 combinations), scored by ROC-AUC with 5-fold
# cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Step 4: Evaluate the model
# Hard class predictions feed the threshold-based metrics; the second
# predict_proba column is used as the score for ROC-AUC.
y_pred = best_model.predict(X_test)
positive_class_scores = best_model.predict_proba(X_test)[:, 1]

# Metrics computed from the predicted labels, in display order.
label_based_scorers = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
]
evaluation_metrics = {
    label: scorer(y_test, y_pred) for label, scorer in label_based_scorers
}
evaluation_metrics["ROC-AUC"] = roc_auc_score(y_test, positive_class_scores)

# Display evaluation metrics, one per line, rounded to four decimals
print("Evaluation Metrics:")
for label, score in evaluation_metrics.items():
    print(f"{label}: {score:.4f}")

# Step 5: Save the model
# Both artifacts are written with relative paths, i.e. into the current
# working directory.
model_path = "student_enrollment_model.pkl"  # Save locally
data_path = "student_data.csv"

# Serialise the tuned pipeline with pickle.
# NOTE: pickle files should only ever be loaded from a trusted source.
with open(model_path, mode='wb') as model_file:
    pickle.dump(best_model, model_file)

print(f"Model saved at {model_path}")

# Optional: Save the dataset as a CSV file for reference
# (`data` is written as it stands at this point, without the index column).
data.to_csv(data_path, index=False)
print(f"Dataset saved at {data_path}")


Evaluation Metrics:
Accuracy: 0.5733
Precision: 0.6109
Recall: 0.7627
F1 Score: 0.6784
ROC-AUC: 0.4835
Model saved at student_enrollment_model.pkl
Dataset saved at student_data.csv
