In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

def generate_synthetic_student_data(num_students, output_filename="student_performance_data.csv"):
    print(f"Generating {num_students} synthetic student records...")

    student_ids = [f"STU{i+1:04d}" for i in range(num_students)]
    genders = random.choices(['Male', 'Female', 'Other'], k=num_students)
    ages = np.random.randint(16, 23, num_students)
    majors = random.choices(['Computer Science', 'Electrical Engineering', 'Mechanical Engineering',
                             'Civil Engineering', 'Mathematics', 'Physics', 'Chemistry',
                             'Biology', 'English', 'History', 'Economics', 'Psychology'], k=num_students)

    math_grades = np.random.normal(loc=75, scale=10, size=num_students).clip(0, 100)
    science_grades = np.random.normal(loc=70, scale=12, size=num_students).clip(0, 100)
    english_grades = np.random.normal(loc=80, scale=8, size=num_students).clip(0, 100)

    attendance_percentages = np.random.uniform(60, 100, num_students)
    study_hours_per_week = np.random.uniform(5, 30, num_students)

    avg_grades = (math_grades + science_grades + english_grades) / 3
    pass_status = ['Pass' if avg >= 60 + random.uniform(-5, 5) else 'Fail' for avg in avg_grades]

    start_date = datetime(2020, 1, 1)
    end_date = datetime(2024, 12, 31)
    time_diff = (end_date - start_date).days
    enrollment_dates = [start_date + timedelta(days=random.randint(0, time_diff)) for _ in range(num_students)]
    enrollment_dates_str = [date.strftime('%Y-%m-%d') for date in enrollment_dates]

    data = {
        'StudentID': student_ids,
        'Gender': genders,
        'Age': ages,
        'Major': majors,
        'MathGrade': math_grades.round(2),
        'ScienceGrade': science_grades.round(2),
        'EnglishGrade': english_grades.round(2),
        'AttendancePercentage': attendance_percentages.round(2),
        'StudyHoursPerWeek': study_hours_per_week.round(1),
        'OverallStatus': pass_status,
        'EnrollmentDate': enrollment_dates_str
    }

    df = pd.DataFrame(data)
    df.to_csv(output_filename, index=False)
    print(f"Synthetic student performance data generated and saved to '{output_filename}'")
    print("-" * 50)


def run_ml_project(data_filename="student_performance_data.csv"):
    print(f"Starting ML project using data from '{data_filename}'...")

    try:
        df = pd.read_csv(data_filename)
        print("Dataset loaded successfully.")
        print("First 5 rows of the dataset:")
        print(df.head())
        print("\nDataset info:")
        df.info()
        print("-" * 50)
    except FileNotFoundError:
        print(f"Error: The file '{data_filename}' was not found. Please ensure it's generated first.")
        return

    print("Starting data preprocessing...")

    df = df.drop(['StudentID', 'EnrollmentDate'], axis=1)

    categorical_features = ['Gender', 'Major']
    numerical_features = ['Age', 'MathGrade', 'ScienceGrade', 'EnglishGrade',
                          'AttendancePercentage', 'StudyHoursPerWeek']

    df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)
    print("\nDataFrame after One-Hot Encoding categorical features:")
    print(df_encoded.head())
    print("-" * 50)

    X = df_encoded.drop('OverallStatus', axis=1)
    y = df_encoded['OverallStatus']

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    print(f"Target variable mapping: {list(label_encoder.classes_)} -> {label_encoder.transform(label_encoder.classes_)}")
    print("-" * 50)

    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
    print(f"Data split into training and testing sets:")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    print("-" * 50)

    print("Training the Random Forest Classifier model...")
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

    model.fit(X_train, y_train)
    print("Model training complete.")
    print("-" * 50)

    print("Evaluating the model performance...")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_))
    print("-" * 50)

    print("Analyzing Feature Importances (which features influenced the prediction most):")
    feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    print(feature_importances)
    print("-" * 50)

    print("Demonstrating prediction on a new, hypothetical student...")

    new_student_data = {
        'Age': 19,
        'MathGrade': 85.0,
        'ScienceGrade': 78.0,
        'EnglishGrade': 92.0,
        'AttendancePercentage': 95.0,
        'StudyHoursPerWeek': 25.0,
        **{col: 0 for col in X.columns if col.startswith(('Gender_', 'Major_'))}
    }
  
    new_student_data['Gender_Male'] = 1
    new_student_data['Major_Computer Science'] = 1 

    new_student_df = pd.DataFrame([new_student_data])
    new_student_df = new_student_df[X.columns]

    new_student_prediction_encoded = model.predict(new_student_df)
    new_student_prediction = label_encoder.inverse_transform(new_student_prediction_encoded)

    print(f"\nHypothetical Student Data:\n{new_student_df.to_string(index=False)}")
    print(f"\nPredicted Performance: {new_student_prediction[0]}")
    print("-" * 50)

if __name__ == "__main__":
    num_students_to_generate = 5000
    data_file = "student_performance_data.csv"

    generate_synthetic_student_data(num_students=num_students_to_generate, output_filename=data_file)

    run_ml_project(data_filename=data_file)



Generating 5000 synthetic student records...
Synthetic student performance data generated and saved to 'student_performance_data.csv'
--------------------------------------------------
Starting ML project using data from 'student_performance_data.csv'...
Dataset loaded successfully.
First 5 rows of the dataset:
  StudentID  Gender  Age                   Major  MathGrade  ScienceGrade  \
0   STU0001  Female   22  Electrical Engineering      71.43         80.24   
1   STU0002  Female   19  Mechanical Engineering      68.73         67.80   
2   STU0003   Other   21              Psychology      59.80         83.16   
3   STU0004    Male   16             Mathematics      80.24         85.02   
4   STU0005    Male   18             Mathematics     100.00         72.61   

   EnglishGrade  AttendancePercentage  StudyHoursPerWeek OverallStatus  \
0         70.62                 92.06               28.3          Pass   
1         79.22                 60.79                9.3          Pass   
2 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
