In [2]:
# ATTENDANCE PREDICTION MODEL
# Predicts current semester attendance using previous semester records

# --- 1. Import required libraries ---

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import random

# --- 2. Generate synthetic dataset ---

num_students = 1200   # size of the synthetic cohort
semesters = 8         # semesters recorded per student

# Build one USN (Unique Student Number) per student:
# <branch><admission year><4-digit serial>

batches = ['AIML', 'CSE', 'ECE', 'MECH']
usn_list = []
for serial in range(num_students):
    # The branch is drawn first, then the admission-year offset.
    branch = random.choice(batches)
    admit_year = 2000 + random.randint(20, 25)  # 2020..2025 inclusive
    # Four-digit zero padding keeps the serial fixed-width for 1200 students.
    usn_list.append(f"{branch}{admit_year}{serial:04d}")

# Generate attendance data and CGPA for each semester

# Baseline distribution per student profile:
# (attendance mean, attendance std, CGPA mean, CGPA std)
profile_params = {
    'average': (85, 7, 8.5, 0.8),
    'intellectual': (90, 5, 9.0, 0.5),          # higher attendance, higher CGPA
    'struggling_good_att': (90, 5, 7.0, 0.8),   # higher attendance, lower CGPA
    'failure': (60, 10, 6.5, 1.0),              # lower attendance, lower CGPA
}
profile_names = list(profile_params)

data = []
for usn in usn_list:
    # Each student is assigned one profile uniformly at random.
    att_mu, att_sigma, cgp_mu, cgp_sigma = profile_params[random.choice(profile_names)]
    base_att = np.random.normal(att_mu, att_sigma)
    base_cgp = np.random.normal(cgp_mu, cgp_sigma)

    # Row layout: USN, then every semester's attendance, then every semester's CGPA.
    row = [usn]
    cgp_track = []
    for sem in range(1, semesters + 1):
        att_noise = np.random.normal(0, 4)    # per-semester attendance noise
        cgp_noise = np.random.normal(0, 0.3)  # per-semester CGPA noise
        # Attendance drifts up by 0.2 per semester and is kept within [50, 100].
        row.append(min(max(base_att + att_noise + (sem * 0.2), 50), 100))
        # CGPA drifts down by 0.05 per semester and is kept within [0, 10].
        cgp_track.append(min(max(base_cgp + cgp_noise - (sem * 0.05), 0.0), 10.0))
    row.extend(cgp_track)
    data.append(row)

# Create DataFrame

# Column order mirrors each data row: USN, attendance per semester, CGPA per semester.
att_columns = [f'Sem{i}_Att' for i in range(1, semesters + 1)]
cgp_columns = [f'Sem{i}_CGP' for i in range(1, semesters + 1)]
columns = ['USN', *att_columns, *cgp_columns]
df = pd.DataFrame(data, columns=columns)

# --- 3. Save dataset (optional) ---

# Write the table without the index column, then show a 10-row preview.
df.to_csv("synthetic_attendance_dataset.csv", index=False)
print("✅ Dataset generated and saved as 'synthetic_attendance_dataset.csv'")
print(df.head(10))

# --- 4. Get user input for prediction semester ---
# Keep prompting until an integer in [2, semesters] is entered; semester 1 is
# excluded because it has no earlier semester to serve as a feature.
predict_sem = None
while predict_sem is None:
    raw_choice = input(f"\n➡️ Enter the semester you want to predict attendance for (2-{semesters}): ")
    try:
        chosen_sem = int(raw_choice)
    except ValueError:
        print("Invalid input. Please enter a number.")
        continue
    if chosen_sem < 2 or chosen_sem > semesters:
        print(f"Please enter a valid semester between 2 and {semesters}.")
        continue
    predict_sem = chosen_sem


print(f"\n➡️ Using semester {predict_sem} for attendance prediction.")


# --- 5. Define features and target based on user input ---

def evaluate_model(name, y_true, y_pred):
    """Print RMSE and R² for a set of predictions.

    Args:
        name: Label shown in the report header.
        y_true: Observed target values.
        y_pred: Predicted values aligned with ``y_true``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"\n📊 {name} Performance")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    print("-" * 40)


def _prompt_bounded_float(prompt, low, high, out_of_range_msg):
    """Prompt until the user enters a number within ``[low, high]``.

    Args:
        prompt: Text passed to ``input``.
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).
        out_of_range_msg: Message printed when the number is outside the range.

    Returns:
        The accepted value as a float.
    """
    while True:
        try:
            value = float(input(prompt))
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue
        if low <= value <= high:
            return value
        print(out_of_range_msg)


# Use previous semesters' attendance and CGPA to predict the chosen semester's attendance
feature_cols = [f'Sem{i}_Att' for i in range(1, predict_sem)] + [f'Sem{i}_CGP' for i in range(1, predict_sem)]
target_col = f'Sem{predict_sem}_Att'

if not feature_cols:
    # Defensive guard: with predict_sem < 2 there is no earlier semester to learn from.
    print(f"Cannot predict Sem{predict_sem}. Please choose a semester from 2 onwards.")
else:
    X = df[feature_cols]
    y = df[target_col]

    # --- 6. Split into training and test sets ---

    # 75/25 split with a fixed seed so the split itself is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    # --- 7. Train Linear Regression Model ---

    lin_model = LinearRegression()
    lin_model.fit(X_train, y_train)
    y_pred_lin = lin_model.predict(X_test)

    # --- 8. Evaluate on the held-out test set ---

    evaluate_model("Linear Regression", y_test, y_pred_lin)

    # --- 9. User Input and Prediction (Linear Regression) ---

    print(f"\n➡️ Enter attendance percentages and CGPA for Semesters 1-{predict_sem-1}:")
    user_input = {}
    for sem in range(1, predict_sem):
        user_input[f'Sem{sem}_Att'] = _prompt_bounded_float(
            f"Sem{sem}_Att: ", 0, 100,
            "Please enter a valid percentage between 0 and 100.",
        )
        user_input[f'Sem{sem}_CGP'] = _prompt_bounded_float(
            f"Sem{sem}_CGP: ", 0, 10,
            "Please enter a valid CGPA between 0 and 10.",
        )

    user_df = pd.DataFrame([user_input])

    # Values were entered Att/CGP interleaved; re-order the columns to match
    # the feature order the model was trained on.
    user_df = user_df[feature_cols]

    # A linear model can extrapolate past the valid range, so bound the
    # reported percentage to [0, 100].
    predicted_sem = np.clip(lin_model.predict(user_df), 0.0, 100.0)

    print(f"\n🔮 Predicted Attendance for Sem{predict_sem} (Linear Regression): {predicted_sem[0]:.2f}%")

✅ Dataset generated and saved as 'synthetic_attendance_dataset.csv'
            USN    Sem1_Att   Sem2_Att   Sem3_Att   Sem4_Att    Sem5_Att  \
0  MECH20220000   87.772812  79.149687  86.485190  82.678487   80.924408   
1   CSE20240001   89.260331  95.299653  96.508317  90.921951   95.241832   
2  AIML20200002   85.385416  90.417829  84.505209  92.295103   89.890328   
3   CSE20220003   50.448616  53.019886  50.481805  51.465124   53.181714   
4  AIML20220004   95.303943  93.612488  88.516656  91.389057   96.533230   
5  MECH20220005   92.852052  86.552208  84.661644  94.315693   98.192867   
6   CSE20250006   89.823017  89.994229  88.173154  89.709597   86.131773   
7  MECH20200007   97.143453  91.615367  94.178515  99.103317   87.783695   
8   ECE20250008  100.000000  98.333161  98.609210  93.874130  100.000000   
9   CSE20240009   86.638074  86.969437  77.451269  82.383634   89.019596   

     Sem6_Att   Sem7_Att    Sem8_Att  Sem1_CGP  Sem2_CGP  Sem3_CGP  Sem4_CGP  \
0   80.460501  