# Student Classification & Financial Assistance Analysis - V4
## Objectives:
1. **Performance Prediction**: Classify students into 'Weak', 'Average', or 'Good' using only non-leaky features (excluding CGPA and Attendance).
2. **Financial Assistance Prediction**: Classify students needing assistance based on weighted demographic factors.
3. **Interpretability**: Analyze feature weightages for both models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score

import warnings
# Silence every warning category for the rest of the session to keep cell output compact.
# NOTE(review): this blanket filter also hides warnings raised by the libraries used below
# (e.g. deprecation or solver-convergence notices) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
print('Libraries imported successfully!')

In [None]:
# Load the raw student records.
#
# pandas >= 2.0 treats the literal string "None" as a missing value by default.
# The Task 2 scoring table assigns a weight to the 'None' category of
# Device_Available, so that category must survive parsing as a real string.
# The missing-value markers are therefore listed explicitly (the usual pandas
# defaults minus "None") instead of relying on the version-dependent default.
MISSING_MARKERS = [
    '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null',
]
df = pd.read_csv('dataset.csv', keep_default_na=False, na_values=MISSING_MARKERS)
print(f'Dataset Shape: {df.shape}')
# Drop the Student_ID column when the file has one (presumably a row identifier,
# not a predictive feature); errors='ignore' makes this a no-op otherwise.
df = df.drop(columns=['Student_ID'], errors='ignore')
df.head()

### Task 1: Student Performance Classification
We remove `CGPA` and `Attendance_Percentage` because they are essentially proxies for the target label (`Performance_Class`), leading to unrealistic 1.0 accuracy.

In [None]:
# Define features and target.
# CGPA and Attendance_Percentage are removed along with the label itself because
# they are treated as leaky proxies for Performance_Class.
X1 = df.drop(columns=['Performance_Class', 'CGPA', 'Attendance_Percentage'])

# Keep the fitted encoder: the model works on integer codes, and the encoder is
# the only record of which code corresponds to which class name.
label_encoder1 = LabelEncoder()
y1 = label_encoder1.fit_transform(df['Performance_Class'])
class_names1 = [str(name) for name in label_encoder1.classes_]

numeric_features1 = X1.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features1 = X1.select_dtypes(include=['object']).columns.tolist()

# Numeric columns: median imputation then standardisation.
# Categorical columns: most-frequent imputation then one-hot encoding; categories
# unseen during fit are encoded as all zeros (handle_unknown='ignore').
preprocessor1 = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numeric_features1),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features1)
])

# Stratify on the label so the train and test splits share the same class proportions.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

# Preprocessing lives inside the pipeline, so it is fitted on the training split only.
clf1 = Pipeline([('preprocessor', preprocessor1), ('classifier', RandomForestClassifier(random_state=42))])
clf1.fit(X_train1, y_train1)

pred1 = clf1.predict(X_test1)
print('Task 1 Performance (Robust Selection):')
# Report per-class metrics under the original class names instead of the bare
# integer codes; `labels` pins the code order so it always matches `target_names`.
print(classification_report(
    y_test1,
    pred1,
    labels=list(range(len(class_names1))),
    target_names=class_names1,
))
print(f'Accuracy: {accuracy_score(y_test1, pred1):.4f}')

### Task 2: Financial Assistance Need Prediction
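We synthesize the target variable `Assistance_Needed` from a weighted score over five demographic factors: family income, internet access, device availability, electricity availability, and part-time employment. A student is labeled as needing assistance when the score is 4 or higher. Because this label is a deterministic function of columns that are also used as model inputs, high accuracy in this task shows that the model recovers the scoring rule; it is not evidence of generalization to an independently observed ground truth.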

In [None]:
# Scoring table used to synthesize the assistance label.
# Outer key: dataframe column. Inner mapping: category value -> points added to
# the need score. Any category not listed contributes 0 points.
# NOTE(review): pandas >= 2.0 parses the literal "None" as NaN by default in
# read_csv, in which case the 'None' device weight would never match — confirm
# how the CSV is loaded.
weights = {
    'Family_Income_PKR': {
        'Low (<30k)': 3,
        'Lower-Middle (30k-60k)': 2,
        'Middle (60k-120k)': 1,
    },
    'Internet_Access': {'No': 1},
    'Device_Available': {'None': 2, 'Mobile': 1},
    'Electricity_Availability': {'Frequent Outages': 1},
    'Part_Time_Job': {'Yes': 1},
}


def calculate_need(row):
    """Return the total need score for one student record.

    ``row`` is any mapping-like object (e.g. a DataFrame row) that can be
    indexed by each column name in ``weights``. For every scored column the
    points of the row's category are looked up; unlisted categories add 0.
    """
    return sum(
        points_by_category.get(row[column], 0)
        for column, points_by_category in weights.items()
    )

# Score each student row by row, then binarise the score:
# 1 when the score reaches the cut-off of 4, otherwise 0.
need_cutoff = 4
df['Need_Score'] = df.apply(calculate_need, axis=1)
df['Assistance_Needed'] = df['Need_Score'].map(lambda score: int(score >= need_cutoff))

# Show how many students fall into each label.
assistance_counts = df['Assistance_Needed'].value_counts()
print(f"Assistance distribution:\n{assistance_counts}")

In [None]:
# Model Task 2
# Inputs are every column except the synthesized label, the score it was derived
# from, and the two academic-outcome columns listed below.
excluded_columns2 = ['Assistance_Needed', 'Need_Score', 'Performance_Class', 'CGPA']
X2 = df.drop(columns=excluded_columns2)
y2 = df['Assistance_Needed']

numeric_features2 = list(X2.select_dtypes(include=['int64', 'float64']).columns)
categorical_features2 = list(X2.select_dtypes(include=['object']).columns)

# Numeric branch: fill gaps with the median, then standardise.
numeric_pipeline2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: fill gaps with the mode, then one-hot encode; categories
# not seen during fit are encoded as all zeros.
categorical_pipeline2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor2 = ColumnTransformer(transformers=[
    ('num', numeric_pipeline2, numeric_features2),
    ('cat', categorical_pipeline2, categorical_features2),
])

# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
    stratify=y2,
)

clf2 = Pipeline(steps=[
    ('preprocessor', preprocessor2),
    ('classifier', LogisticRegression()),
])
clf2.fit(X_train2, y_train2)

# Evaluate on the held-out split.
pred2 = clf2.predict(X_test2)
accuracy2 = accuracy_score(y_test2, pred2)
print('Task 2 Results (Logistic Regression):')
print(classification_report(y_test2, pred2))
print(f'Accuracy: {accuracy2:.4f}')

### Feature Weightages in Models
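Below we visualize the 15 features with the largest logistic-regression coefficients, i.e., the features that push most strongly toward predicting a need for assistance. Because the ranking uses the signed coefficient value, features with strongly negative coefficients (which push toward "no assistance needed") are not highlighted.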

In [None]:
# Pull the fitted estimator and the fitted one-hot encoder out of the Task 2 pipeline.
fitted_classifier2 = clf2.named_steps['classifier']
fitted_onehot2 = (
    clf2.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
)

# The first row of coef_ is used; for the 0/1 target there should be exactly one.
coefs = fitted_classifier2.coef_[0]
cat_columns = fitted_onehot2.get_feature_names_out(categorical_features2)
# Name order mirrors the transformer order in preprocessor2: numeric columns
# first, followed by the expanded one-hot columns.
feature_names = [*numeric_features2, *cat_columns]

# Rank by signed coefficient and keep the 15 largest.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Weightage': coefs})
    .sort_values(by='Weightage', ascending=False)
    .head(15)
)

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Weightage', y='Feature', palette='magma')
plt.title('Top Feature Weightages for Financial Assistance Need')
plt.show()