In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
df = pd.read_csv("student_performance.csv")
print("Original Dataset:\n", df)
X = df[['Gender', 'StudyHours', 'Attendance']].copy()
y = df['Passed']
gender_encoder = LabelEncoder()
X['Gender'] = gender_encoder.fit_transform(X['Gender'])
numeric_features = ['StudyHours', 'Attendance']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ]
)
X_numeric_processed = preprocessor.fit_transform(X)
X_final = np.hstack((X[['Gender']].values, X_numeric_processed))
X_final = pd.DataFrame(X_final, columns=['Gender', 'StudyHours', 'Attendance'])
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.25, random_state=42)
print("\nProcessed Training Features:\n", X_train)
print("\nProcessed Training Labels:\n", y_train)


Original Dataset:
       Name  Gender  StudyHours  Attendance Passed
0    Alice  Female        15.0          90    Yes
1      Bob    Male        10.0          80     No
2  Charlie    Male         NaN          70     No
3    David    Male         8.0          60     No
4      Eve  Female        12.0          85    Yes
5    Frank    Male         NaN          75     No
6    Grace  Female        14.0          95    Yes
7    Heidi  Female        11.0          88    Yes

Processed Training Features:
    Gender  StudyHours  Attendance
0     0.0    1.632993    0.888942
7     0.0   -0.326599    0.704226
2     1.0    0.000000   -0.958210
4     0.0    0.163299    0.427154
3     1.0   -1.796292   -1.881786
6     0.0    1.143095    1.350730

Processed Training Labels:
 [1 1 0 1 0 1]
