# In [19]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load data
# NOTE: the path is relative to the current working directory, not to this file.
data = pd.read_csv('../Data/your_dataset.csv')

# ---------------------------------
# 🔹 Data Cleaning
# ---------------------------------
# Drop exact duplicate rows, then every row that has a missing value in any
# column (filling the gaps instead would be the alternative).
data = data.drop_duplicates().dropna()

# Remove obviously invalid rows: negative study time, absences or failures.
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
valid_rows = (
    (data['studytime'] >= 0)
    & (data['absences'] >= 0)
    & (data['failures'] >= 0)
)
data = data[valid_rows].copy()

# ---------------------------------
# 🔹 Data Transformation
# ---------------------------------
# Cap extreme absences (optional): anything above the cap is set to the cap.
ABSENCES_CAP = 50
data['absences'] = data['absences'].clip(upper=ABSENCES_CAP)

# ---------------------------------
# 🔹 Feature Engineering
# ---------------------------------
# Average of the two period grades G1 and G2.
data['avg_grade'] = (data['G1'] + data['G2']) / 2

# Binarize studytime (optional): values below 2 are 'Low', everything else 'High'.
data['studytime_level'] = np.where(data['studytime'] < 2, 'Low', 'High')

# ---------------------------------
# 🔹 Encode Categorical Features
# ---------------------------------
# Each column gets its own encoder. Refitting one shared LabelEncoder would
# overwrite the classes learned for the previous column, making it impossible
# to map the encoded target back to its original labels later.

# Encode target variable 'Pass_Fail'
target_encoder = LabelEncoder()
data['Pass_Fail'] = target_encoder.fit_transform(data['Pass_Fail'])

# Encode studytime_level (new feature).
# The name `label_encoder` is kept, and it still ends up fitted on
# 'studytime_level', for any later code that refers to it.
# NOTE(review): LabelEncoder numbers classes in sorted order, so with both
# levels present 'High' -> 0 and 'Low' -> 1 — confirm this is the intended coding.
label_encoder = LabelEncoder()
data['studytime_level'] = label_encoder.fit_transform(data['studytime_level'])

# ---------------------------------
# 🔹 Feature Scaling (Transformation)
# ---------------------------------
# Standardize the numeric columns with StandardScaler.
# NOTE(review): the scaler is fit on the whole dataset; if a train/test split
# happens downstream, consider fitting on the training portion only.
scaler = StandardScaler()
numerical_features = ['studytime', 'failures', 'absences', 'G1', 'G2', 'avg_grade']

data[numerical_features] = scaler.fit_transform(data[numerical_features])

# ---------------------------------
# 🔹 Save Preprocessed Data
# ---------------------------------
# index=False keeps the DataFrame index out of the CSV.
data.to_csv('../Data/preprocessed_data.csv', index=False)

print("✅ Preprocessing completed. Cleaned & transformed data saved as preprocessed_data.csv")


# Output: ✅ Preprocessing completed. Cleaned & transformed data saved as preprocessed_data.csv
