In [1]:
# 02_Advanced_Data_Preprocessing.ipynb

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Read the raw student-performance CSV (path is relative to the notebook's working directory)
raw_data_path = '../data/StudentsPerformance.csv'
df = pd.read_csv(raw_data_path)


In [3]:
# 1. Feature Engineering: build the target as the row-wise mean of the three exam scores
score_columns = ['math score', 'reading score', 'writing score']
df['average score'] = df[score_columns].mean(axis=1)

# Split into target (y) and features (X); X keeps every column except the
# individual scores and the derived average
y = df['average score']
X = df.drop(columns=score_columns + ['average score'])


In [4]:
# 2. Advanced Preprocessing with Scikit-learn Pipelines
# Columns that are routed through the one-hot encoder
categorical_features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# Single-step pipeline that one-hot encodes its input; handle_unknown='ignore'
# makes categories unseen during fit encode as all zeros instead of raising
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = Pipeline(steps=[('onehot', onehot_encoder)])

In [5]:
# Route only the categorical columns through the preprocessor; any column that
# is not listed is forwarded unchanged (remainder='passthrough')
categorical_branch = ('categorical_preprocessing', preprocessor, categorical_features)
full_pipeline = ColumnTransformer(
    transformers=[categorical_branch],
    remainder='passthrough',
)


In [6]:
# Fit the ColumnTransformer on X and transform X in the same call.
# X_processed is the encoded feature matrix that the final DataFrame is built from.
X_processed = full_pipeline.fit_transform(X)


In [8]:
# 3. Create a final DataFrame and save it
# ColumnTransformer returns a SciPy sparse matrix only while the density of the
# stacked output is below its `sparse_threshold` (0.3 by default); otherwise it
# returns a dense ndarray, which has no .toarray(). Densify only when needed so
# this cell works in both cases.
X_dense = X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed

# Generic positional names (feature_0, feature_1, ...) are kept so the schema of
# the saved CSV stays unchanged.
# NOTE(review): full_pipeline.get_feature_names_out() should be able to supply
# descriptive names on recent scikit-learn versions — confirm before switching,
# since downstream consumers may rely on the feature_i names.
num_features = X_dense.shape[1]
processed_df = pd.DataFrame(X_dense, columns=[f'feature_{i}' for i in range(num_features)])

# Attach the target by position rather than by index label: processed_df has a
# fresh 0..n-1 index, so label alignment would silently insert NaN if y's index
# ever stopped being the default range (e.g. after filtering rows upstream).
processed_df['average_score'] = y.to_numpy()

# Save the preprocessed data and the fitted pipeline for later use
processed_df.to_csv('../data/preprocessed_data.csv', index=False)
joblib.dump(full_pipeline, '../models/preprocessor_pipeline.pkl')

print("Data preprocessing complete. Preprocessed data and pipeline saved.")
print(processed_df.head())

Data preprocessing complete. Preprocessed data and pipeline saved.
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0        1.0        0.0        0.0        1.0        0.0        0.0   
1        1.0        0.0        0.0        0.0        1.0        0.0   
2        1.0        0.0        0.0        1.0        0.0        0.0   
3        0.0        1.0        1.0        0.0        0.0        0.0   
4        0.0        1.0        0.0        0.0        1.0        0.0   

   feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
0        0.0        0.0        1.0        0.0         0.0         0.0   
1        0.0        0.0        0.0        0.0         0.0         1.0   
2        0.0        0.0        0.0        0.0         1.0         0.0   
3        0.0        1.0        0.0        0.0         0.0         0.0   
4        0.0        0.0        0.0        0.0         0.0         1.0   

   feature_12  feature_13  feature_14  feature_15  feature_16  aver