In [1]:
# 02_Advanced_Data_Preprocessing.ipynb

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Load the dataset
df = pd.read_csv('../data/StudentsPerformance.csv')

In [None]:
# 1. Feature Engineering: Create a new target variable
df['average score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)

# Check columns in X
print("Columns in X:", df.columns.tolist())

# If you want to keep all features except the target, drop only 'average score'
X = df.drop(['average score'], axis=1)
y = df['average score']

In [6]:
# 2. Advanced Preprocessing with Scikit-learn Pipelines
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

# Create a preprocessor pipeline
preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
# Use ColumnTransformer to apply the preprocessor to only the categorical columns
full_pipeline = ColumnTransformer(
    transformers=[
        ('categorical_preprocessing', preprocessor, categorical_features)
    ],
    remainder='passthrough'
)

In [8]:
# Fit and transform the data
X_processed = full_pipeline.fit_transform(X)

In [None]:
# 3. Save the preprocessed data and the pipeline for later use
import numpy as np

# Get feature names
ohe_feature_names = full_pipeline.named_transformers_['categorical_preprocessing'].named_steps['onehot'].get_feature_names_out(categorical_features)
passthrough_features = [col for col in X.columns if col not in categorical_features]
all_feature_names = np.concatenate([ohe_feature_names, passthrough_features])

# Create DataFrame
processed_df = pd.DataFrame(X_processed, columns=all_feature_names)
processed_df['average_score'] = y

processed_df.to_csv('../data/preprocessed_data.csv', index=False)
joblib.dump(full_pipeline, '../models/preprocessor_pipeline.pkl')

print("Data preprocessing complete. Preprocessed data and pipeline saved.")
print(processed_df.head())

ValueError: Shape of passed values is (1000, 1), indices imply (1000, 17)