# What Is a Pipeline?
A `Pipeline` lets you chain multiple processing steps together, for example:

- 🧼 Preprocessing (e.g., OneHotEncoding, scaling)
- 🔮 Model (e.g., DecisionTree, RandomForest, etc.)

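Every step therefore runs in a single flow: fitting or predicting with the pipeline applies the preprocessing first and then the model, which keeps the code cleaner and more organized.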

In [1]:
import pandas as pd
# Read the student performance data from a CSV file in the working directory.
DATA_FILE = "StudentsPerformance.csv"
df = pd.read_csv(DATA_FILE)

# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

In [2]:
# Display pandas' default summary statistics for the loaded frame.
df.describe()

In [3]:
# Derive the mean of the three exam scores, then label each student
# 'pass' when that mean is at least 50 and 'fail' otherwise.
math_score, reading_score, writing_score = (
    df[column] for column in ('math score', 'reading score', 'writing score')
)
df['average_score'] = (math_score + reading_score + writing_score) / 3

# Vectorized comparison; a missing average compares False and is labelled 'fail'.
reached_pass_mark = df['average_score'] >= 50
df['performance'] = reached_pass_mark.map({True: 'pass', False: 'fail'})

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [5]:
# Define features and target.
# NOTE(review): X keeps the three score columns from which 'performance' is
# derived (via their average), so the label is almost directly recoverable
# from the features. That is fine for demonstrating Pipeline mechanics, but
# the resulting accuracy says little about real predictive power.
X = df.drop(columns=['average_score', 'performance'])
y = df['performance']

# Hold out 20% of the rows for testing. Stratifying on y keeps the pass/fail
# ratio the same in both splits, so neither class ends up under-represented
# in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Create preprocessing pipeline: one-hot encode the categorical columns and
# standardize the numeric score columns.
categorical_features = [
    'parental level of education',
    'gender',
    'race/ethnicity',
    'lunch',
    'test preparation course',
]
numeric_features = ['math score', 'reading score', 'writing score']

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as all zeros instead of raising, so predicting on rows with a
        # value absent from the training split does not crash the pipeline.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ]
)

In [7]:
# Full pipeline: preprocessing followed by the classifier.
# No separate scaling step is needed here: the numeric columns are already
# standardized inside `preprocessor`, and a decision tree's splits do not
# depend on feature scale. An extra MinMaxScaler would also fail if the
# ColumnTransformer ever returned a sparse matrix.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

In [8]:
# Fit every pipeline step on the training split; the fitted pipeline is the
# cell's displayed output.
pipeline.fit(X=X_train, y=y_train)

In [9]:
# Predict labels for the held-out rows, then print the per-class
# precision / recall / F1 summary.
y_pred = pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [10]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)