In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Read the raw dataset, keep only fully populated rows, and split the
# target column (Exam_Score) away from the predictor columns
spf = pd.read_csv('StudentPerformanceFactors.csv')
spf_dropped = spf.dropna()  # defaults (axis=0, how='any'): drop rows with any NaN
y = spf_dropped['Exam_Score']
X = spf_dropped.drop(columns=['Exam_Score'])

# Feature groups used to route columns through the preprocessing step

# Numeric columns
numerical_features = [
    'Hours_Studied',
    'Attendance',
    'Sleep_Hours',
    'Previous_Scores',
    'Tutoring_Sessions',
    'Physical_Activity',
]

# All categorical columns, ordered and unordered alike
categorical_features = [
    'Parental_Involvement',
    'Access_to_Resources',
    'Extracurricular_Activities',
    'Motivation_Level',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
]

# Categorical columns with a natural order, mapped to their levels listed
# from lowest to highest. Five columns share the Low/Medium/High scale;
# each gets its own list object so the entries stay independent.
ordinal_features = {
    name: ['Low', 'Medium', 'High']
    for name in (
        'Parental_Involvement',
        'Access_to_Resources',
        'Motivation_Level',
        'Family_Income',
        'Teacher_Quality',
    )
}
ordinal_features.update({
    'Peer_Influence': ['Negative', 'Neutral', 'Positive'],
    'Parental_Education_Level': ['High School', 'College', 'Postgraduate'],
    'Distance_from_Home': ['Near', 'Moderate', 'Far'],
})

# Encode on a copy so the raw feature frame X is left untouched
X_encoded = X.copy()

# One OrdinalEncoder per ordered column, each pinned to that column's explicit
# level order; the encoders are kept so the same mapping can be replayed on
# new rows at prediction time
ordinal_encoders = {
    feature: OrdinalEncoder(categories=[categories])
    for feature, categories in ordinal_features.items()
}
for feature, encoder in ordinal_encoders.items():
    X_encoded[feature] = encoder.fit_transform(X_encoded[[feature]])

# Whatever categorical columns are not ordinal get one-hot encoded
remaining_categoricals = [
    column for column in categorical_features if column not in ordinal_features
]

# Column routing: numeric columns and the already-encoded ordinal columns pass
# through unchanged; nominal columns are one-hot encoded, and categories that
# were not seen during fit are ignored instead of raising
preprocessor = ColumnTransformer(
    [
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), remaining_categoricals),
        ('ord', 'passthrough', list(ordinal_features)),
    ]
)

# A target that is not object-dtype goes to the model as-is; an object-dtype
# (e.g. string) target is label-encoded first
if y.dtype != object:
    y_encoded = y
else:
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

# Fit the column transformer on the ordinal-encoded frame and apply it
X_processed = preprocessor.fit_transform(X_encoded)

# Train the decision tree on the processed matrix (fit returns the estimator)
model = DecisionTreeClassifier().fit(X_processed, y_encoded)

# Two hypothetical students to score, written one column per row as
# (column name, first student's value, second student's value)
sample_data = {
    column: [first_student, second_student]
    for column, first_student, second_student in (
        ('Hours_Studied', 10, 11),
        ('Attendance', 90, 85),
        ('Parental_Involvement', 'Medium', 'High'),
        ('Access_to_Resources', 'High', 'Medium'),
        ('Extracurricular_Activities', 'Yes', 'No'),
        ('Sleep_Hours', 8, 7),
        ('Previous_Scores', 75, 80),
        ('Motivation_Level', 'High', 'Medium'),
        ('Internet_Access', 'Yes', 'Yes'),
        ('Tutoring_Sessions', 3, 1),
        ('Family_Income', 'Medium', 'High'),
        ('Teacher_Quality', 'High', 'High'),
        ('School_Type', 'Public', 'Private'),
        ('Peer_Influence', 'Positive', 'Neutral'),
        ('Physical_Activity', 3, 2),
        ('Learning_Disabilities', 'No', 'No'),
        ('Parental_Education_Level', 'College', 'Postgraduate'),
        ('Distance_from_Home', 'Moderate', 'Near'),
        ('Gender', 'Male', 'Female'),
    )
}

# One row per student, one column per feature
sample = pd.DataFrame(data=sample_data)

# Encode a copy of the sample with the encoders fitted on the training data,
# so each category maps to the same number the model was trained on
sample_encoded = sample.copy()
for feature in ordinal_encoders:
    encoder = ordinal_encoders[feature]
    sample_encoded[feature] = encoder.transform(sample_encoded[[feature]])

# Reuse the already-fitted column transformer (transform only, no refit)
sample_processed = preprocessor.transform(sample_encoded)

# Predict for both sample students
predictions_encoded = model.predict(sample_processed)

# Undo the label encoding only if the target was label-encoded in the first place
predictions = (
    le.inverse_transform(predictions_encoded)
    if y.dtype == object
    else predictions_encoded
)

print(predictions)

[65 66]


1. Data Loading and Preparation

In [None]:
# Read the raw dataset from disk
spf = pd.read_csv('StudentPerformanceFactors.csv')
# Discard every row that has at least one missing value
spf_dropped = spf.dropna(axis=0, how='any')
# Predictors: every column except the target
X = spf_dropped.drop(columns=['Exam_Score'])
# Target: the Exam_Score column
y = spf_dropped['Exam_Score']

Loads student performance data from a CSV file

Drops rows with any missing values

Separates features (X) from target variable (y = Exam_Score)

2. Feature Definition

In [None]:
# `[...]` is a placeholder for the full column lists defined in the first cell
numerical_features = [...] # List of numerical columns
categorical_features = [...] # List of categorical columns (ordinal and nominal)

Defines which columns contain numerical vs categorical data

Numerical features hold numeric quantities (hours, scores, counts)

Categorical features are discrete values (categories, yes/no, levels)

3. Ordinal Feature Encoding

In [None]:
# `{...}` is a placeholder for the column -> ordered-levels mapping from the first cell
ordinal_features = {...} # Dictionary mapping features to their ordered categories
# Fitted encoders are stored by column name so they can be reused on new data
ordinal_encoders = {}
for feature, categories in ordinal_features.items():
    # The explicit category list fixes the integer order (first level -> 0)
    encoder = OrdinalEncoder(categories=[categories])
    # X_encoded is the working copy of X (X.copy()) created in the first cell
    X_encoded[feature] = encoder.fit_transform(X_encoded[[feature]])
    ordinal_encoders[feature] = encoder

Identifies ordinal categorical features (those with inherent order like Low/Medium/High)

Creates and fits an OrdinalEncoder for each, storing the encoders for later use

Transforms the ordinal features into numerical values (e.g., Low=0, Medium=1, High=2)

4. Preprocessing Pipeline

In [None]:
# Nominal columns: the categorical features that have no entry in ordinal_features
remaining_categoricals = [f for f in categorical_features if f not in ordinal_features]
preprocessor = ColumnTransformer([
    # Numeric columns need no encoding
    ('num', 'passthrough', numerical_features),
    # One-hot encode nominal columns; categories not seen during fit are
    # ignored rather than raising an error
    ('cat', OneHotEncoder(handle_unknown='ignore'), remaining_categoricals),
    # Ordinal columns are already numeric at this point; the column selector
    # is given as a real list of names, not a dict_keys view
    ('ord', 'passthrough', list(ordinal_features.keys()))
])

Creates a preprocessing pipeline that:

Passes through numerical features unchanged

Applies OneHotEncoding to nominal categorical features (no inherent order)

Passes through already-encoded ordinal features

5. Target Variable Encoding

In [None]:
# Label-encode the target only when it is object-dtype (e.g. strings)
if y.dtype == object:
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
else:
    # Otherwise use the target unchanged, so y_encoded is always defined
    # before model.fit is called
    y_encoded = y

Encodes the target variable if it's categorical (converts strings to numbers); a numeric target is used unchanged

6. Model Training

In [None]:
# Fit the column transformer on the ordinal-encoded training frame and transform it
X_processed = preprocessor.fit_transform(X_encoded)
# Default hyperparameters; no random_state is passed
# NOTE(review): without a fixed random_state the fitted tree may differ between runs - confirm whether reproducibility matters here
model = DecisionTreeClassifier()
model.fit(X_processed, y_encoded)

Applies all preprocessing steps to the training data

Trains a Decision Tree classifier on the processed data (as a classifier, it treats each distinct target value as its own class)

7. Making Predictions on New Data

In [None]:
# `{...}` is a placeholder for the two-student sample dictionary from the first cell
sample_data = {...} # New student data to predict
sample = pd.DataFrame(sample_data)
# Encode a copy so the raw sample frame keeps its original string values
sample_encoded = sample.copy()
for feature, encoder in ordinal_encoders.items():
    # transform (not fit_transform): reuse the mapping fitted on the training data
    sample_encoded[feature] = encoder.transform(sample_encoded[[feature]])
# Reuse the already-fitted column transformer
sample_processed = preprocessor.transform(sample_encoded)
predictions_encoded = model.predict(sample_processed)

Creates sample data for prediction

Applies the same preprocessing steps used on training data:

Uses the stored ordinal encoders

Applies the same column transformations

Makes predictions using the trained model

8. Output Results

In [None]:
# Map predictions back to the original labels only if the target was label-encoded
if y.dtype == object:
    predictions = le.inverse_transform(predictions_encoded)
else:
    # Target was never encoded, so the model output is already in original units
    predictions = predictions_encoded
print(predictions)

If the target was originally categorical, converts predictions back to the original labels; otherwise the predictions are used as-is

Prints the final predictions

Key Concepts:
Feature Engineering: Properly handling different data types (numerical, ordinal categorical, nominal categorical)

Pipeline: Creating a reproducible preprocessing workflow that works the same way on training and new data

Model Training: Using a Decision Tree classifier to learn patterns in the data

Prediction: Applying the same transformations to new data before making predictions

The code demonstrates a complete ML workflow from raw data to predictions while properly handling all data types and ensuring consistent preprocessing between training and prediction phases.