### Goal
- Predict students' test scores.

### Evaluation
- Submissions are evaluated using the Root Mean Squared Error between the predicted and the observed target.

In [None]:
## Best Score is : 8.53472 (RMSE)

## Importing Libraries

## TODO:
1. The relationship between study hours and exam score doesn't make sense yet; it needs further investigation.
2. Convert all categorical columns into numerical ones using OrdinalEncoder or OneHotEncoder, then check the correlations and build scatter plots.

#### TODO: we may need to create a manual mapping for the ordinal features
- Feature facility_rating - low \ medium \ high can be mapped to 0, 1, 2
- Feature exam_difficulty - easy \ moderate \ hard can be mapped to 0, 1, 2
- Feature sleep_quality - poor \ average \ good can be mapped to 0, 1, 2
- Feature course - diploma \ ba \ bba \ bca \ b.com \ b.sc \ b.tech can be mapped to 0, 1, 2, 3, 4, 5, 6

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, RobustScaler
from tensorflow.keras import layers, losses, models, optimizers, metrics, callbacks

In [None]:
# Apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')

In [None]:
# Ordered levels of every ordinal feature, listed from lowest to highest rank.
_ordered_levels = {
    'facility_rating': ['low', 'medium', 'high'],
    'exam_difficulty': ['easy', 'moderate', 'hard'],
    'sleep_quality': ['poor', 'average', 'good'],
    'course': ['diploma', 'ba', 'bba', 'bca', 'b.com', 'b.sc', 'b.tech'],
}

# feature name -> {level: integer rank}, the rank being the position in the list above.
mapping = {
    feature: {level: rank for rank, level in enumerate(levels)}
    for feature, levels in _ordered_levels.items()
}

In [None]:
# Column dtypes passed to pd.read_csv to keep the frames compact.
dtype = {
    # uint16 tops out at 65,535, so any id above that could not be represented.
    # uint32 keeps the column compact while leaving room for larger files.
    'id': "uint32",
    'age': "uint16",
    "study_hours": "float32",
    "class_attendance": "float32",
    "sleep_hours": "float32",
}

In [None]:
# Both files share the same layout: `id` becomes the index and the numeric
# columns use the compact dtypes declared in `dtype`.
read_options = dict(index_col='id', dtype=dtype)

train = pd.read_csv('./data/student_test/train.zip', **read_options)
test = pd.read_csv('./data/student_test/test.zip', **read_options)

In [None]:
# Ordinal categorical columns; each one has a level -> rank table in `mapping`.
cat_columns = ['facility_rating', 'exam_difficulty', 'sleep_quality', 'course']

In [None]:
# Add a numeric `<column>_mapped` twin for every ordinal column, in both frames.
# Series.map leaves NaN wherever a value has no entry in the mapping table.
for frame in (train, test):
    for col in cat_columns:
        frame[f'{col}_mapped'] = frame[col].map(mapping[col])

In [None]:
# Pairwise correlation between every numeric column (target included).
num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
corr = train[num_cols].corr()

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Separate the target from the features.
y = train['exam_score']
X = train.drop(columns=['exam_score'])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Natively numeric features.
true_num_cols = ['age', 'study_hours', 'class_attendance', 'sleep_hours']
# Numeric twins of the ordinal columns (the `<column>_mapped` columns).
mapped_cols = [col + '_mapped' for col in cat_columns]
# Everything the baseline models can consume.
feature_cols = [*true_num_cols, *mapped_cols]

In [None]:
# Scale the baseline features. The scaler is fitted on the training split only
# and then applied unchanged to the hold-out split.
scaler = RobustScaler()
X_train[feature_cols] = scaler.fit_transform(X_train[feature_cols])
X_test[feature_cols] = scaler.transform(X_test[feature_cols])

# Keep the scaled competition test set in its own frame instead of overwriting
# `test`. `test` is later fed to `preprocessor`, which is fitted on unscaled
# data and applies its own scaling; overwriting `test` here would scale those
# columns twice and make the submission features inconsistent with training.
# Writing to a copy also makes this line safe to re-run.
test_scaled = test.copy()
test_scaled[feature_cols] = scaler.transform(test[feature_cols])

In [None]:
# One estimator instance, re-fitted by each of the baseline cells below.
lr_model = LinearRegression()

In [None]:
# Baseline 1: linear regression on the natively numeric features only.
cols = true_num_cols

lr_model.fit(X_train[cols], y_train)
y_pred = lr_model.predict(X_test[cols])

# Hold-out errors, computed once and reused by the metrics below.
residuals = y_test - y_pred
mean_squared = np.mean(residuals ** 2)

MAE = np.round(np.mean(np.abs(residuals)), 2)
MSE = np.round(mean_squared, 2)
RMSE = np.round(np.sqrt(mean_squared), 2)
R2 = np.round(lr_model.score(X_test[cols], y_test), 2)

print(f'Linear Regression MAE: {MAE}, RMSE: {RMSE}, R2: {R2}, MSE: {MSE}')

In [None]:
# Baseline 2: linear regression on the mapped ordinal features only.
cols = mapped_cols

lr_model.fit(X_train[cols], y_train)
y_pred = lr_model.predict(X_test[cols])

# Hold-out errors, computed once and reused by the metrics below.
residuals = y_test - y_pred
mean_squared = np.mean(residuals ** 2)

MAE = np.round(np.mean(np.abs(residuals)), 2)
MSE = np.round(mean_squared, 2)
RMSE = np.round(np.sqrt(mean_squared), 2)
R2 = np.round(lr_model.score(X_test[cols], y_test), 2)

print(f'Linear Regression MAE: {MAE}, RMSE: {RMSE}, R2: {R2}, MSE: {MSE}')

In [None]:
# Baseline 3: linear regression on numeric and mapped ordinal features together.
cols = feature_cols

lr_model.fit(X_train[cols], y_train)
y_pred = lr_model.predict(X_test[cols])

# Hold-out errors, computed once and reused by the metrics below.
residuals = y_test - y_pred
mean_squared = np.mean(residuals ** 2)

MAE = np.round(np.mean(np.abs(residuals)), 2)
MSE = np.round(mean_squared, 2)
RMSE = np.round(np.sqrt(mean_squared), 2)
R2 = np.round(lr_model.score(X_test[cols], y_test), 2)

print(f'Linear Regression MAE: {MAE}, RMSE: {RMSE}, R2: {R2}, MSE: {MSE}')

In [None]:
# Side-by-side view of actual vs predicted scores (first 10 hold-out rows).
# `y_pred` is whatever the most recently executed model cell produced.
actual = y_test.values
pct_difference = np.round((actual - y_pred) / actual * 100, 2)

comparison_df = pd.DataFrame({
    'Actual': actual,
    'Predicted': y_pred,
    '% Difference': pct_difference,
})
print(comparison_df.head(10))

In [None]:
# let's compare results with RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor


# Random forest on the same scaled feature set as the linear baselines.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train[feature_cols], y_train)
y_pred_rf = rf_model.predict(X_test[feature_cols])

# Hold-out errors, computed once and reused by the metrics below.
rf_residuals = y_test - y_pred_rf
rf_mean_squared = np.mean(rf_residuals ** 2)

MAE = np.round(np.mean(np.abs(rf_residuals)), 2)
MSE = np.round(rf_mean_squared, 2)
RMSE = np.round(np.sqrt(rf_mean_squared), 2)
R2 = np.round(rf_model.score(X_test[feature_cols], y_test), 2)

print(f'Random Forest Regression MAE: {MAE}, RMSE: {RMSE}, R2: {R2}, MSE: {MSE}')
# Random Forest Regression MAE: 7.75, RMSE: 9.72, R2: 0.73, MSE: 94.52

In [None]:
# Column groups consumed by the ColumnTransformer below.
# NOTE: this rebinds `cat_columns` (same four names as before, different order).
cat_columns = ['exam_difficulty', 'facility_rating', 'sleep_quality', 'course']
one_hot_columns = ['gender', 'internet_access', 'study_method']
# Every numeric column of X. Because X is derived from `train` after the
# `<column>_mapped` columns were added, those mapped columns are included too.
numerical_columns = list(X.select_dtypes(include=[np.number]).columns)

In [None]:
# Column-wise preprocessing:
#   num    - standardise every numeric column
#   cat    - integer-encode the ordinal columns
#   onehot - one-hot encode the nominal columns
#
# OrdinalEncoder sorts categories alphabetically unless told otherwise, which
# would encode e.g. easy < hard < moderate. The level order is therefore taken
# from `mapping` (levels sorted by their rank) so the codes follow the real order.
ordinal_levels = [sorted(mapping[col], key=mapping[col].get) for col in cat_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(categories=ordinal_levels), cat_columns),
        ('onehot', OneHotEncoder(), one_hot_columns),
    ],
    # OneHotEncoder emits a sparse matrix by default; always return a dense array
    # because the result is wrapped in a pandas DataFrame further down.
    sparse_threshold=0,
)

In [None]:
# Re-create the split from the untouched X: the earlier X_train / X_test had
# their feature columns overwritten with scaled values. Same seed and test
# size as the first split.
fresh_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = fresh_split

In [None]:
# Fit the preprocessor on the training split only ...
X_train_processed = preprocessor.fit_transform(X_train)
# ... and apply the already-fitted transformers to the hold-out split.
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Build DataFrames from the preprocessed matrices for easier inspection.
def create_preprocessed_df(X_processed, preprocessor):
    """Wrap the preprocessor output in a DataFrame with readable column names.

    Parameters
    ----------
    X_processed : array-like or sparse matrix, shape (n_samples, n_features)
        Output of ``preprocessor.transform`` / ``preprocessor.fit_transform``.
        Anything exposing ``toarray()`` (e.g. a scipy sparse matrix) is
        converted to a dense array first.
    preprocessor : fitted ColumnTransformer
        Must expose the ``'num'``, ``'cat'`` and ``'onehot'`` steps through
        ``named_transformers_``.

    Returns
    -------
    pandas.DataFrame
        One column per output feature, ordered num -> cat -> onehot, named via
        each step's ``get_feature_names_out``.

    Notes
    -----
    The input column lists are read from the notebook-level variables
    ``numerical_columns``, ``cat_columns`` and ``one_hot_columns``.
    """
    fitted_steps = preprocessor.named_transformers_
    all_features = np.concatenate([
        fitted_steps['num'].get_feature_names_out(numerical_columns),
        fitted_steps['cat'].get_feature_names_out(cat_columns),
        fitted_steps['onehot'].get_feature_names_out(one_hot_columns),
    ])

    # A ColumnTransformer may return a sparse matrix (OneHotEncoder is sparse by
    # default); pd.DataFrame needs a dense 2-D array to pair with the names.
    if hasattr(X_processed, 'toarray'):
        X_processed = X_processed.toarray()

    return pd.DataFrame(X_processed, columns=all_features)

# Named-column views of the two preprocessed splits.
X_train_df, X_test_df = (
    create_preprocessed_df(matrix, preprocessor)
    for matrix in (X_train_processed, X_test_processed)
)

# Same treatment for the competition test set, using the already-fitted preprocessor.
test_processed = preprocessor.transform(test)
test_df = create_preprocessed_df(test_processed, preprocessor)

In [None]:
# Let's use RandomForestRegressor as a baseline model to detect feature importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# `fit` returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_df, y_train)

y_pred = model.predict(X_test_df)
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')
# Recorded result - RMSE: 9.109205066244996
# Training time: 8m 52s

In [None]:
# Importances from the forest fitted above, labelled by feature and ranked
# from most to least important.
feature_importances = pd.Series(
    model.feature_importances_,
    index=X_train_df.columns,
).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the ranked feature importances.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=feature_importances.values, y=feature_importances.index, ax=ax)
ax.set_title('Feature Importances from RandomForestRegressor')
fig.tight_layout()
plt.show()

In [None]:
# Pearson (linear) correlation of each numeric feature with the target,
# measured on the training split.
numerical_features = X.select_dtypes(include=[np.number]).columns
correlations = {
    col: np.corrcoef(X_train[col], y_train)[0, 1]
    for col in numerical_features
}
correlation_series = pd.Series(correlations).sort_values(ascending=False)

# Bar chart, strongest positive correlation first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=correlation_series.values, y=correlation_series.index, ax=ax)
ax.set_title('Correlation of Numerical Features with Target')
fig.tight_layout()
plt.show()

In [None]:
# Spearman (rank) correlation also picks up monotonic, non-linear relationships.
spearman_correlations = {
    col: X_train[col].corr(y_train, method='spearman')
    for col in numerical_features
}
spearman_correlation_series = pd.Series(spearman_correlations).sort_values(ascending=False)

# Bar chart, strongest positive correlation first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=spearman_correlation_series.values, y=spearman_correlation_series.index, ax=ax)
ax.set_title('Spearman Correlation of Numerical Features with Target')
fig.tight_layout()
plt.show()

In [None]:
# Experiment 1: keep only the 10 features ranked highest by the random-forest importances.
top_features = list(feature_importances.index[:10])
X_train_top, X_test_top = X_train_df[top_features], X_test_df[top_features]

# Same forest configuration as the baseline, trained on the reduced feature set.
model_top = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_top, y_train)

# Hold-out evaluation.
y_pred_top = model_top.predict(X_test_top)
rmse_top = root_mean_squared_error(y_test, y_pred_top)
print(f'RMSE with Top Features: {rmse_top}')
# Recorded result - RMSE with Top Features: 9.167744398100497
# Training time: 6m 49s

In [None]:
# Experiment 2: tune a GradientBoostingRegressor with Optuna.
def objective(trial):
    """Optuna objective: hold-out RMSE of one sampled GradientBoostingRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled hyperparameters.

    Returns
    -------
    float
        RMSE on (X_test_df, y_test) after fitting on (X_train_df, y_train);
        lower is better.
    """
    candidate = GradientBoostingRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        random_state=42,
    )
    candidate.fit(X_train_df, y_train)
    return root_mean_squared_error(y_test, candidate.predict(X_test_df))

In [None]:
# Minimise the hold-out RMSE over 50 trials; n_jobs=-1 lets Optuna run trials in parallel.
study = optuna.create_study(direction='minimize')
study.optimize(
    objective,
    n_trials=50,
    n_jobs=-1,
    show_progress_bar=True,
)

# Sample log line from a previous run:
# [I 2026-01-08 21:01:48,315] Trial 3 finished with value: 8.72970744572712 and parameters: {'n_estimators': 185, 'learning_rate': 0.20036347173815644, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 10}. Best is trial 3 with value: 8.72970744572712.

In [None]:
# Number of input features after preprocessing.
shape = X_train_df.shape[1]

# (units, activation) of each hidden Dense layer. Every Dense layer, the output
# layer included, is preceded by BatchNormalization + Dropout(0.2).
# The 32-unit layer has no activation (linear).
hidden_layers = [(256, 'relu'), (128, 'relu'), (32, None)]

model = models.Sequential()
model.add(layers.InputLayer(shape=[shape]))
for units, activation in hidden_layers:
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(units, activation=activation))

# Output head: a single softplus unit, so predictions are always positive.
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='softplus'))

In [None]:
# Metrics reported for every epoch, in addition to the loss.
tracked_metrics = [
    metrics.RootMeanSquaredError(),
    metrics.R2Score(),
    metrics.MeanSquaredError(),
    metrics.MeanAbsoluteError(),
]

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.01, epsilon=0.00001),
    loss=losses.MeanSquaredError(),
    metrics=tracked_metrics,
)

# Stop once the validation RMSE has not improved for 5 epochs and roll back
# to the best weights seen so far.
early_stopping_callback = callbacks.EarlyStopping(
    monitor='val_root_mean_squared_error',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    X_train_df,
    y_train,
    validation_data=(X_test_df, y_test),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping_callback],
    verbose=2,
)

# Evaluate the model

In [None]:
# Hold-out predictions of the network compiled and fitted above.
y_pred = model.predict(X_test_df)
# Same metric as the earlier experiments, so the numbers are directly comparable.
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE from Neural Network: {rmse}') # RMSE from Neural Network: 8.5123456789

# Prepare submission

In [None]:
# Predict the competition test set with the current `model` (the network above),
# using the preprocessed test features in `test_df`.
exam_score = model.predict(test_df)

In [None]:
# Display the raw predictions as a quick sanity check.
exam_score

In [None]:
# One row per test id; the prediction array is flattened to 1-D before being
# used as a column.
predicted_scores = exam_score.flatten()
submission = pd.DataFrame({'id': test.index, 'exam_score': predicted_scores})

In [None]:
# Write the submission file; index=False keeps the DataFrame's row index out of
# the CSV (the ids are already stored in the 'id' column).
# NOTE(review): presumably ./submissions must already exist - confirm before running.
submission.to_csv('./submissions/student_test_score_prediction_nn.csv', index=False)
