In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from datasets import load_dataset
from tqdm.notebook import tqdm

# Optional: for QWK
!pip install --quiet scikit-learn scipy
from sklearn.metrics import cohen_kappa_score

# Load your cleaned CSV
df = pd.read_csv("../Data/Processed/asap_cleaned.csv")
df.head()


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,word_count,min_score,max_score,score_scaled
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,338,2,12,0.6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,419,2,12,0.7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,279,2,12,0.5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,524,2,12,0.8
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,465,2,12,0.6


In [3]:
# Use 'essay' column as input text
texts = df['essay'].astype(str).values

# Use normalised score as target (assumes already scaled to [0, 1])
labels = df['score_scaled'].values

In [4]:
# First split: Train + Temp (for val/test)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

# Second split: Temp → Validation + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")


Train size: 9651
Validation size: 1206
Test size: 1207


In [5]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)


In [6]:
ridge = Ridge(alpha=1.0)  # You can tune alpha later if desired
ridge.fit(X_train_vec, y_train)

# Predict on validation and test
val_preds_ridge = ridge.predict(X_val_vec)
test_preds_ridge = ridge.predict(X_test_vec)


In [7]:
def qwk(y_true, y_pred, min_rating=0, max_rating=1):
    """
    Quadratic Weighted Kappa. Assumes inputs scaled to [0, 1].
    For scoring purposes, predictions are mapped back to 0-12 scale (ASAP-style).
    """
    y_pred_rounded = np.round(y_pred * 12).astype(int)
    y_true_rounded = np.round(y_true * 12).astype(int)
    return cohen_kappa_score(y_true_rounded, y_pred_rounded, weights="quadratic")

mse_ridge = mean_squared_error(y_test, test_preds_ridge)
qwk_ridge = qwk(y_test, test_preds_ridge)

print(f"Ridge Regression - MSE: {mse_ridge:.4f}, QWK: {qwk_ridge:.4f}")


Ridge Regression - MSE: 0.0237, QWK: 0.6818


In [8]:
svr = SVR(kernel="rbf", C=1.0, epsilon=0.1)
svr.fit(X_train_vec, y_train)

# Predict
val_preds_svr = svr.predict(X_val_vec)
test_preds_svr = svr.predict(X_test_vec)


In [9]:
mse_svr = mean_squared_error(y_test, test_preds_svr)
qwk_svr = qwk(y_test, test_preds_svr)

print(f"Support Vector Regression - MSE: {mse_svr:.4f}, QWK: {qwk_svr:.4f}")


Support Vector Regression - MSE: 0.0227, QWK: 0.6817


In [10]:
rf = RandomForestRegressor(
    n_estimators=100,        # Number of trees
    max_depth=None,          # You can limit this to avoid overfitting
    random_state=42,
    n_jobs=-1                # Use all available cores
)
rf.fit(X_train_vec, y_train)

# Predict
val_preds_rf = rf.predict(X_val_vec)
test_preds_rf = rf.predict(X_test_vec)


In [11]:
mse_rf = mean_squared_error(y_test, test_preds_rf)
qwk_rf = qwk(y_test, test_preds_rf)

print(f"Random Forest Regressor - MSE: {mse_rf:.4f}, QWK: {qwk_rf:.4f}")


Random Forest Regressor - MSE: 0.0280, QWK: 0.5589


In [12]:
results_df = pd.DataFrame({
    "Model": ["Ridge Regression", "Support Vector Regression", "Random Forest Regressor"],
    "MSE": [mse_ridge, mse_svr, mse_rf],
    "QWK": [qwk_ridge, qwk_svr, qwk_rf]
})

# Round for cleaner display
results_df = results_df.round(4)
display(results_df)


Unnamed: 0,Model,MSE,QWK
0,Ridge Regression,0.0237,0.6818
1,Support Vector Regression,0.0227,0.6817
2,Random Forest Regressor,0.028,0.5589
