# Benchmarking with TF-IDF

In [2]:
import numpy as np
import wandb
from datasets import DatasetDict, Dataset, load_dataset
import huggingface_hub
import os
import pandas as pd

# --- Remote identifiers ------------------------------------------------------
# Hugging Face namespace and dataset name; together they form the repo id
# that is passed to `load_dataset`.
huggingface_username = 'HSLU-AICOMP-LearningAgencyLab'
competition = 'learning-agency-lab-automated-essay-scoring-2_V2'

# Weights & Biases project / entity names.
wandb_project = 'HSLU-AICOMP-LearningAgencyLab'
wandb_entity = 'bunduli'

# --- Authentication ----------------------------------------------------------
# Both tokens are read from the environment so that no secret is stored in
# the notebook itself.
print("Logging in to Hugging Face Hub and W&B...")
huggingface_hub.login(token=os.environ.get('HUGGINGFACE_TOKEN'))
print("Login successful.")

wandb.login(key=os.environ.get('WANDB_API_TOKEN'))

Logging in to Hugging Face Hub and W&B...
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: write).
Your token has been saved to /Users/bundeli/.cache/huggingface/token
Login successful
Login successful.


[34m[1mwandb[0m: Currently logged in as: [33mbunduli[0m ([33mhslu_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/bundeli/.netrc


True

## Dataset preparation
### Load the dataset

In [None]:
# Fetch every split of the competition dataset from the Hugging Face Hub.
print("Loading the entire dataset from Hugging Face...")
dataset_repo_id = "/".join((huggingface_username, competition))
dataset = load_dataset(dataset_repo_id)
print("Dataset loaded successfully.")

# Show the split / feature overview, then the first training example.
print("Inspecting the dataset...")
print(dataset)
first_train_example = dataset['train'][0]
print(first_train_example)

Loading the entire dataset from Hugging Face...
Dataset loaded successfully.
Inspecting the dataset...
DatasetDict({
    train: Dataset({
        features: ['essay_id', 'full_text', 'score', 'unique_mistakes', 'repeated_mistakes_count', 'max_repeated_mistake', 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade', 'sentence_count', 'average_sentence_length', 'pos_noun_count', 'pos_verb_count', 'pos_adj_count', 'pos_adv_count', 'grammar_error_count', 'syntactic_complexity', 'spelling_mistake_count', 'error_density', 'tfidf_keywords_vector', 'lda_topic_vector', 'keyword_coverage', 'pronoun_usage', 'unique_word_proportion', 'long_word_proportion', 'imagery_word_proportion', 'positive_sentiment_score', 'negative_sentiment_score', 'visual_word_proportion', 'unique_visual_word_proportion', 'average_imagery_score', 'discourse_marker_count', 'neural_coherence_score', 'longformer_sentence_embedding', 'longformer_coherence_score', 'type_token_ratio', 'lexical_diversity', 'vocabulary_maturi

### Create different TF-IDF Variations

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Raw essay texts for each split.
train_corpus = list(dataset['train']['full_text'])
eval_corpus = list(dataset['eval']['full_text'])
test_corpus = list(dataset['test']['full_text'])

# Each entry is (TF-IDF vocabulary size, number of SVD components).
# `None` as the second element means "use the raw TF-IDF matrix, no reduction".
CONFIGURATIONS = [
    (500, None), (500, 250), # Also tried (20,10), (50,25)
    (300, None), (300, 150), 
    (100, None), (100, 50),
]

# Feature matrices keyed as 'X_<split>_<vocab>' or 'X_<split>_<vocab>_<components>'.
datasets = {}
# Fitted preprocessing objects keyed by the same '<vocab>[_<components>]' suffix,
# kept so the exact same features can be rebuilt for unseen essays later on.
fitted_transformers = {}
# Every vocabulary size occurs in more than one configuration; fit its
# vectorizer once and reuse the resulting sparse matrices.
_tfidf_by_vocab_size = {}

for TOP_N_KEYWORDS, N_COMPONENTS in CONFIGURATIONS:
    if TOP_N_KEYWORDS not in _tfidf_by_vocab_size:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=TOP_N_KEYWORDS)
        # Fit on the training split only; eval and test are merely transformed
        # so no information from them leaks into the vocabulary / IDF weights.
        _tfidf_by_vocab_size[TOP_N_KEYWORDS] = (
            vectorizer,
            vectorizer.fit_transform(train_corpus),
            vectorizer.transform(eval_corpus),
            vectorizer.transform(test_corpus),
        )
    vectorizer, tfidf_train, tfidf_eval, tfidf_test = _tfidf_by_vocab_size[TOP_N_KEYWORDS]

    if N_COMPONENTS is None:
        key_suffix = f'{TOP_N_KEYWORDS}'
        reducer = None
        datasets[f'X_train_{key_suffix}'] = tfidf_train.toarray()
        datasets[f'X_eval_{key_suffix}'] = tfidf_eval.toarray()
        datasets[f'X_test_{key_suffix}'] = tfidf_test.toarray()
    else:
        key_suffix = f'{TOP_N_KEYWORDS}_{N_COMPONENTS}'
        svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=42)
        reducer = svd
        tfidf_train_reduced = svd.fit_transform(tfidf_train)
        tfidf_eval_reduced = svd.transform(tfidf_eval)

        datasets[f'X_train_{key_suffix}'] = tfidf_train_reduced
        datasets[f'X_eval_{key_suffix}'] = tfidf_eval_reduced
        datasets[f'X_test_{key_suffix}'] = svd.transform(tfidf_test)

    fitted_transformers[key_suffix] = {'vectorizer': vectorizer, 'svd': reducer}

del _tfidf_by_vocab_size

# Read only the 'score' column instead of iterating over complete rows, which
# would materialise every other column of each example as well.
y_train = np.array(list(dataset['train']['score']))
y_eval = np.array(list(dataset['eval']['score']))

# Every feature matrix must line up row-for-row with its label vector.
for _key, _matrix in datasets.items():
    if _key.startswith('X_train_'):
        assert _matrix.shape[0] == y_train.shape[0], _key
    elif _key.startswith('X_eval_'):
        assert _matrix.shape[0] == y_eval.shape[0], _key

# Convenience aliases used by the cells below.
X_train_500 = datasets['X_train_500']
X_eval_500 = datasets['X_eval_500']
X_train_500_250 = datasets['X_train_500_250']
X_eval_500_250 = datasets['X_eval_500_250']
X_train_100 = datasets['X_train_100']
X_eval_100 = datasets['X_eval_100']
X_train_100_50 = datasets['X_train_100_50']
X_eval_100_50 = datasets['X_eval_100_50']
X_train_300 = datasets['X_train_300']
X_eval_300 = datasets['X_eval_300']
X_train_300_150 = datasets['X_train_300_150']
X_eval_300_150 = datasets['X_eval_300_150']


In [None]:
# Sanity check: report the dimensions of every feature matrix / label vector.
print("Shapes of the datasets:")
print(X_train_500.shape, y_train.shape, X_eval_500.shape, y_eval.shape)
remaining_feature_pairs = (
    (X_train_500_250, X_eval_500_250),
    (X_train_100, X_eval_100),
    (X_train_100_50, X_eval_100_50),
    (X_train_300, X_eval_300),
    (X_train_300_150, X_eval_300_150),
)
for train_features, eval_features in remaining_feature_pairs:
    print(train_features.shape, eval_features.shape)

# Peek at the first training example of two of the unreduced TF-IDF matrices.
print("First row of the dataset:")
for train_features in (X_train_500, X_train_100):
    print(train_features[0])

Shapes of the datasets:
(13845, 500) (13845,) (3462, 500) (3462,)
(13845, 250) (3462, 250)
(13845, 100) (3462, 100)
(13845, 50) (3462, 50)
(13845, 300) (3462, 300)
(13845, 150) (3462, 150)
First row of the dataset:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.10046221 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.0671438
 0.         0.         0.         0.         0.         0.0490275
 0.         0.         0.         0.05641442 0.04394163 0.
 0.19901684 0.         0.12089174 0.         0.         0.
 0.         0.05922208 0.         0.         0.         0.
 0. 

## KNN Model Training

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import cohen_kappa_score

# Search space for the KNN sweep: neighbourhood size, vote weighting and
# which TF-IDF feature matrix (key into `datasets`) to train on. The sweep
# maximises the quadratic weighted kappa logged as 'qwk'.
sweep_config_knn = {
    'method': 'random',
    'metric': {'name': 'qwk', 'goal': 'maximize'},
    'parameters': {
        'n_neighbors': {
            'values': [3, 9, 15, 21, 27]
        },
        'weights': {
            'values': ['uniform', 'distance']
        },
        'dataset': {
            'values': [
                'X_train_500', 'X_train_500_250',
                'X_train_100', 'X_train_100_50',
                'X_train_300', 'X_train_300_150'
            ]
        }
    }
}

# Use the project name from the setup cell rather than repeating the literal,
# so the sweep and its agent cannot drift apart.
sweep_id_knn = wandb.sweep(sweep_config_knn, project=wandb_project)


def train_knn():
    """Train and evaluate one KNN configuration handed out by the sweep agent.

    Reads `n_neighbors`, `weights` and `dataset` from the run config, fits a
    `KNeighborsClassifier` on the selected training matrix with `y_train`,
    predicts the matching eval matrix and logs the quadratic weighted kappa
    against `y_eval` under the key 'qwk'.
    """
    # The context manager closes the run even if fitting or scoring raises,
    # so a failed trial cannot leave a run open for the next one.
    with wandb.init() as run:
        config = run.config

        # Select the appropriate dataset based on the configuration; the eval
        # matrix lives under the same key with 'train' swapped for 'eval'.
        X_train = datasets[config.dataset]
        X_eval = datasets[config.dataset.replace('train', 'eval')]

        run.name = f"KNN_n={config.n_neighbors}_weights={config.weights}_dataset={config.dataset}"

        model = KNeighborsClassifier(
            n_neighbors=config.n_neighbors,
            weights=config.weights
        )
        model.fit(X_train, y_train)

        predictions = model.predict(X_eval)
        qwk = cohen_kappa_score(y_eval, predictions, weights='quadratic')
        run.log({'qwk': qwk})


# NOTE(review): the search space has 5 * 2 * 6 = 60 combinations and the agent
# runs 50 random trials; consider 'grid' if exhaustive coverage is wanted.
wandb.agent(sweep_id_knn, train_knn, project=wandb_project, count=50)
wandb.finish()
wandb.teardown()

### Retrain the best model
Best configuration found by the sweep: KNN_n=21_weights=uniform_dataset=X_train_500_250

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import cohen_kappa_score

# Hyperparameters and feature set of the configuration chosen as best.
best_model_config = dict(
    n_neighbors=21,
    weights='uniform',
    dataset='X_train_500_250',
)

# The eval matrix is stored under the same key with 'train' -> 'eval'.
train_key = best_model_config['dataset']
eval_key = train_key.replace('train', 'eval')
X_train, X_eval = datasets[train_key], datasets[eval_key]

# `fit` returns the estimator itself, so construction and fitting can be chained.
best_model = KNeighborsClassifier(
    n_neighbors=best_model_config['n_neighbors'],
    weights=best_model_config['weights'],
).fit(X_train, y_train)

# Score the refitted model on the eval split with quadratic weighted kappa.
predictions = best_model.predict(X_eval)
qwk = cohen_kappa_score(y_eval, predictions, weights='quadratic')
print(f'Best model QWK: {qwk}')

Best model QWK: 0.4101747701838122


### Upload the best model to Kaggle

In [None]:
import kagglehub
import shutil
import joblib

our_model_name = "automated-essay-scoring-knn"
VARIATION_SLUG = 'default'
LOCAL_MODEL_DIR = f"../src/models/{our_model_name}"
best_model_path = f"{LOCAL_MODEL_DIR}/model.joblib"

# Start from an empty directory so that leftovers from an earlier export are
# not uploaded together with the new model file.
if os.path.exists(LOCAL_MODEL_DIR):
    if os.path.isfile(LOCAL_MODEL_DIR):
        os.remove(LOCAL_MODEL_DIR)
    else:
        shutil.rmtree(LOCAL_MODEL_DIR)
os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)

# Save the best model using joblib
# NOTE(review): only the fitted classifier is persisted here. It was trained
# on the feature matrix named in best_model_config['dataset'], so whoever
# loads model.joblib must rebuild those features for new essays themselves —
# confirm the inference side does this.
joblib.dump(best_model, best_model_path)

# The directory is uploaded as-is. The zip archive that used to be created in
# the working directory was never uploaded or read, so it is no longer built.
kagglehub.model_upload(
    handle=f"jannikbundeli/{our_model_name}/scikitLearn/{VARIATION_SLUG}",
    local_model_dir=LOCAL_MODEL_DIR,
    version_notes=f"QWK: {qwk}"
)

Uploading Model https://www.kaggle.com/models/jannikbundeli/automated-essay-scoring-knn/scikitLearn/default ...
Model 'automated-essay-scoring-knn' does not exist or access is forbidden for user 'jannikbundeli'. Creating or handling Model...
Model 'automated-essay-scoring-knn' Created.
Starting upload for file ../src/models/automated-essay-scoring-knn/model.joblib


Uploading: 100%|██████████| 27.8M/27.8M [00:02<00:00, 9.90MB/s]

Upload successful: ../src/models/automated-essay-scoring-knn/model.joblib (27MB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/jannikbundeli/automated-essay-scoring-knn/scikitLearn/default
