In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor


In [21]:
# Read the ranking spreadsheet into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_excel('data_for_hvp_rankings.xlsx')

In [22]:
# Model input is the raw "Message" text; the targets are the 18 "VQ ..." and
# 18 "SQ ..." columns, kept in the same order as they are listed here.
X = dataset["Message"]
y = dataset[[
    'VQ A 6', 'VQ B 9', 'VQ C 10', 'VQ D 11', 'VQ E 13', 'VQ F 5',
    'VQ G 17', 'VQ H 16', 'VQ I 12', 'VQ J 4', 'VQ K 1', 'VQ L 18',
    'VQ M 2', 'VQ N 14', 'VQ O 8', 'VQ P 15', 'VQ Q 3', 'VQ R 7',
    'SQ A 6', 'SQ B 9', 'SQ C 10', 'SQ D 11', 'SQ E 13', 'SQ F 5',
    'SQ G 17', 'SQ H 16', 'SQ I 12', 'SQ J 4', 'SQ K 1', 'SQ L 18',
    'SQ M 2', 'SQ N 14', 'SQ O 8', 'SQ P 15', 'SQ Q 3', 'SQ R 7',
]]

# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Report how many records landed in each side of the split.
print(f"Training: {len(X_train)} records")
print(f"Testing: {len(X_test)} records")

Training: 466 records
Testing: 201 records


In [23]:

def build_and_evaluate_model(X_train, y_train, feature_selection=False, k_features=50000):
    """Build the TF-IDF + XGBoost multi-output pipeline and fit it.

    Despite the name, this function performs no evaluation: it only fits the
    pipeline and returns it, leaving scoring to the caller.

    Parameters
    ----------
    X_train : iterable of raw text documents
        Passed to both TF-IDF vectorizers (word n-grams and character n-grams).
    y_train : array-like of shape (n_samples, n_targets)
        Target matrix; ``MultiOutputRegressor`` fits one regressor per column.
    feature_selection : bool, default=False
        When True, a chi-squared ``SelectKBest`` step is fitted separately for
        every target column, in front of that column's regressor.
    k_features : int, default=50000
        ``k`` forwarded to ``SelectKBest``; only used when
        ``feature_selection`` is True.
        NOTE(review): how ``k`` larger than the TF-IDF vocabulary is handled
        depends on the installed scikit-learn version -- confirm.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with the steps ``"features"`` and ``"reg"``.
    """
    # Text features: word-level uni/bi-grams plus character 3- to 5-grams,
    # concatenated side by side by the FeatureUnion.
    word_tfidf = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.95,
        sublinear_tf=True,
        strip_accents="unicode",
    )
    char_tfidf = TfidfVectorizer(
        analyzer="char",
        ngram_range=(3, 5),
        min_df=3,
    )
    features = FeatureUnion([
        ("word_tfidf", word_tfidf),
        ("char_tfidf", char_tfidf),
    ])

    # Base regressor; MultiOutputRegressor clones it once per target column.
    booster = XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist",
        n_jobs=-1,
    )

    if feature_selection:
        # chi2 label-binarises y and rejects multi-output targets, so the
        # selector cannot sit in front of MultiOutputRegressor where it would
        # receive the whole 2-D target. Nesting it inside the per-target
        # estimator means each selector only ever sees a single column.
        # NOTE(review): chi2 treats the target values as class labels, so this
        # presumably requires discrete (e.g. integer rank) targets -- confirm.
        per_target_estimator = Pipeline([
            ("chi2", SelectKBest(chi2, k=k_features)),
            ("xgb", booster),
        ])
    else:
        per_target_estimator = booster

    pipeline = Pipeline([
        ("features", features),
        ("reg", MultiOutputRegressor(per_target_estimator)),
    ])

    # Fit the vectorizers and every per-target regressor in one pass.
    pipeline.fit(X_train, y_train)

    return pipeline


In [None]:
# Fit the pipeline on the training split with the optional chi-squared
# feature-selection step switched off.
model = build_and_evaluate_model(X_train, y_train, feature_selection=False)


In [None]:
# Predict every target column for the held-out test messages.
y_pred = model.predict(X_test)


In [None]:
# R² of the test-split predictions; the bare expression is shown as the cell output.
r2_score(y_test, y_pred)

0.9839458465576172

In [12]:
from sklearn.metrics import r2_score, mean_absolute_error

# y_pred is produced by model.predict(X_test), so it has to be scored against
# y_test. Scoring it against y_train compares arrays with different numbers of
# rows and does not measure held-out performance.
# NOTE(review): re-run this cell after the change so the saved output reflects
# the test split.
print("R²:", r2_score(y_test, y_pred, multioutput='uniform_average'))
print("MAE:", mean_absolute_error(y_test, y_pred, multioutput='uniform_average'))


R²: 0.9839458465576172
MAE: 0.10822971165180206
