In [2]:
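# One-time environment setup (uncomment to run; %pip targets the active kernel's environment):
# %pip install -U scikit-learn pandas xgboost openpyxl matplotlib seaborn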

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor


In [2]:
# Read the workbook that holds the "Message" text column and the VQ/SQ target columns.
dataset = pd.read_excel("data_for_hvp_rankings.xlsx")

In [3]:
# Model input: the free-text "Message" column.
X = dataset["Message"]

# Regression targets: 18 "VQ" columns followed by 18 "SQ" columns (36 in total).
vq_columns = [
    'VQ A 6', 'VQ B 9', 'VQ C 10', 'VQ D 11', 'VQ E 13', 'VQ F 5',
    'VQ G 17', 'VQ H 16', 'VQ I 12', 'VQ J 4', 'VQ K 1', 'VQ L 18',
    'VQ M 2', 'VQ N 14', 'VQ O 8', 'VQ P 15', 'VQ Q 3', 'VQ R 7',
]
sq_columns = [
    'SQ A 6', 'SQ B 9', 'SQ C 10', 'SQ D 11', 'SQ E 13', 'SQ F 5',
    'SQ G 17', 'SQ H 16', 'SQ I 12', 'SQ J 4', 'SQ K 1', 'SQ L 18',
    'SQ M 2', 'SQ N 14', 'SQ O 8', 'SQ P 15', 'SQ Q 3', 'SQ R 7',
]
y = dataset[vq_columns + sq_columns]

# Hold out 30% of the records for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Report how many records landed in each partition.
print(f"Training: {len(X_train)} records")
print(f"Testing: {len(X_test)} records")

Training: 466 records
Testing: 201 records


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import FeatureUnion
import joblib
import scipy.sparse as sp

def build_feature_extractor(feature_selection=False, k_features=50000):
    """Create the (unfitted) text featurizer: word TF-IDF unioned with char TF-IDF.

    Parameters
    ----------
    feature_selection, k_features
        Accepted for signature compatibility with the training entry point;
        this function does not read either of them.

    Returns
    -------
    FeatureUnion
        Two branches named ``"word_tfidf"`` and ``"char_tfidf"`` whose outputs
        are concatenated column-wise.
    """
    # Word unigrams + bigrams, ignoring very rare and near-ubiquitous terms.
    word_settings = dict(
        analyzer="word",
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.95,
        strip_accents="unicode",
        sublinear_tf=True,
    )
    # Character 3- to 5-grams, ignoring n-grams seen in fewer than 3 documents.
    char_settings = dict(
        analyzer="char",
        ngram_range=(3, 5),
        min_df=3,
    )

    branches = [
        ("word_tfidf", TfidfVectorizer(**word_settings)),
        ("char_tfidf", TfidfVectorizer(**char_settings)),
    ]
    return FeatureUnion(branches)


In [9]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

def _mean_f_regression(X, y):
    """Score every feature against one or several continuous targets.

    ``f_regression`` accepts only a 1-D target, so it is evaluated once per
    target column and the F-statistics are averaged into a single score per
    feature.

    Defined at module level (not nested in the training function) so that a
    fitted ``SelectKBest`` holding a reference to it can be pickled; whoever
    unpickles that selector needs this function importable under the same name.
    """
    from sklearn.feature_selection import f_regression

    targets = np.asarray(y, dtype=float)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)

    totals = np.zeros(X.shape[1], dtype=float)
    for target_column in targets.T:
        f_stat, _ = f_regression(X, target_column)
        # Undefined statistics (e.g. a constant target) count as "no signal".
        totals += np.nan_to_num(f_stat, nan=0.0)
    return totals / targets.shape[1]


def train_xgboost_pipeline(X_text, y, feature_selection=False, k_features=50000,
                           model_path="tfidf_xgb_pipeline.pkl"):
    """Fit TF-IDF features and a multi-output XGBoost regressor, then save them.

    Parameters
    ----------
    X_text : iterable of str
        Raw training documents.
    y : array-like, one column per target
        Regression targets; one XGBoost model is fitted per column.
    feature_selection : bool, default False
        When True, keep only the ``k_features`` best-scoring TF-IDF columns.
        Scoring uses the mean per-target F-statistic, because ``chi2`` rejects
        multi-output targets.
    k_features : int or "all", default 50000
        Number of features to keep; clamped to the number actually available.
    model_path : str, default "tfidf_xgb_pipeline.pkl"
        Where the fitted components are written with ``joblib.dump``.

    Returns
    -------
    tuple
        ``(feature_extractor, selector, xgb_model)``; ``selector`` is ``None``
        when ``feature_selection`` is False.
    """
    # 1) Build feature extractor
    fe = build_feature_extractor(feature_selection, k_features)

    # 2) Fit TF-IDF + transform to sparse matrix
    print("Fitting TF-IDF...")
    X_tfidf = fe.fit_transform(X_text)

    print("Shape after TFIDF:", X_tfidf.shape)

    # 3) Optional univariate feature selection
    selector = None
    if feature_selection:
        n_available = X_tfidf.shape[1]
        # Asking for more features than exist is an error on some sklearn versions.
        k = k_features if k_features == "all" else min(int(k_features), n_available)
        selector = SelectKBest(_mean_f_regression, k=k)
        print("Applying F-regression feature selection...")
        X_tfidf = selector.fit_transform(X_tfidf, y)
        print("Shape after feature selection:", X_tfidf.shape)

    # 4) Build XGBoost multi-output
    print("Training XGBoost...")

    xgb = MultiOutputRegressor(
        XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.9,
            colsample_bytree=0.9,
            objective="reg:squarederror",
            tree_method="hist",
            n_jobs=-1
        )
    )

    xgb.fit(X_tfidf, y)

    # 5) Save everything under the keys that load_xgb_pipeline reads back
    joblib.dump({
        "feature_extractor": fe,
        "selector": selector,
        "xgb_model": xgb
    }, model_path)

    print(f"Model saved as {model_path}")
    return fe, selector, xgb


In [10]:
def load_xgb_pipeline(path="tfidf_xgb_pipeline.pkl"):
    """Load a saved component bundle and return its three parts.

    The file is read with ``joblib.load`` (pickle-based), so only open files
    from a trusted source.

    Returns
    -------
    tuple
        ``(feature_extractor, selector, xgb_model)`` taken from the stored
        dictionary's keys of the same names.
    """
    bundle = joblib.load(path)
    component_keys = ("feature_extractor", "selector", "xgb_model")
    return tuple(bundle[key] for key in component_keys)


def predict_traits(text_list, fe, selector, model):
    """Run raw text through the fitted feature steps and return predictions.

    Parameters
    ----------
    text_list : str or iterable of str
        Documents to score. A single string is treated as one document.
    fe : object with ``transform``
        Fitted feature extractor.
    selector : object with ``transform``, or None
        Fitted feature selector; skipped when ``None``.
    model : object with ``predict``
        Fitted estimator.

    Returns
    -------
    Whatever ``model.predict`` returns for the transformed input.
    """
    # A bare string is one document, not an iterable of documents.
    if isinstance(text_list, str):
        text_list = [text_list]

    X = fe.transform(text_list)
    # Compare against None explicitly: truthiness of an arbitrary object can
    # depend on __len__/__bool__ and says nothing about whether it was supplied.
    if selector is not None:
        X = selector.transform(X)
    return model.predict(X)


In [None]:
# train_xgboost_pipeline returns (feature_extractor, selector, regressor);
# unpack it so `model` is the fitted regressor rather than the whole tuple.
fe, selector, model = train_xgboost_pipeline(
    X_train,
    y_train
)


Fitting TF-IDF...
Shape after TFIDF: (466, 168776)
Training XGBoost...


In [None]:
# Reload the saved components from disk and score one example sentence.
fe, selector, xgb = load_xgb_pipeline()

sample_texts = ["This is a sample text"]
pred = predict_traits(sample_texts, fe, selector, xgb)
print(pred)


In [None]:
# X_test holds raw text, so it has to pass through the fitted TF-IDF features
# (and the selector, if one was fitted) before the regressor can score it.
y_pred = predict_traits(X_test, fe, selector, xgb)


In [None]:
# Overall R² on the held-out set, averaged uniformly across the target columns.
r2_score(y_true=y_test, y_pred=y_pred, multioutput="uniform_average")

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# y_pred holds predictions for the held-out set, so it must be scored against
# y_test; y_train has a different number of rows.
print("R²:", r2_score(y_test, y_pred, multioutput='uniform_average'))
print("MAE:", mean_absolute_error(y_test, y_pred, multioutput='uniform_average'))


In [None]:
import joblib

# Bundle the already-fitted pieces into one sklearn Pipeline so raw text can be
# passed straight to .predict(); "passthrough" stands in when no selector was fitted.
pipeline = Pipeline([
    ("features", fe),
    ("selector", selector if selector is not None else "passthrough"),
    ("regressor", xgb),
])

joblib.dump(pipeline, "tfidf_xgboost_pipeline.pkl")


In [None]:
# Read the single-object pipeline back from disk (pickle-based: trusted files only).
pipeline_path = "tfidf_xgboost_pipeline.pkl"
pipeline = joblib.load(pipeline_path)


In [None]:
# Keep this one-off prediction under its own name: the evaluation cells below
# expect `y_pred` to still hold the predictions for the whole test set.
sample_pred = pipeline.predict(["this is a new text sample for prediction"])


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# y_test is a DataFrame, which does not support [:, i] indexing, so work on
# plain arrays and derive the trait count from the data.
y_true_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
n_traits = y_true_arr.shape[1]

# One R² value per target column.
r2_scores = [r2_score(y_true_arr[:, i], y_pred_arr[:, i]) for i in range(n_traits)]

plt.figure(figsize=(14,6))
plt.bar(range(n_traits), r2_scores)
plt.xlabel("Trait Index")
plt.ylabel("R² Score")
plt.title("Trait-wise R² Performance")
plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error

# y_test is a DataFrame, which does not support [:, i] indexing, so work on
# plain arrays and derive the trait count from the data.
y_true_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
n_traits = y_true_arr.shape[1]

# One mean-absolute-error value per target column.
mae_scores = [mean_absolute_error(y_true_arr[:, i], y_pred_arr[:, i]) for i in range(n_traits)]

plt.figure(figsize=(14,6))
plt.bar(range(n_traits), mae_scores)
plt.xlabel("Trait Index")
plt.ylabel("MAE")
plt.title("Trait-wise MAE")
plt.show()


In [None]:
trait_idx = 0  # change 0–35

# Pull the chosen column out as plain arrays; y_test is a DataFrame and
# cannot be indexed with [:, trait_idx] directly.
true_vals = np.asarray(y_test)[:, trait_idx]
pred_vals = np.asarray(y_pred)[:, trait_idx]
lo, hi = true_vals.min(), true_vals.max()

plt.figure(figsize=(6,6))
plt.scatter(true_vals, pred_vals, alpha=0.5)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title(f"True vs Predicted (Trait {trait_idx})")
plt.plot([lo, hi], [lo, hi], color='red')  # perfect-fit line
plt.show()


In [None]:
# Residuals for the trait chosen via `trait_idx`. y_test is a DataFrame, so it
# is converted to an array before positional column indexing.
pred_vals = np.asarray(y_pred)[:, trait_idx]
errors = np.asarray(y_test)[:, trait_idx] - pred_vals

plt.figure(figsize=(6,6))
plt.scatter(pred_vals, errors, alpha=0.5)
plt.axhline(0, color='red')  # zero-error reference
plt.xlabel("Predicted Value")
plt.ylabel("Residual (True - Pred)")
plt.title(f"Residual Plot (Trait {trait_idx})")
plt.show()


In [None]:
import seaborn as sns
import numpy as np

# Trait-by-trait correlation matrices: columns are the variables, rows the samples.
corr_true = np.corrcoef(y_test, rowvar=False)
corr_pred = np.corrcoef(y_pred, rowvar=False)

# Side-by-side heatmaps: true correlations on the left, predicted on the right.
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(12, 6))

sns.heatmap(corr_true, cmap='coolwarm', ax=ax_true)
ax_true.set_title("True Trait Correlations")

sns.heatmap(corr_pred, cmap='coolwarm', ax=ax_pred)
ax_pred.set_title("Predicted Trait Correlations")

plt.show()


In [None]:
import pandas as pd

# Per-trait summary table: positional trait index alongside its R² and MAE.
trait_ids = list(range(36))
summary_columns = {
    "Trait": trait_ids,
    "R²": r2_scores,
    "MAE": mae_scores,
}
df_results = pd.DataFrame(summary_columns)

df_results


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Column means as plain arrays. y_test is a DataFrame, so its .mean() would be
# a label-indexed Series on which positional access like [0] is unreliable.
true_mean = np.asarray(y_test, dtype=float).mean(axis=0)
pred_mean = np.asarray(y_pred, dtype=float).mean(axis=0)

# One spoke per trait, then repeat the first point so each outline closes.
angles = np.linspace(0, 2 * np.pi, len(true_mean), endpoint=False).tolist()
angles += angles[:1]

true_plot = np.concatenate((true_mean, true_mean[:1]))
pred_plot = np.concatenate((pred_mean, pred_mean[:1]))

plt.figure(figsize=(8,8))
ax = plt.subplot(111, polar=True)

ax.plot(angles, true_plot, linewidth=2, label="True")
ax.fill(angles, true_plot, alpha=0.25)

ax.plot(angles, pred_plot, linewidth=2, label="Predicted")
ax.fill(angles, pred_plot, alpha=0.25)

plt.title("Mean Trait Profile: True vs Predicted")
plt.legend(loc="upper right")
plt.show()
