To do: Add stemming, remove common words, confusion matrix, xgboost, remove extra symbols

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import re
from collections import Counter

Import Training Data and Filter Empty/Invalid Entries

In [2]:
N = 5000
DATA_PATH = "https://raw.githubusercontent.com/KevorkSulahian/ML_journey/main/random_code/NM/train.csv.zip"

train = pd.read_csv(DATA_PATH, compression="zip").iloc[:N]
trainFiltered = train[
    train['question1'].apply(lambda x: isinstance(x, str) and x.strip() != '') &
    train['question2'].apply(lambda x: isinstance(x, str) and x.strip() != '')
].reset_index(drop=True)

Import WordNet Synonyms and Stemming Function

In [13]:
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\kevor\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Create DataFrame of Synsets (canonical synonyms to replace words)

In [4]:
def makeSynonymDf(synsets):
    word_list = []
    wordSynonyms = []

    for synset in synsets:
        word = synset.name().split('.')[0].replace('_', ' ')
        synonyms = [lemma.replace('_', ' ') for lemma in synset.lemma_names()]

        word_list.append(word)
        wordSynonyms.append(synonyms)

    wordDf = pd.DataFrame({'word': word_list, 'synonyms': wordSynonyms})
    return wordDf

In [5]:
def build_synonym_map(df):
    synonym_map = {}
    for _, row in df.iterrows():
        syn_list = row['synonyms']
        if isinstance(syn_list, list) and syn_list:
            canonical = syn_list[0].lower()  # first word is the canonical one
            for word in syn_list:
                synonym_map[word.lower()] = canonical
    return synonym_map

In [6]:
'''nounSynsets = list(wn.all_synsets('n'))
nounDf = makeSynonymDf(nounSynsets)

verbSynsets = list(wn.all_synsets('v'))
verbDf = makeSynonymDf(verbSynsets)

adjSynsets = list(wn.all_synsets('a'))
adjDf = makeSynonymDf(adjSynsets)'''

allSynsets = list(wn.all_synsets())
wordDf = makeSynonymDf(allSynsets)

english_stopwords = set(stopwords.words("english"))
query_words = {'who', 'what', 'where', 'when', 'why', 'how', 'which', 'will', 'whose', 'can'}
negation_words = {"no", "not", "never", "none", "neither", "nor", "nothing", "cannot"}

In [7]:
synonym_map = build_synonym_map(wordDf)

Process String and Tokenize Words with Stemming+Canonical Synonym Replacement

In [10]:
def normalize_and_tokenize_with_stemming(question, synonym_map):
    question = question.lower()
    question = re.sub(r"n't", " not", question)

    # Remove all non-alphanumeric characters (symbols like #, @, $, etc.)
    question = re.sub(r'[^a-z0-9\s]', '', question)
    tokens = re.findall(r'\b\w+\b', question)

    # Remove stopwords
    filtered_tokens = {
        w for w in tokens if w not in english_stopwords or w in negation_words or w in query_words
    }

    stemmed = [
        w if w in negation_words or w in query_words else stemmer.stem(w)
        for w in filtered_tokens
    ]

    # Replace with canonical synonyms only if not in negation/query words
    norm_tokens = [
        w if w in negation_words or w in query_words else synonym_map.get(w, w)
        for w in stemmed
        ]

    tags = pos_tag(filtered_tokens)
    nouns = {word for word, tag in tags if tag.startswith('NN')}
    verbs = {word for word, tag in tags if tag.startswith('VB')}
    adjectives = {word for word, tag in tags if tag.startswith('JJ')}
    stemmedNouns = [stemmer.stem(w)for w in nouns]
    stemmedVerbs = [stemmer.stem(w)for w in verbs]
    stemmedAdjectives = [stemmer.stem(w)for w in adjectives]

    return set(norm_tokens), set(stemmedNouns), set(stemmedVerbs), set(stemmedAdjectives)

In [11]:
trainFiltered.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [16]:
trainNormalized = trainFiltered.copy()
import nltk
nltk.data.path.append('C:\\Users\\kevor\\nltk_data')  # Adjust the path to where NLTK data is stored
import nltk
nltk.download('averaged_perceptron_tagger', force=True)
trainNormalized[['tokens1', 'nouns1', 'verbs1', 'adjectives1']] = trainNormalized['question1'].apply(
    lambda question: pd.Series(normalize_and_tokenize_with_stemming(question, synonym_map))
)

trainNormalized[['tokens2', 'nouns2', 'verbs2', 'adjectives2']] = trainNormalized['question2'].apply(
    lambda question: pd.Series(normalize_and_tokenize_with_stemming(question, synonym_map))
)

trainNormalized = trainNormalized[
    trainNormalized['tokens1'].apply(lambda x: isinstance(x, set) and len(x) > 0) &
    trainNormalized['tokens2'].apply(lambda x: isinstance(x, set) and len(x) > 0)
].reset_index(drop=True)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kevor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [41]:
trainNormalized.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'tokens1', 'nouns1', 'verbs1', 'adjectives1', 'tokens2', 'nouns2',
       'verbs2', 'adjectives2'],
      dtype='object')

In [49]:
import os, gc, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import scipy.sparse as sp
from scipy.sparse import vstack, hstack
import torch
from sentence_transformers import SparseEncoder, SentenceTransformer, CrossEncoder
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import StackingClassifier
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [57]:
import os, gc, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import scipy.sparse as sp
from scipy.sparse import vstack, hstack
import torch

from sentence_transformers import SparseEncoder, SentenceTransformer, CrossEncoder
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance
from xgboost import XGBClassifier

def run_duplicate_pipeline(
    q1_list: list,
    q2_list: list,
    y: np.ndarray,
    topk: int = 100,
    n_svd: int = 1500,
    n_pca: int = 350,
    max_features: int = 50000,
    random_state: int = 42,
    device: str = None,
    output_dir: str = "results"
) -> dict:
    """
    Runs the duplicate-question pipeline with an 80/20 train/test split,
    saves all diagnostic plots to `output_dir`, and returns metrics + models.

    Args:
        q1_list, q2_list: full lists of question texts.
        y: full label array.
        topk: keep top-k non-zero entries per SPLADE batch.
        n_svd: components for TruncatedSVD.
        n_pca: components for PCA.
        max_features: for TF-IDF.
        random_state: seed.
        device: 'cuda' or 'cpu'. Auto-detect if None.
        output_dir: directory to save all plots.
    Returns:
        dict containing:
          - stack_model
          - auc_xe_train, auc_stack_train
          - auc_xe_test, auc_stack_test
          - X_train, X_test
    """
    os.makedirs(output_dir, exist_ok=True)

    # 0. Train/test split (80/20)
    q1_train, q1_test, q2_train, q2_test, y_train, y_test = train_test_split(
        q1_list, q2_list, y, test_size=0.2, 
        stratify=y, random_state=random_state
    )
    N = len(q1_train)

    # 1. Device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # 2. Instantiate models
    sparse_model = SparseEncoder("naver/splade-cocondenser-ensembledistil",
                                 device=device, trust_remote_code=True)
    dense_model  = SentenceTransformer("all-MiniLM-L6-v2", device=device)
    xe_model     = CrossEncoder("cross-encoder/quora-distilroberta-base",
                                device=device, num_labels=1)
    tfidf        = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))

    # 3. Helper: sparse CSR
    def get_sparse_csr(sentences, model):
        chunks = []
        for i in tqdm(range(0, len(sentences), 128), desc="Sparse encode"):
            batch = sentences[i:i+128]
            emb = model.encode(batch, batch_size=128, convert_to_numpy=False)
            coo = emb.coalesce()
            rows, cols = coo.indices()[0].cpu().numpy(), coo.indices()[1].cpu().numpy()
            vals = coo.values().cpu().numpy()
            if topk:
                idx = np.argsort(vals)[-topk:]
                rows, cols, vals = rows[idx], cols[idx], vals[idx]
            chunks.append(sp.coo_matrix((vals,(rows,cols)), shape=coo.shape).tocsr())
            del emb, coo, rows, cols, vals
            torch.cuda.empty_cache()
        return vstack(chunks)

    ### — TRAIN FEATURE EXTRACTION — ###
    flat_train = [q for pair in zip(q1_train, q2_train) for q in pair]

    # Sparse + SVD
    sparse_full = get_sparse_csr(flat_train, sparse_model)
    s1, s2 = sparse_full[:N], sparse_full[N:]
    svd = TruncatedSVD(n_components=n_svd, random_state=random_state)
    X_sp_train = svd.fit_transform(hstack([abs(s1-s2), s1.multiply(s2)]))

    # Dense + PCA
    emb_dense = dense_model.encode(flat_train,
                                   batch_size=64,
                                   convert_to_tensor=True,
                                   device=device)
    d1, d2 = emb_dense[:N].cpu().numpy(), emb_dense[N:].cpu().numpy()
    pca = PCA(n_components=n_pca, random_state=random_state)
    X_dn_train = pca.fit_transform(np.hstack([np.abs(d1-d2), d1*d2]))
    cosims_train = np.diag(cosine_similarity(d1, d2))

    # TF-IDF + lexical
    tfidf.fit(flat_train)
    v1_t, v2_t = tfidf.transform(q1_train), tfidf.transform(q2_train)
    tfidf_sim_train = cosine_similarity(v1_t, v2_t).diagonal()
    jacc_train, lev_train = [], []
    for a,b in zip(q1_train, q2_train):
        t1, t2 = set(word_tokenize(a.lower())), set(word_tokenize(b.lower()))
        jacc_train.append(len(t1&t2)/(len(t1|t2)+1e-8))
        lev_train.append(1 - edit_distance(a,b)/max(len(a),len(b),1))
    jacc_train, lev_train = np.array(jacc_train), np.array(lev_train)

    # Cross-Encoder
    scores_train = xe_model.predict(list(zip(q1_train, q2_train)), batch_size=32)
    xe_feat_train = (scores_train - scores_train.min())/(scores_train.max()-scores_train.min())
    auc_xe_train = roc_auc_score(y_train, xe_feat_train)

    # Save TRAIN diagnostics
    def save_fig(fn):
        fig = plt.gcf()
        fig.savefig(os.path.join(output_dir, fn), bbox_inches='tight')
        plt.close(fig)

    # Class balance
    pd.Series(y_train).value_counts().plot.bar(title="Train Duplicate vs Non-dup")
    save_fig("train_class_balance.png")

    # Length hists
    for name, lst in zip(["q1","q2"], [q1_train, q2_train]):
        plt.hist([len(q) for q in lst], bins=30)
        plt.title(f"Train {name} lengths")
        save_fig(f"train_len_{name}.png")

    # SPLADE sparsity
    density = sparse_full.nnz/(sparse_full.shape[0]*sparse_full.shape[1])
    # (we'll save the numeric density to a text file)
    with open(os.path.join(output_dir, "train_splade_density.txt"), "w") as f:
        f.write(f"{density:.6f}")

    rows, cols = sparse_full.nonzero()
    sel = np.random.rand(len(rows))<0.01
    plt.scatter(cols[sel], rows[sel], s=0.5, alpha=0.1)
    plt.title("Train SPLADE 1% sparsity")
    save_fig("train_splade_pattern.png")
    plt.hist(cols, bins=1000)
    plt.title("Train Active Vocab IDs")
    save_fig("train_splade_vocab_hist.png")
    plt.plot(np.cumsum(svd.explained_variance_ratio_))
    plt.title("Train Sparse SVD var")
    save_fig("train_svd_variance.png")

    # SBERT & PCA
    plt.hist(cosims_train, bins=30)
    plt.title("Train SBERT cosine sim")
    save_fig("train_cosine_sim.png")
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.title("Train PCA var")
    save_fig("train_pca_variance.png")

    # Lexical
    for arr,name in [(tfidf_sim_train,"tfidf"), (jacc_train,"jaccard"), (lev_train,"levenshtein")]:
        plt.hist(arr, bins=30)
        plt.title(f"Train {name} sim")
        save_fig(f"train_{name}_hist.png")

    # Cross-Encoder
    plt.hist(scores_train, bins=30)
    plt.title("Train XE raw scores")
    save_fig("train_xe_raw.png")
    neg, pos = xe_feat_train[y_train==0], xe_feat_train[y_train==1]
    plt.hist([neg,pos], bins=30, stacked=True)
    plt.title("Train XE by label")
    save_fig("train_xe_by_label.png")

    # Assemble features
    X_train = np.hstack([
        X_sp_train,
        X_dn_train,
        tfidf_sim_train.reshape(-1,1),
        jacc_train.reshape(-1,1),
        lev_train.reshape(-1,1),
        xe_feat_train.reshape(-1,1),
    ])

    # 4. Modeling (same as before)
    CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    param_dist = {
        'n_estimators': [400,500], 'max_depth': [4,6,8],
        'learning_rate': [0.01,0.05,0.1], 'subsample': [0.6,0.8,1.0],
        'colsample_bytree': [0.6,0.8,1.0],
        'scale_pos_weight': [1, sum(y_train==1)/sum(y_train==0)]
    }
    xgb_base = XGBClassifier(
        tree_method = 'gpu_hist' if device=='cuda' else 'hist',
        device      = device,
        eval_metric = 'auc',
        random_state= random_state
    )
    rs = RandomizedSearchCV(
        xgb_base, param_dist, n_iter=10,
        scoring='roc_auc', cv=CV, n_jobs=1, random_state=random_state
    )
    rs.fit(X_train, y_train)
    best = rs.best_params_

    estimators = [
        ('lr', LogisticRegression(max_iter=1000, random_state=random_state)),
        ('xgb', XGBClassifier(
            **best,
            tree_method = 'gpu_hist' if device=='cuda' else 'hist',
            device      = device,
            eval_metric = 'auc',
            random_state= random_state
        ))
    ]
    stack = StackingClassifier(
        estimators       = estimators,
        final_estimator  = XGBClassifier(
            n_estimators = 100,
            tree_method  = 'gpu_hist' if device=='cuda' else 'hist',
            device       = device,
            eval_metric  = 'auc',
            random_state = random_state
        ),
        cv               = CV,
        n_jobs           = -1
    )
    stack.fit(X_train, y_train)
    probs_train = stack.predict_proba(X_train)[:,1]
    auc_stack_train = roc_auc_score(y_train, probs_train)

    # 5. TEST FEATURE EXTRACTION & EVAL
    flat_test = [q for pair in zip(q1_test, q2_test) for q in pair]
    sparse_full_t = get_sparse_csr(flat_test, sparse_model)
    s1_t, s2_t = sparse_full_t[:len(q1_test)], sparse_full_t[len(q1_test):]
    X_sp_test = svd.transform(hstack([abs(s1_t-s2_t), s1_t.multiply(s2_t)]))

    emb_t = dense_model.encode(flat_test,
                               batch_size=64,
                               convert_to_tensor=True,
                               device=device)
    d1t, d2t = emb_t[:len(q1_test)].cpu().numpy(), emb_t[len(q1_test):].cpu().numpy()
    X_dn_test = pca.transform(np.hstack([np.abs(d1t-d2t), d1t*d2t]))
    cosims_test = np.diag(cosine_similarity(d1t, d2t))

    v1_tt, v2_tt = tfidf.transform(q1_test), tfidf.transform(q2_test)
    tfidf_sim_test = cosine_similarity(v1_tt, v2_tt).diagonal()
    jacc_test, lev_test = [], []
    for a,b in zip(q1_test, q2_test):
        t1, t2 = set(word_tokenize(a.lower())), set(word_tokenize(b.lower()))
        jacc_test.append(len(t1&t2)/(len(t1|t2)+1e-8))
        lev_test.append(1 - edit_distance(a,b)/max(len(a),len(b),1))
    jacc_test, lev_test = np.array(jacc_test), np.array(lev_test)

    scores_test = xe_model.predict(list(zip(q1_test, q2_test)), batch_size=32)
    xe_feat_test = (scores_test - scores_test.min())/(scores_test.max()-scores_test.min())
    auc_xe_test = roc_auc_score(y_test, xe_feat_test)

    X_test = np.hstack([
        X_sp_test,
        X_dn_test,
        tfidf_sim_test.reshape(-1,1),
        jacc_test.reshape(-1,1),
        lev_test.reshape(-1,1),
        xe_feat_test.reshape(-1,1),
    ])
    probs_test = stack.predict_proba(X_test)[:,1]
    auc_stack_test = roc_auc_score(y_test, probs_test)

    # Save TEST diagnostics
    pd.Series(y_test).value_counts().plot.bar(title="Test Duplicate vs Non-dup")
    save_fig("test_class_balance.png")

    for name, lst in zip(["q1","q2"], [q1_test, q2_test]):
        plt.hist([len(q) for q in lst], bins=30)
        plt.title(f"Test {name} lengths")
        save_fig(f"test_len_{name}.png")

    with open(os.path.join(output_dir, "test_splade_density.txt"), "w") as f:
        f.write(f"{sparse_full_t.nnz/(sparse_full_t.shape[0]*sparse_full_t.shape[1]):.6f}")

    # sparsity pattern
    r_t, c_t = sparse_full_t.nonzero()
    sel_t = np.random.rand(len(r_t))<0.01
    plt.scatter(c_t[sel_t], r_t[sel_t], s=0.5, alpha=0.1)
    plt.title("Test SPLADE 1% sparsity")
    save_fig("test_splade_pattern.png")
    plt.hist(c_t, bins=1000)
    plt.title("Test Active Vocab IDs")
    save_fig("test_splade_vocab_hist.png")
    plt.plot(np.cumsum(svd.explained_variance_ratio_))
    plt.title("Test Sparse SVD var")
    save_fig("test_svd_variance.png")

    plt.hist(cosims_test, bins=30)
    plt.title("Test SBERT cosine sim")
    save_fig("test_cosine_sim.png")
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.title("Test PCA var")
    save_fig("test_pca_variance.png")

    for arr,name in [(tfidf_sim_test,"tfidf"), (jacc_test,"jaccard"), (lev_test,"levenshtein")]:
        plt.hist(arr, bins=30)
        plt.title(f"Test {name} sim")
        save_fig(f"test_{name}_hist.png")

    plt.hist(scores_test, bins=30)
    plt.title("Test XE raw")
    save_fig("test_xe_raw.png")
    neg_t, pos_t = xe_feat_test[y_test==0], xe_feat_test[y_test==1]
    plt.hist([neg_t,pos_t], bins=30, stacked=True)
    plt.title("Test XE by label")
    save_fig("test_xe_by_label.png")

    return {
        'stack_model':    stack,
        'auc_xe_train':   auc_xe_train,
        'auc_stack_train':auc_stack_train,
        'auc_xe_test':    auc_xe_test,
        'auc_stack_test': auc_stack_test,
        'X_train':        X_train,
        'X_test':         X_test
    }


In [70]:
trainNormalized.shape

(4999, 14)

In [71]:
# from duplicate_question_pipeline import run_duplicate_pipeline

df = trainNormalized.copy()
# df = df.sample(n=5000, random_state=42).reset_index(drop=True)


# Full-text run
X_full = df['question1'].astype(str).tolist()
Y = df['is_duplicate'].values
X2_full = df['question2'].astype(str).tolist()

results_full = run_duplicate_pipeline(X_full, X2_full, Y, device='cuda', output_dir='results_full')

# Token-based run
tokens1 = df['tokens1'].apply(lambda t: ' '.join(t)).tolist()
tokens2 = df['tokens2'].apply(lambda t: ' '.join(t)).tolist()
results_tokens = run_duplicate_pipeline(tokens1, tokens2, Y, device='cuda', output_dir='results_tokens')

# Noun-based run
nouns1 = df['nouns1'].apply(lambda n: ' '.join(n)).tolist()
nouns2 = df['nouns2'].apply(lambda n: ' '.join(n)).tolist()
results_nouns = run_duplicate_pipeline(nouns1, nouns2, Y, device='cuda', output_dir='results_nouns')


Sparse encode:   0%|          | 0/63 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

Sparse encode:   0%|          | 0/16 [00:00<?, ?it/s]

Sparse encode:   0%|          | 0/63 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

Sparse encode:   0%|          | 0/16 [00:00<?, ?it/s]

Sparse encode:   0%|          | 0/63 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shap

Sparse encode:   0%|          | 0/16 [00:00<?, ?it/s]

In [73]:
# Print function to display results of train and test AUCs
def print_results(results, name):
    print(f"{name} Train AUC (XE): {results['auc_xe_train']:.4f}")
    print(f"{name} Train AUC (Stack): {results['auc_stack_train']:.4f}")
    print(f"{name} Test AUC (XE): {results['auc_xe_test']:.4f}")
    print(f"{name} Test AUC (Stack): {results['auc_stack_test']:.4f}")


print_results(results_full, "Full-text")
print_results(results_tokens, "Tokens")
print_results(results_nouns, "Nouns")

Full-text Train AUC (XE): 0.9733
Full-text Train AUC (Stack): 0.9820
Full-text Test AUC (XE): 0.9635
Full-text Test AUC (Stack): 0.9515
Tokens Train AUC (XE): 0.7958
Tokens Train AUC (Stack): 0.8762
Tokens Test AUC (XE): 0.7886
Tokens Test AUC (Stack): 0.7684
Nouns Train AUC (XE): 0.6812
Nouns Train AUC (Stack): 0.8016
Nouns Test AUC (XE): 0.7117
Nouns Test AUC (Stack): 0.6604
