In [1]:
import numpy as np
import pandas as pd
import torch
import joblib
from scipy import sparse
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load dataframes
train_df = pd.read_csv(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\cleaned_train.csv")
texts = train_df['text'].fillna("").tolist()
labels = train_df['target'].values
test_df = pd.read_csv(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\cleaned_test.csv")
test_texts = test_df['text'].fillna("").tolist()


In [7]:
# Load Sklearn models
lr_model = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\logistic_regression_model.pkl")
rf_model = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\random_forest_model.pkl")
xgb_model = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\gradient_boosted_model.pkl")
svm_model = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\support_vector_machine_model.pkl")
lrbert_model = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\lr_bert_model.pkl")
print("Sklearn models loaded.")

# Load fine-tuned BERT model and tokenizer
bert_model = DistilBertForSequenceClassification.from_pretrained(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\finetuned_bert")
bert_tokenizer = DistilBertTokenizerFast.from_pretrained(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\finetuned_bert")
print("BERT model loaded.")

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)
bert_model.eval()

Sklearn models loaded.
BERT model loaded.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
# Load tfidf vectorizer
tfidf_vectorizer = joblib.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\tfidf_vectorizer.pkl")  

w2v_model = Word2Vec.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\word2vec_model.model")
idf_weights = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Load features used with xgboost
tfidf_train = sparse.load_npz(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\tfidf_train_matrix.npz")
tfidf_test = sparse.load_npz(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\tfidf_test_matrix.npz")
w2v_train = np.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\w2v_tfidf_train_features.npy")
w2v_test = np.load(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\w2v_tfidf_test_features.npy")
# Stack features for xgboost
full_train_features = np.hstack([tfidf_train.toarray(), w2v_train])
full_test_features = np.hstack([tfidf_test.toarray(), w2v_test])

# Function to get weighted W2V
def get_weighted_w2v(text, model, idf_dict):
    tokens = word_tokenize(text)
    word_vecs = []
    weight_sum = 0
    for word in tokens:
        if word in model.wv and word in idf_dict:
            vec = model.wv[word] * idf_dict[word]
            word_vecs.append(vec)
            weight_sum += idf_dict[word]
    if word_vecs:
        return np.sum(word_vecs, axis=0) / weight_sum
    else:
        return np.zeros(model.vector_size)

In [None]:
# Prepare StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
meta_features = np.zeros((len(train_df), 6))

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"Fold {fold+1}/{n_splits}")
    
    X_val_texts = [texts[i] for i in val_idx]
    X_val_tfidf = tfidf_vectorizer.transform(X_val_texts)
    val_full_features = full_train_features[val_idx]

    meta_features[val_idx, 0] = lr_model.predict_proba(X_val_tfidf)[:, 1]
    meta_features[val_idx, 1] = rf_model.predict_proba(X_val_tfidf)[:, 1]
    meta_features[val_idx, 2] = xgb_model.predict_proba(val_full_features)[:, 1]
    meta_features[val_idx, 3] = svm_model.decision_function(X_val_tfidf)

    inputs = bert_tokenizer(X_val_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(**inputs, output_hidden_states=True)
        logits = outputs.logits
        last_hidden_state = outputs.hidden_states[-1]
        attention_mask = inputs['attention_mask']

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = (sum_embeddings / sum_mask).cpu().numpy()

        bert_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

    meta_features[val_idx, 4] = lrbert_model.predict_proba(mean_embeddings)[:, 1]
    meta_features[val_idx, 5] = bert_probs

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [22]:
# Train meta-model
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(meta_features, labels)

In [23]:
# Evaluate meta-model
oof_preds = meta_model.predict(meta_features)

print("\n== Meta-Model Evaluation ==")
print("Accuracy:", accuracy_score(labels, oof_preds))
print("F1 Score:", f1_score(labels, oof_preds))
print("\nClassification Report:\n", classification_report(labels, oof_preds))
print("\nConfusion Matrix:\n", confusion_matrix(labels, oof_preds))


== Meta-Model Evaluation ==
Accuracy: 0.9403651648495994
F1 Score: 0.9295905707196029

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      4342
           1       0.94      0.92      0.93      3271

    accuracy                           0.94      7613
   macro avg       0.94      0.94      0.94      7613
weighted avg       0.94      0.94      0.94      7613


Confusion Matrix:
 [[4162  180]
 [ 274 2997]]


In [None]:
# Predict on Test Set
batch_size = 64

test_encodings = bert_tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)

all_logits = []
all_cls_embeddings = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = [b.to(device, non_blocking=True) for b in batch]
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        logits = outputs.logits
        all_logits.append(logits.cpu())
        last_hidden_state = outputs.hidden_states[-1]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = (sum_embeddings / sum_mask).cpu()
        all_cls_embeddings.append(mean_embeddings)

# Stack all batches together
test_logits = torch.cat(all_logits)
test_cls_embeddings = torch.cat(all_cls_embeddings)

# Prepare test_meta_features
test_meta_features = np.zeros((len(test_df), 6))

X_test_tfidf = tfidf_vectorizer.transform(test_texts)

test_meta_features[:, 0] = lr_model.predict_proba(X_test_tfidf)[:, 1]
test_meta_features[:, 1] = rf_model.predict_proba(X_test_tfidf)[:, 1]
test_meta_features[:, 2] = xgb_model.predict_proba(full_test_features)[:, 1]
test_meta_features[:, 3] = svm_model.decision_function(X_test_tfidf)
test_meta_features[:, 4] = lrbert_model.predict_proba(test_cls_embeddings.numpy())[:, 1]
test_meta_features[:, 5] = torch.softmax(test_logits, dim=1)[:, 1].numpy()

# Meta-model predictions on test set
test_preds = meta_model.predict(test_meta_features)

submission = pd.DataFrame({
    'id': test_df['id'],
    'target': test_preds
})

submission.to_csv("modular_stacking.csv", index=False)
print("Submission file saved: modular_stacking.csv")


Submission file saved: modular_stacking.csv
