In [38]:
# Imports
import os 
from predictionguard import PredictionGuard
import pandas as pd 
import json
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
import numpy as np
from torch import nn
from skorch import NeuralNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, auc, RocCurveDisplay, ConfusionMatrixDisplay, accuracy_score, classification_report
import matplotlib.pyplot as plt
from torch.optim import Adam
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import time
from transformers import BertTokenizer, BertModel
import torch
from xgboost import XGBClassifier

In [2]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [8]:
def get_embedding(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    
    # Get token embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        
        # Mean pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_pooled_embedding = sum_embeddings / sum_mask
    
    return mean_pooled_embedding.squeeze().numpy()  # Convert to numpy for easier handling


In [25]:
augmented = pd.read_csv('jomama.csv')
augmented['q21embedded'] = augmented['q21'].apply(get_embedding)
augmented['q22embedded'] = augmented['q22'].apply(get_embedding)

In [114]:
examples = pd.read_csv('examples.csv')
examples['q21embedded'] = examples['q21'].astype(str).apply(get_embedding)
examples['q22embedded'] = examples['q22'].astype(str).apply(get_embedding)
examples.head()

Unnamed: 0,Id,q1,q2,q3,q4,q5,q6,q7,q8,q9,...,q16,q17,q18,q19,q20,q21,q22,label,q21embedded,q22embedded
0,bfe3aa97-eb08-43d9-88a5-34ce93509321,6,6.0,1,2,2,2,2,2,1,...,1,2,1,2,1,"As a surviving child, the weekend seminar with...","""As a surviver chil going to the TAPS Seminar,...",3,"[-0.28119618, 0.29788923, 0.07461245, -0.02227...","[-0.5485113, 0.2337048, 0.45972684, -0.0606981..."
1,3b47ee28-e621-4497-949b-0fa0a59044f7,0,4.0,1,2,2,2,0,1,1,...,0,1,1,2,2,,Como hijo sobreviviente que asistió al Seminar...,3,"[0.14734618, -0.12047348, -0.14350076, -0.0502...","[-0.34511954, 0.06793985, 0.015450314, 0.19882..."
2,e21808f4-2c7f-4cdf-90f5-daa7ccccc843,0,5.0,2,2,2,2,0,4,2,...,2,2,2,2,2,,"""The TAPS Seminar was incredibly valuable for ...",4,"[0.14734618, -0.12047348, -0.14350076, -0.0502...","[-0.38213763, 0.24086426, 0.19717835, -0.05898..."
3,59385c6a-5a11-4348-b0d1-c1f87b56cbd3,5,,0,1,1,1,4,2,1,...,2,3,3,3,1,,"""I attended the TAPS Seminar recently, and it ...",2,"[0.14734618, -0.12047348, -0.14350076, -0.0502...","[-0.33113477, 0.35729206, 0.21894917, -0.08838..."
4,6a769ce6-900b-4490-9d67-6a9e8190b78c,0,0.0,1,2,2,2,2,2,1,...,2,1,1,2,1,"During our unforgettable weekend, we cherishin...",,3,"[-0.24932091, 0.422078, 0.34717786, -0.0116279...","[0.14734618, -0.12047348, -0.14350076, -0.0502..."


In [32]:
augmented['combined_embedding'] = augmented.apply(lambda row: np.concatenate([row['q21embedded'], row['q22embedded']]), axis=1)
examples['combined_embedding'] = examples.apply(lambda row: np.concatenate([row['q21embedded'], row['q22embedded']]), axis=1)

In [110]:
combined = pd.concat([augmented[['label', 'combined_embedding']], examples[['label', 'combined_embedding']][0:10]])
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [111]:
X_train = np.stack(combined['combined_embedding'].values)
Y_train = combined['label'].values

X_test = np.stack(examples['combined_embedding'][10:51].values)
Y_test = examples['label'][10:51]


In [84]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],        # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required to be at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=0)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, Y_train)

# Get the best model from the grid search
best_clf = grid_search.best_estimator_

# Predict and evaluate on the test set
Y_pred = best_clf.predict(X_test)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)
print("Test set accuracy:", accuracy_score(Y_test, Y_pred))

Best parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validated accuracy: 0.9230769230769231
Test set accuracy: 0.48


In [112]:
logr = LogisticRegression(max_iter=1000)
logr.fit(X_train, Y_train)

# Predict and evaluate
Y_pred = logr.predict(X_test)
print("Accuracy:", accuracy_score(Y_test, Y_pred))

Accuracy: 0.4


In [113]:
Y_trainxg = Y_train - Y_train.min()  # Shift labels to start from 0
Y_testxg = Y_test - Y_test.min()    # Ensure Y_test is consistent as well

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, Y_trainxg)

Y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(Y_testxg, Y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.45
