## Import libraries

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
import numpy as np
import pandas as pd




## Load Data

In [2]:
# ---------------------------
# Load data
# ---------------------------
df = pd.read_csv('cleaned_teen_mental_health_data.csv')
texts = df['user_input']
labels = df['negative_self_talk'].values

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# ---------------------------
# Load SBERT and compute embeddings
# ---------------------------
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_train_emb = sbert_model.encode(X_train.tolist(), convert_to_numpy=True, show_progress_bar=True)
X_test_emb  = sbert_model.encode(X_test.tolist(), convert_to_numpy=True, show_progress_bar=True)

# ---------------------------
# Define scikit-learn classifiers
# ---------------------------
sklearn_classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RidgeClassifier": RidgeClassifier(),
    "PassiveAggressive": PassiveAggressiveClassifier(max_iter=1000, C=1.0),
    "LinearSVC": LinearSVC(max_iter=5000, class_weight="balanced"),
    "GaussianNB": GaussianNB()
}

print("=== Scikit-learn Classifiers ===")
for name, clf in sklearn_classifiers.items():
    clf.fit(X_train_emb, y_train)
    y_pred = clf.predict(X_test_emb)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

# ---------------------------
# GRU-based classifier using TensorFlow
# ---------------------------
print("\n=== GRU Classifier ===")
X_train_emb_tf = np.expand_dims(X_train_emb, axis=1)  # GRU expects 3D input: (samples, timesteps, features)
X_test_emb_tf = np.expand_dims(X_test_emb, axis=1)

gru_model = Sequential([
    GRU(64, input_shape=(X_train_emb_tf.shape[1], X_train_emb_tf.shape[2]), return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gru_model.fit(X_train_emb_tf, y_train, epochs=10, batch_size=32, validation_split=0.1)

y_pred_gru = (gru_model.predict(X_test_emb_tf) > 0.5).astype(int).flatten()
print("GRU Accuracy:", accuracy_score(y_test, y_pred_gru))
print(classification_report(y_test, y_pred_gru))


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

=== Scikit-learn Classifiers ===

LogisticRegression Accuracy: 0.9483
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        20
           1       1.00      0.92      0.96        38

    accuracy                           0.95        58
   macro avg       0.93      0.96      0.94        58
weighted avg       0.96      0.95      0.95        58


RidgeClassifier Accuracy: 0.9310
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        20
           1       0.95      0.95      0.95        38

    accuracy                           0.93        58
   macro avg       0.92      0.92      0.92        58
weighted avg       0.93      0.93      0.93        58


PassiveAggressive Accuracy: 0.9310
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        20
           1       0.95      0.95      0.95        38

    accuracy                           0.9

  super().__init__(**kwargs)


Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - accuracy: 0.5759 - loss: 0.6896 - val_accuracy: 0.8261 - val_loss: 0.6408
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7261 - loss: 0.6411 - val_accuracy: 0.8696 - val_loss: 0.5741
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7746 - loss: 0.5861 - val_accuracy: 0.9130 - val_loss: 0.5003
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.8697 - loss: 0.5062 - val_accuracy: 0.9130 - val_loss: 0.4179
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9319 - loss: 0.4324 - val_accuracy: 0.9130 - val_loss: 0.3388
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9466 - loss: 0.3521 - val_accuracy: 0.9565 - val_loss: 0.2728
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

Based on classification reports, we choose LogisiticRegression model

In [3]:
# Assume 'sklearn_classifiers' contains your trained classifiers
# Pick the trained LogisticRegression
logreg_clf = sklearn_classifiers['LogisticRegression']

# Test sentences
test_sentences = [
    "I feel like I’m worthless and can’t do anything right.",
    "I’m a failure and I’ll never be good enough.",
]

# Convert sentences to embeddings
test_emb = sbert_model.encode(test_sentences, convert_to_numpy=True)

# Predict labels using the trained LogisticRegression
pred_labels = logreg_clf.predict(test_emb)
print("Predicted labels:", pred_labels)

# Optional: decision scores
if hasattr(logreg_clf, "decision_function"):
    scores = logreg_clf.decision_function(test_emb)
    print("Decision scores:", scores)


Predicted labels: [1 1]
Decision scores: [2.43238029 1.32172756]


## Saving the model

In [4]:
import joblib
from sentence_transformers import SentenceTransformer
import numpy as np

class SBERTClassifier:
    def __init__(self, embedder, classifier):
        self.embedder = embedder  # SentenceTransformer model
        self.classifier = classifier  # trained sklearn classifier

    def predict(self, texts):
        embeddings = self.embedder.encode(texts, convert_to_numpy=True)
        return self.classifier.predict(embeddings)

    def decision_function(self, texts):
        if hasattr(self.classifier, "decision_function"):
            embeddings = self.embedder.encode(texts, convert_to_numpy=True)
            return self.classifier.decision_function(embeddings)
        else:
            raise AttributeError("Classifier does not support decision_function")


In [5]:
# Assume 'sbert_model' is your trained SBERT
# and 'clf' is your trained LogisticRegression (or any sklearn classifier)
full_model = SBERTClassifier(embedder=sbert_model, classifier=logreg_clf)


## Load and use

In [6]:
joblib.dump(full_model, "neg_selftalk_full_model.pkl")
print("✅ Full SBERT + classifier model saved!")


✅ Full SBERT + classifier model saved!


In [7]:
# Load the full model
loaded_model = joblib.load("neg_selftalk_full_model.pkl")

# Test with new sentences
test_sentences = [
    "I feel worthless today.",
    "I am excited about my project!"
]

preds = loaded_model.predict(test_sentences)
print("Predicted labels:", preds)

# Optional: decision scores
scores = loaded_model.decision_function(test_sentences)
print("Decision scores:", scores)


Predicted labels: [1 0]
Decision scores: [ 1.07315472 -2.24932245]
