In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('reddit.csv')

In [6]:
df

Unnamed: 0.1,Unnamed: 0,title,text,combined_text,subreddit,label
0,0,read first think someone know narcissist!,narcissists people think narcissists allowed p...,read first think someone know narcissist! narc...,narcissism,1
1,1,biweekly ask narcissist thread visitors/codepe...,"thread ask questions narcissists, know cluster...",biweekly ask narcissist thread visitorscodepen...,narcissism,1
2,2,relationship advice,i’m getting really annoyed people here; think ...,relationship advice im getting really annoyed ...,narcissism,1
3,3,leave someone might covert narcissistic traits?,leave someone might covert narcissistic traits...,leave someone might covert narcissistic traits...,narcissism,1
4,4,trust change?,husband showed strong narcissistic traits past...,trust change? husband showed strong narcissist...,narcissism,1
...,...,...,...,...,...,...
595,595,"hi, new really sure start","i’ve really painful relationship long time, i’...","hi, new really sure start ive really painful r...",bpd,0
596,596,bpd autism comorbid,anybody diagnosed good idea look like together...,bpd autism comorbid anybody diagnosed good ide...,bpd,0
597,597,get rid fp?,"context, won’t say age, think fp one online fr...","get rid fp? context, wont say age, think fp on...",bpd,0
598,598,people try tell you’re misdiagnosed?,i’ve people start telling either got misdiagno...,people try tell youre misdiagnosed? ive people...,bpd,0


In [7]:
df = df.drop(columns=['Unnamed: 0'])

In [8]:
df

Unnamed: 0,title,text,combined_text,subreddit,label
0,read first think someone know narcissist!,narcissists people think narcissists allowed p...,read first think someone know narcissist! narc...,narcissism,1
1,biweekly ask narcissist thread visitors/codepe...,"thread ask questions narcissists, know cluster...",biweekly ask narcissist thread visitorscodepen...,narcissism,1
2,relationship advice,i’m getting really annoyed people here; think ...,relationship advice im getting really annoyed ...,narcissism,1
3,leave someone might covert narcissistic traits?,leave someone might covert narcissistic traits...,leave someone might covert narcissistic traits...,narcissism,1
4,trust change?,husband showed strong narcissistic traits past...,trust change? husband showed strong narcissist...,narcissism,1
...,...,...,...,...,...
595,"hi, new really sure start","i’ve really painful relationship long time, i’...","hi, new really sure start ive really painful r...",bpd,0
596,bpd autism comorbid,anybody diagnosed good idea look like together...,bpd autism comorbid anybody diagnosed good ide...,bpd,0
597,get rid fp?,"context, won’t say age, think fp one online fr...","get rid fp? context, wont say age, think fp on...",bpd,0
598,people try tell you’re misdiagnosed?,i’ve people start telling either got misdiagno...,people try tell youre misdiagnosed? ive people...,bpd,0


In [9]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [47]:
X = df['combined_text']
y = df['label']

In [48]:
from tensorflow.keras.layers import TextVectorization as TV

In [49]:
vocab_size = 5000
tv = TV(max_tokens = 5000, output_mode = 'int')

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [51]:
tv.adapt(X_train)

In [52]:
print(len(tv.get_vocabulary()))

5000


In [53]:
X_train_seq = tv(tf.constant(X_train))
X_val_seq = tv(tf.constant(X_val))

In [54]:
def lstm_model(embedding_dim, lstm_units, dropout_rate, learning_rate):
  model = keras.Sequential([
      layers.Embedding(vocab_size, embedding_dim),
      layers.Bidirectional(LSTM(units=lstm_units, return_sequences=True)),
      layers.Dropout(dropout_rate),
      layers.Bidirectional(LSTM(units=lstm_units // 2)),
      layers.Dropout(rate=dropout_rate),
      layers.Dense(1, activation = 'sigmoid')
  ])

  model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return model

In [55]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
embedding_dims = [64, 128]
lstm_units_list = [32, 64]
dropout_rates = [0.3, 0.5]
learning_rates = [1e-3, 5e-4]
epochs = 5
batch_size = 32

best_f1 = 0
best_scores = {}
best_config = {}
best_model = None

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

my_callbacks = [
    keras.callbacks.EarlyStopping(
        monitor = 'val_accuracy',
        patience = 3),
    keras.callbacks.ModelCheckpoint(
        filepath = 'best_model.keras',
        monitor = 'val_accuracy',
        save_best_only = True,
        verbose = 1)
]

for emb in embedding_dims:
  for unit in lstm_units_list:
    for dropout in dropout_rates:
      for learn in learning_rates:
        print(f"\n Trying emb={emb}, lstm={unit}, drop={dropout}, lr={learn}")
        model = lstm_model(emb, unit, dropout, learn)
        history = model.fit(
            x = X_train_seq,
            y = y_train,
            batch_size = 32,
            epochs = 100,
            class_weight=class_weights,
            validation_data = (X_val_seq, y_val),
            callbacks = my_callbacks)

        y_pred = (model.predict(X_val_seq) > 0.3).astype("int32")
        f1 = f1_score(y_val, y_pred)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred)
        rec = recall_score(y_val, y_pred)


        print(f"F1 Score: {f1:.4f}")

        if f1 > best_f1:
           best_f1 = f1
           best_config = {
               'embedding_dim': emb,
               'lstm_units': unit,
               'dropout_rate': dropout,
               'learning_rate': learn
           }
           best_model = model
           best_scores = {
            'f1': f1,
            'accuracy': acc,
            'precision': prec,
            'recall': rec
            }


 Trying emb=64, lstm=32, drop=0.3, lr=0.001
Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.5553 - loss: 0.6846
Epoch 1: val_accuracy improved from -inf to 0.83333, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1s/step - accuracy: 0.5595 - loss: 0.6849 - val_accuracy: 0.8333 - val_loss: 0.6778
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 830ms/step - accuracy: 0.7904 - loss: 0.6555
Epoch 2: val_accuracy improved from 0.83333 to 0.85000, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1s/step - accuracy: 0.7904 - loss: 0.6548 - val_accuracy: 0.8500 - val_loss: 0.5785
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 896ms/step - accuracy: 0.8831 - loss: 0.5063
Epoch 3: val_accuracy improved from 0.85000 to 0.87500, saving model to best_model.keras
[1m15/15[0m [32m━━━━━━━━━━━━━━



[1m3/4[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 180ms/step



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 395ms/step
F1 Score: 0.8571

 Trying emb=64, lstm=32, drop=0.5, lr=0.0005
Epoch 1/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864ms/step - accuracy: 0.5447 - loss: 0.6853
Epoch 1: val_accuracy did not improve from 0.93333
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 972ms/step - accuracy: 0.5495 - loss: 0.6857 - val_accuracy: 0.8417 - val_loss: 0.6809
Epoch 2/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 869ms/step - accuracy: 0.7158 - loss: 0.6601
Epoch 2: val_accuracy did not improve from 0.93333
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 924ms/step - accuracy: 0.7189 - loss: 0.6594 - val_accuracy: 0.8667 - val_loss: 0.6095
Epoch 3/100
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 874ms/step - accuracy: 0.8971 - loss: 0.5320
Epoch 3: val_accuracy did not improve from 0.93333
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [56]:
best_model.summary()

In [57]:
print("Best LSTM Configuration:")
for k, v in best_config.items():
    print(f"{k}: {v}")
print(f"Best F1 Score on Validation Set: {best_f1:.4f}")
print("Metrics for Best Model:")
for k, v in best_scores.items():
    print(f"{k.capitalize():<9}: {v:.4f}")

Best LSTM Configuration:
embedding_dim: 128
lstm_units: 32
dropout_rate: 0.3
learning_rate: 0.0005
Best F1 Score on Validation Set: 0.8780
Metrics for Best Model:
F1       : 0.8780
Accuracy : 0.9167
Precision: 0.8571
Recall   : 0.9000
