In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the dataset from the working directory and print a ten-row preview.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved
# index from an earlier export -- confirm before relying on it.
df = pd.read_csv('balanced_data.csv')

preview = df.head(10)
print(preview)

   Unnamed: 0                                          statement   status
0       52329  Dating I (21M) haven’t dated since high school...  Anxiety
1       52662  Feeling dreadful after best friend didn't want...  Anxiety
2       34036  Just a little bit of encouragement Had an itch...  Anxiety
3       34470  Back pain and worst anxiety day in years!! Cry...  Anxiety
4       34306  Does anyone else have a rotating 'portfolio' o...  Anxiety
5       34271  Hypersensitivity I don’t have full blown healt...  Anxiety
6       52927  27 yo with severe anxiety living with parents ...  Anxiety
7       52247  Health Anxiety This past few months I’ve had a...  Anxiety
8       34814  Hanta Virus Hello,\n\nAbout two weeks ago in c...  Anxiety
9       35345  Terrified of ALS/MS or some kind of serious ne...  Anxiety


In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [4]:
# Encode the text labels in `status` as consecutive integer ids.
#
# `status` is overwritten with the ids below, so keep an untouched copy of the
# text labels the first time this cell runs. Without the copy, running the
# cell a second time would build `label_dict` from the integer ids and the
# label names would be lost.
if 'status_text' not in df.columns:
    df['status_text'] = df['status']

# Ids follow the order in which each label first appears in the frame.
labels = df['status_text'].unique()
label_dict = {label: index for index, label in enumerate(labels)}
df['status'] = df['status_text'].map(label_dict)

In [5]:
# Features are the raw statements; targets are the integer-encoded statuses.
X, y = df['statement'], df['status']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Build the vocabulary from the training split only, so the held-out
# statements do not influence which words are kept. Words outside the
# vocabulary are replaced by the '<OOV>' token.
tokenizer_options = {'num_words': 10000, 'oov_token': '<OOV>'}
tokenizer = Tokenizer(**tokenizer_options)
tokenizer.fit_on_texts(X_train)

In [7]:

# Turn each split's statements into lists of integer token ids using the
# fitted tokenizer (training split first, then the held-out split).
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [8]:
# Bring every sequence to exactly `max_length` token ids: shorter sequences
# are zero-padded at the end and longer ones are cut at the end.
max_length = 100
pad_options = {'maxlen': max_length, 'padding': 'post', 'truncating': 'post'}

X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

In [10]:
# Two stacked bidirectional LSTMs over learned token embeddings, followed by
# a small dense head that emits one softmax probability per label.
model = Sequential([
    # Declaring the input shape explicitly replaces Embedding's `input_length`
    # argument (deprecated in Keras 3) and still builds the model up front,
    # so model.summary() can report output shapes and parameter counts.
    tf.keras.Input(shape=(max_length,)),
    # Take the vocabulary size from the tokenizer so the embedding table and
    # the tokenizer's `num_words` cannot drift apart.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    # return_sequences=True hands the full sequence to the next recurrent layer.
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One output unit per entry in the label mapping.
    Dense(len(label_dict), activation='softmax')
])




In [11]:
# Configure training. The sparse variant of categorical cross-entropy is used
# because the targets are integer class ids rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture overview.
model.summary()

In [12]:
# Train for 10 epochs in mini-batches of 64 and keep the per-epoch metrics in
# `history`. Note that the validation metrics are computed on the held-out
# test split, so they are not independent of any tuning done against them.
validation_pair = (X_test_padded, y_test)

history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=validation_pair,
)

Epoch 1/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 190ms/step - accuracy: 0.2585 - loss: 1.8157 - val_accuracy: 0.3787 - val_loss: 1.4574
Epoch 2/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 191ms/step - accuracy: 0.3913 - loss: 1.4068 - val_accuracy: 0.4343 - val_loss: 1.3148
Epoch 3/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 169ms/step - accuracy: 0.4643 - loss: 1.1713 - val_accuracy: 0.5058 - val_loss: 1.2078
Epoch 4/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 154ms/step - accuracy: 0.6044 - loss: 0.9062 - val_accuracy: 0.5901 - val_loss: 1.0782
Epoch 5/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 158ms/step - accuracy: 0.7130 - loss: 0.7041 - val_accuracy: 0.6524 - val_loss: 0.9812
Epoch 6/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 159ms/step - accuracy: 0.7838 - loss: 0.5328 - val_accuracy: 0.6622 - val_loss: 0.9873
Epoch 7/10

In [14]:
# Classify one hand-written statement with the trained model, applying the
# same tokenising and padding that was used for the training data.
sample_text = ["I'm feeling very anxious and restless."]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_sequence, maxlen=max_length, padding='post', truncating='post')

# Predict
predictions = model.predict(sample_padded)

# Take the argmax along the class axis. Without `axis`, np.argmax flattens
# the (samples, classes) array and is only correct for a single sample.
# [0] selects the one statement passed in.
predicted_label = int(np.argmax(predictions, axis=-1)[0])

# Invert the label -> id mapping once instead of searching two parallel lists.
index_to_label = {index: label for label, index in label_dict.items()}
predicted_class = index_to_label[predicted_label]

print(f"Predicted Sentiment: {predicted_class}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Sentiment: Anxiety


In [24]:
# Classify a second example statement with the trained model, applying the
# same tokenising and padding that was used for the training data.
sample_text = ["Internet addiction Anyone addicted to the internet like more then the rest of the world? Do you use it as coping mechanism?"]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_sequence, maxlen=max_length, padding='post', truncating='post')

# Predict
predictions = model.predict(sample_padded)

# Take the argmax along the class axis. Without `axis`, np.argmax flattens
# the (samples, classes) array and is only correct for a single sample.
# [0] selects the one statement passed in.
predicted_label = int(np.argmax(predictions, axis=-1)[0])

# Invert the label -> id mapping once instead of searching two parallel lists.
index_to_label = {index: label for label, index in label_dict.items()}
predicted_class = index_to_label[predicted_label]

print(f"Predicted Sentiment: {predicted_class}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Predicted Sentiment: Personality disorder


In [25]:
import pickle


# Legacy artifact, kept under its original name so anything that already
# loads this file keeps working.
# NOTE(review): the network defined in this notebook is a bidirectional LSTM,
# not BERT, so the file name is misleading -- rename once consumers are updated.
# NOTE(security): only unpickle this file from a trusted location;
# pickle.load can execute arbitrary code.
with open('bert_sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Also save in the native Keras format, which is the documented way to persist
# a model and does not depend on pickle.
model.save('sentiment_model.keras')

# The model alone cannot classify new text: inference also needs the fitted
# tokenizer, the label mapping and the padding length, so persist them too.
with open('sentiment_preprocessing.pkl', 'wb') as preprocessing_file:
    pickle.dump(
        {
            'tokenizer': tokenizer,
            'label_dict': label_dict,
            'max_length': max_length,
        },
        preprocessing_file,
    )