### 0 Import necessary packages

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import nltk
from sklearn.neighbors import  KNeighborsClassifier
from tensorflow.keras import regularizers

In [None]:
# Colab-only: expose Google Drive under /content/drive so files stored in
# MyDrive can be read by later cells.
from google.colab import drive

drive.mount('/content/drive')

# Fetch the NLTK resources needed later (stopword list and punkt tokenizer data).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### 1 Load data and pre-processing

In [None]:
# Read the combined statements dataset from the mounted Drive and preview it.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Combined_Data.csv",
)
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
# Non-null counts and per-column null counts before cleaning.
print(data.count())
print(data.isnull().sum())

# Drop every row that has a missing value in any column.
data = data.dropna()

# Same two reports after cleaning (nulls first, then counts).
null_counts = data.isnull().sum()
print(null_counts)
print(data.count())

Unnamed: 0    53043
statement     52681
status        53043
dtype: int64
Unnamed: 0      0
statement     362
status          0
dtype: int64
Unnamed: 0    0
statement     0
status        0
dtype: int64
Unnamed: 0    52681
statement     52681
status        52681
dtype: int64


In [None]:
# Number of rows per raw status value.
status_counts = data.loc[:, 'status'].value_counts()
print(status_counts)

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64


In [None]:
def label_status(status):
    """Map a status string to an integer class id.

    Matching is case-insensitive: 'normal' -> 0, 'depression' -> 1,
    'suicidal' -> 2, and any other status -> 3.
    """
    known_classes = {'normal': 0, 'depression': 1, 'suicidal': 2}
    return known_classes.get(status.lower(), 3)


In [None]:
# Derive the integer class id for every row and report the class sizes.
data['label'] = data['status'].map(label_status)

label_counts = data['label'].value_counts()
print(label_counts)

label
0    16343
1    15404
2    10652
3    10282
Name: count, dtype: int64


In [None]:
# Shared helpers for text cleaning: English stopword set and a Porter stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_and_stem_text(text):
    """Normalise a raw statement into a space-joined string of stems.

    Non-word characters and digits are replaced by spaces, the text is
    lower-cased and whitespace runs are collapsed. The result is tokenized,
    stopwords are dropped and each remaining token is stemmed.
    """
    # Blank out non-word characters first, then digits, then lower-case.
    normalized = re.sub(r'\d', ' ', re.sub(r'\W', ' ', text)).lower()
    normalized = re.sub(r'\s+', ' ', normalized)

    stems = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Store the cleaned/stemmed form of every statement in a new column.
data['cleaned_text'] = data['statement'].map(clean_and_stem_text)

print(data.head())

   Unnamed: 0                                          statement   status  \
0           0                                         oh my gosh  Anxiety   
1           1  trouble sleeping, confused mind, restless hear...  Anxiety   
2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety   
3           3  I've shifted my focus to something else but I'...  Anxiety   
4           4  I'm restless and restless, it's been a month n...  Anxiety   

   label                                       cleaned_text  
0      3                                            oh gosh  
1      3       troubl sleep confus mind restless heart tune  
2      3  wrong back dear forward doubt stay restless re...  
3      3                  shift focu someth els still worri  
4      3                   restless restless month boy mean  


In [None]:
def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is read as a token followed by its vector components.
    Blank lines (for example a stray empty line at the end of the file) are
    skipped instead of raising ``IndexError``.

    Args:
        file_path: Path to the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line; indexing values[0] would fail.
                continue
            word = values[0]
            embeddings_index[word] = np.array(values[1:], dtype='float32')
    return embeddings_index

# Vector length of the GloVe file loaded below (the 100d variant).
embedding_dim = 100
glove_embeddings = load_glove_embeddings(
    '/content/drive/MyDrive/glove.6B.100d.txt'
)

In [None]:
def sentence_to_embedding(sentence, glove_embeddings, embedding_dim):
    """Average the embedding vectors of the words in ``sentence``.

    Args:
        sentence: Either a whitespace-separated string (the form stored in
            the ``cleaned_text`` column) or an iterable of word tokens.
        glove_embeddings: Mapping from word to its embedding vector.
        embedding_dim: Length of each embedding vector.

    Returns:
        1-D array of length ``embedding_dim`` holding the mean of the
        per-word vectors. Words missing from ``glove_embeddings`` contribute
        a zero vector; a sentence with no words yields all zeros.
    """
    # A plain string has to be split into words first: iterating over it
    # directly yields single characters, so the lookup would average the
    # vectors of individual letters instead of words.
    words = sentence.split() if isinstance(sentence, str) else sentence

    embeddings = []
    for word in words:
        if word in glove_embeddings:
            embeddings.append(glove_embeddings[word])
        else:
            embeddings.append(np.zeros(embedding_dim))
    if len(embeddings) == 0:
        return np.zeros(embedding_dim)
    return np.mean(embeddings, axis=0)

# Compute one averaged embedding vector per cleaned statement.
data['statement_embedding'] = data['cleaned_text'].apply(
    sentence_to_embedding, args=(glove_embeddings, embedding_dim)
)

print(data[['statement', 'statement_embedding']].head())

                                           statement  \
0                                         oh my gosh   
1  trouble sleeping, confused mind, restless hear...   
2  All wrong, back off dear, forward doubt. Stay ...   
3  I've shifted my focus to something else but I'...   
4  I'm restless and restless, it's been a month n...   

                                 statement_embedding  
0  [-0.16664142906665802, 0.3021071452115263, 0.0...  
1  [-0.2972463634373112, 0.33642129210585897, 0.1...  
2  [-0.34173236536825524, 0.33110409991109163, 0....  
3  [-0.22032406305273375, 0.4153062411668626, 0.1...  
4  [-0.21467593568377197, 0.3199165263213217, 0.1...  


In [None]:
# Stack the per-row vectors into one 2-D array; the labels stay a Series.
X = np.vstack(data['statement_embedding'].to_numpy())
y = data['label']
print(X.shape)

(52681, 100)


In [None]:
# Put features and labels side by side and save them to CSV.
#
# `pd.DataFrame(X)` gets a fresh 0..n-1 index, while `y` still carries the
# row labels that survived `data.dropna()`. Assigning the Series directly
# aligns on index, which attaches labels to the wrong rows and leaves NaN
# where an index value is missing. Attach the labels positionally instead.
embedding_df = pd.DataFrame(X)
embedding_df['label'] = np.asarray(y)
embedding_df.to_csv("embedding_data.csv", index=False)

In [None]:
# Balance the dataset
class_0 = embedding_df[embedding_df['label'] == 0].head(10500)
class_1 = embedding_df[embedding_df['label'] == 1].head(10500)
class_2 = embedding_df[embedding_df['label'] == 2]
class_3 = embedding_df[embedding_df['label'] == 3]
# Combine the selected classes
data_balanced = pd.concat([class_0, class_1,class_2,class_3])

X_balanced = data_balanced.drop(columns='label').values
y_balanced = data_balanced['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.3, random_state=432
)

### 2 Baseline model

#### 2.1 SVM

In [None]:
# Baseline: SVM with a linear kernel on the averaged embeddings.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy: 0.4419685796729721
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.41      0.56      3109
         1.0       0.37      0.35      0.36      3180
         2.0       0.34      0.40      0.36      3194
         3.0       0.44      0.61      0.51      2993

    accuracy                           0.44     12476
   macro avg       0.50      0.44      0.45     12476
weighted avg       0.50      0.44      0.45     12476



In [None]:
# Baseline: SVM with an RBF kernel on the averaged embeddings.
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy: 0.48396922090413597
Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.63      0.72      3109
         1.0       0.39      0.04      0.08      3180
         2.0       0.36      0.67      0.47      3194
         3.0       0.46      0.60      0.52      2993

    accuracy                           0.48     12476
   macro avg       0.51      0.49      0.45     12476
weighted avg       0.51      0.48      0.44     12476



In [None]:
# Baseline: SVM with a polynomial kernel on the averaged embeddings.
svm_model = SVC(kernel='poly', random_state=432).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy: 0.45126643154857327
Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.48      0.62      3109
         1.0       0.28      0.05      0.08      3180
         2.0       0.35      0.69      0.47      3194
         3.0       0.45      0.60      0.51      2993

    accuracy                           0.45     12476
   macro avg       0.49      0.45      0.42     12476
weighted avg       0.49      0.45      0.42     12476



#### 2.2 softmax

In [None]:
# Softmax (multinomial logistic regression) baseline.
#
# - `multi_class` is no longer passed: it is deprecated in recent
#   scikit-learn releases, and lbfgs already fits the multinomial loss when
#   the target has more than two classes.
# - `max_iter` is raised because the recorded run stopped at the iteration
#   limit before lbfgs converged.
logreg = LogisticRegression(solver='lbfgs', max_iter=2000)

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.4478198140429625
Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.53      0.59      3109
         1.0       0.37      0.31      0.34      3180
         2.0       0.34      0.41      0.37      3194
         3.0       0.46      0.55      0.50      2993

    accuracy                           0.45     12476
   macro avg       0.46      0.45      0.45     12476
weighted avg       0.46      0.45      0.45     12476



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 2.3 KNN

In [None]:

# Baseline: k-nearest neighbours with k=5 on the averaged embeddings.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy: 0.4420487335684514
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.57      0.67      3109
         1.0       0.34      0.48      0.40      3180
         2.0       0.32      0.33      0.33      3194
         3.0       0.46      0.38      0.42      2993

    accuracy                           0.44     12476
   macro avg       0.48      0.44      0.45     12476
weighted avg       0.48      0.44      0.45     12476



### 3 deep learning

#### 3.0 reprocess data

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Same balancing rule as for the baselines, applied to the text frame:
# first 10500 rows of classes 0 and 1, all rows of classes 2 and 3.
class_0 = data.loc[data['label'] == 0].iloc[:10500]
class_1 = data.loc[data['label'] == 1].iloc[:10500]
class_2 = data.loc[data['label'] == 2]
class_3 = data.loc[data['label'] == 3]
# NOTE: `data` is rebound to the balanced subset from here on.
data = pd.concat([class_0, class_1, class_2, class_3])

statements = data['cleaned_text'].to_numpy()
labels = data['label'].to_numpy()

# Fit a tokenizer (num_words=10000) on the cleaned text and turn every
# statement into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(statements)
sequences = tokenizer.texts_to_sequences(statements)

# Bring every sequence to the same fixed length.
max_sequence_length = 1000
X = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode the four class ids.
y = to_categorical(labels, num_classes=4)

# 80/20 split, stratified on the integer labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


In [None]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is read as a token followed by its vector components.
    Blank lines are skipped instead of raising ``IndexError``.

    NOTE(review): a function with the same name is defined earlier in this
    notebook; this definition replaces it once this cell runs.

    Args:
        glove_file_path: Path to the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line; indexing values[0] would fail.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_file_path = '/content/drive/MyDrive/glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)

# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer id i; rows of words without a GloVe vector stay all-zero.
embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


#### 3.1 bi-LSTM

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential

# Bi-LSTM classifier: non-trainable embedding layer given `embedding_matrix`
# as its weights, one bidirectional LSTM, dropout, and a small dense head
# ending in a 4-way softmax.
model1 = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    ),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(4, activation='softmax'),
])

model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview of the model.
model1.summary()




In [None]:
# Train the bi-LSTM; 20% of the training data is set aside for validation.
history = model1.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    verbose=2,
)


Epoch 1/20
420/420 - 45s - 106ms/step - accuracy: 0.5758 - loss: 0.9610 - val_accuracy: 0.6605 - val_loss: 0.8048
Epoch 2/20
420/420 - 37s - 89ms/step - accuracy: 0.6721 - loss: 0.7990 - val_accuracy: 0.6844 - val_loss: 0.7698
Epoch 3/20
420/420 - 41s - 98ms/step - accuracy: 0.6943 - loss: 0.7393 - val_accuracy: 0.7006 - val_loss: 0.7247
Epoch 4/20
420/420 - 41s - 96ms/step - accuracy: 0.6772 - loss: 0.7998 - val_accuracy: 0.5849 - val_loss: 1.1366
Epoch 5/20
420/420 - 42s - 99ms/step - accuracy: 0.6520 - loss: 0.8451 - val_accuracy: 0.6872 - val_loss: 0.7469
Epoch 6/20
420/420 - 38s - 89ms/step - accuracy: 0.7039 - loss: 0.7245 - val_accuracy: 0.7094 - val_loss: 0.7034
Epoch 7/20
420/420 - 41s - 98ms/step - accuracy: 0.7198 - loss: 0.6864 - val_accuracy: 0.7121 - val_loss: 0.7018
Epoch 8/20
420/420 - 41s - 97ms/step - accuracy: 0.7324 - loss: 0.6605 - val_accuracy: 0.7107 - val_loss: 0.6877
Epoch 9/20
420/420 - 41s - 97ms/step - accuracy: 0.7407 - loss: 0.6382 - val_accuracy: 0.7252 -

In [None]:
# Loss and accuracy of the bi-LSTM on the held-out test split.
loss, accuracy = model1.evaluate(X_test, y_test, verbose=2)
print(f"Test Accuracy: {accuracy}")

# Collapse predicted probabilities and one-hot targets to class ids.
y_pred = model1.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

print("Classification Report:", classification_report(y_test_classes, y_pred_classes), sep="\n")


263/263 - 7s - 25ms/step - accuracy: 0.7222 - loss: 0.7261
Test Accuracy: 0.7221891283988953
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      2100
           1       0.63      0.57      0.60      2100
           2       0.68      0.63      0.65      2131
           3       0.72      0.80      0.76      2056

    accuracy                           0.72      8387
   macro avg       0.72      0.72      0.72      8387
weighted avg       0.72      0.72      0.72      8387



#### 3.2 GRU

In [None]:
from tensorflow.keras.layers import GRU

# GRU classifier: same non-trainable embedding layer as the bi-LSTM model,
# followed by a single GRU, dropout, an L2-regularised dense layer and a
# 4-way softmax.
model = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    ),
    GRU(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(4, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview of the model.
model.summary()



In [None]:
# Train the GRU model; 20% of the training data is set aside for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    verbose=2,
)

Epoch 1/20
420/420 - 19s - 46ms/step - accuracy: 0.5805 - loss: 1.1942 - val_accuracy: 0.6753 - val_loss: 0.8463
Epoch 2/20
420/420 - 18s - 43ms/step - accuracy: 0.6887 - loss: 0.8032 - val_accuracy: 0.7034 - val_loss: 0.7568
Epoch 3/20
420/420 - 20s - 48ms/step - accuracy: 0.7125 - loss: 0.7386 - val_accuracy: 0.7083 - val_loss: 0.7251
Epoch 4/20
420/420 - 17s - 41ms/step - accuracy: 0.7218 - loss: 0.7035 - val_accuracy: 0.7185 - val_loss: 0.7005
Epoch 5/20
420/420 - 18s - 42ms/step - accuracy: 0.7349 - loss: 0.6737 - val_accuracy: 0.7152 - val_loss: 0.7223
Epoch 6/20
420/420 - 18s - 42ms/step - accuracy: 0.6014 - loss: 1.0065 - val_accuracy: 0.6763 - val_loss: 0.8269
Epoch 7/20
420/420 - 20s - 48ms/step - accuracy: 0.6913 - loss: 0.7902 - val_accuracy: 0.7155 - val_loss: 0.7324
Epoch 8/20
420/420 - 22s - 53ms/step - accuracy: 0.7164 - loss: 0.7285 - val_accuracy: 0.7030 - val_loss: 0.7445
Epoch 9/20
420/420 - 23s - 54ms/step - accuracy: 0.7279 - loss: 0.6935 - val_accuracy: 0.7249 - 

In [None]:
# Loss and accuracy of the GRU model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Accuracy: {accuracy}")

# Collapse predicted probabilities and one-hot targets to class ids.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

print("Classification Report:", classification_report(y_test_classes, y_pred_classes), sep="\n")


263/263 - 3s - 13ms/step - accuracy: 0.7284 - loss: 0.7234
Test Accuracy: 0.7283891439437866
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      2100
           1       0.67      0.52      0.59      2100
           2       0.63      0.74      0.68      2131
           3       0.78      0.76      0.77      2056

    accuracy                           0.73      8387
   macro avg       0.73      0.73      0.73      8387
weighted avg       0.73      0.73      0.72      8387

