# Depression Classification from texts using BERT

This notebook fine-tunes a BERT network for binary text classification. Each individual message in the dataset is associated with one of two labels: "depressed" or "not depressed."

## Workflow:
1. Import Data
2. Data Preprocessing
3. BERT Model Building
4. Training and Validation
5. Model Evaluation and Testing
6. Saving The Model

**Use Google Colab for a free GPU**

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
Col

# Importing Libraries

In [None]:
# Mount Google Drive at /content/drive so the dataset and the saved model,
# which live under /content/drive/MyDrive, can be read and written.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Load the labelled dataset from the mounted Drive into a DataFrame
dataset_path = '/content/drive/MyDrive/Depression Dataset/Depression_Detection.csv'
df = pd.read_csv(dataset_path)


In [None]:
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification
nltk.download('punkt')
import re
!pip install autocorrect
from autocorrect import Speller
!pip install unidecode
import unidecode
!pip install contractions
import contractions
from string import punctuation

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=dcc8e4f1ac3e52125a849e35739c930124c2ea80df45f582297383e92790473e
  Stored in directory: /root/.cache/pip/wheels/b5/7b/6d/b76b29ce11ff8e2521c8c7dd0e5bfee4fb1789d76193124343
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Su

# Preprocessing

In [None]:
# Text Preprocessing
# Fetch NLTK's stopword corpus and keep the English list as a set;
# preprocess_text uses it to drop stopwords from the tokenized text.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a raw message and return it as a stopword-free string.

    Args:
        text: The raw message. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The remaining tokens joined by single spaces. The result is an
        empty string when nothing survives the cleaning.
    """
    # Missing or non-string values would make re.sub raise a TypeError.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
            return ''
        text = str(text)

    # Remove special characters, punctuation, and numbers: every character
    # that is not an ASCII letter becomes a space.
    # NOTE(review): because this runs first, the HTML, URL and digit patterns
    # below can no longer match (their '<', ':', '/', '.' and digits are
    # already gone), and apostrophes are stripped before contractions.fix.
    # This looks unintended, but reordering would change the text the model is
    # trained on, so the original order is kept -- confirm before changing.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # HTML tags
    html_pattern = r'<.*?>'
    text = re.sub(pattern=html_pattern, repl=' ', string=text)

    # URLs
    url_pattern = r'https?://\S+|www\.\S+'
    text = re.sub(pattern=url_pattern, repl=' ', string=text)

    # Numbers
    number_pattern = r'\d+'
    text = re.sub(pattern=number_pattern, repl=' ', string=text)

    # Transliterate to ASCII with unidecode
    text = unidecode.unidecode(text)

    # Expanding contractions
    text = contractions.fix(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))

    # Remove single letters that are surrounded by whitespace
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    text = re.sub(pattern=single_char_pattern, repl=" ", string=text)

    # Collapse runs of whitespace into one space
    space_pattern = r'\s+'
    text = re.sub(pattern=space_pattern, repl=" ", string=text)

    # Tokenize and remove stopwords
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text
# Clean every message; the raw 'text' column is overwritten with the result
df['text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Encode the class labels as integers
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then overwrite the 'class' column with the
# integer codes. LabelEncoder numbers the classes in sorted order.
label_encoder = LabelEncoder()
label_encoder.fit(df['class'])
df['class'] = label_encoder.transform(df['class'])


In [None]:
# Split the DataFrame into training and testing sets:
# 20% of the rows are held out for testing, and random_state is fixed so the
# same split is produced on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer for BERT model

In [None]:
# Load the pretrained tokenizer and a BERT classifier whose output layer has
# one unit per distinct value in the 'class' column
model_name = 'bert-base-uncased'
num_classes = df['class'].unique().size
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize both splits with identical settings: truncate to at most 100
# tokens, pad, and return PyTorch tensors
tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt', max_length=100)
train_tokens = tokenizer(train_df['text'].tolist(), **tokenizer_options)
test_tokens = tokenizer(test_df['text'].tolist(), **tokenizer_options)


In [None]:
# Inspect the encoding of the first training example. Despite its name,
# top_5 holds the tensors of a single example (index 0), not five.
top_5 = {
    field: train_tokens[field][0]
    for field in ('input_ids', 'attention_mask', 'token_type_ids')
}

print(top_5)
print(top_5["token_type_ids"].size())


{'input_ids': tensor([  101,  2183,  3102,  2574,  2072,  2113,  2113,  2303,  2568,  3294,
        26351,  6427,  2183, 10797,  2172,  2130,  4167,  7210,  2145,  8239,
         5926,  3242,  8239, 19029,  3064,  3233,  3233,  7987,  5243,  2618,
         2117,  2936,  3233,  7567,  2303,  4167,  8239,  4301,  2342,  2619,
         3531,  2831,  2619,  3531,  2831,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 

In [None]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0.1,Unnamed: 0,text,class
103752,155958,going kill sooni know know body mind completel...,0
208014,312013,using sub diary day today great school stuff b...,1
220765,331077,else supposed got serious problem years back b...,0
116840,175538,hey yes need help get new subreddit ground sub...,1
149067,223708,non trans gender dysphoria like menstruation e...,1


In [None]:
# Build the label tensors for the training and testing splits
train_labels, test_labels = (
    torch.tensor(list(split['class'])) for split in (train_df, test_df)
)

In [None]:
def build_dataset(tokens, labels):
    """Bundle token ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)


# Datasets that the DataLoaders iterate over
train_data = build_dataset(train_tokens, train_labels)
test_data = build_dataset(test_tokens, test_labels)


In [None]:
# Mini-batch loaders: training batches are shuffled, test order is kept fixed
batch_size = 32
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Optimizer and loss used by the training loop below: AdamW over all model
# parameters (learning rate 2e-5) and cross-entropy on the classifier logits
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Model Implementation
Only one epoch is run here because of limited hardware (GPU) resources.

In [None]:
# Fine-tune the model on the training split
num_epochs = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

for epoch in range(num_epochs):
    bert_model.train()
    total_loss = 0.0

    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        logits = bert_model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss of this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Evaluation
bert_model.eval()

Epoch 1, Loss: 0.13095106706236703


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Collect predictions and ground-truth labels over the whole test set
all_preds, all_labels = [], []

with torch.no_grad():  # inference only, gradients are not needed
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = bert_model(input_ids, attention_mask=attention_mask).logits

        # Predicted class is the index of the largest logit in each row
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Calculate classification metrics
print(classification_report(all_labels, all_preds))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96     23128
           1       0.97      0.95      0.96     23287

    accuracy                           0.96     46415
   macro avg       0.96      0.96      0.96     46415
weighted avg       0.96      0.96      0.96     46415



# Saving the bert model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Save into the Drive folder that the inference section loads from
# ('Depression Dataset/Bert_model'). The path used to point at
# 'Depression Detection/Bert_model', so the later from_pretrained() call would
# not find the model written here.
model_save_path = "/content/drive/MyDrive/Depression Dataset/Bert_model"
bert_model.save_pretrained(model_save_path)
# Store the tokenizer files next to the weights so the directory is self-contained
tokenizer.save_pretrained(model_save_path)

# BERT IMPLEMENTATION on user input

In [None]:
# Mount Google Drive at /content/drive so the saved model under
# /content/drive/MyDrive can be loaded.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install transformers

In [None]:
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification
nltk.download('punkt')
import re
!pip install autocorrect
from autocorrect import Speller
!pip install unidecode
import unidecode
!pip install contractions
import contractions
from string import punctuation

In [None]:
# Text Preprocessing
# Fetch NLTK's stopword corpus and keep the English list as a set;
# preprocess_text uses it to drop stopwords from the tokenized text.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a raw message and return it as a stopword-free string.

    Args:
        text: The raw message. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The remaining tokens joined by single spaces. The result is an
        empty string when nothing survives the cleaning.
    """
    # Missing or non-string values would make re.sub raise a TypeError.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
            return ''
        text = str(text)

    # Remove special characters, punctuation, and numbers: every character
    # that is not an ASCII letter becomes a space.
    # NOTE(review): because this runs first, the HTML, URL and digit patterns
    # below can no longer match (their '<', ':', '/', '.' and digits are
    # already gone), and apostrophes are stripped before contractions.fix.
    # This looks unintended, but reordering would make inference text differ
    # from what the saved model was trained on -- confirm before changing.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # HTML tags
    html_pattern = r'<.*?>'
    text = re.sub(pattern=html_pattern, repl=' ', string=text)

    # URLs
    url_pattern = r'https?://\S+|www\.\S+'
    text = re.sub(pattern=url_pattern, repl=' ', string=text)

    # Numbers
    number_pattern = r'\d+'
    text = re.sub(pattern=number_pattern, repl=' ', string=text)

    # Transliterate to ASCII with unidecode
    text = unidecode.unidecode(text)

    # Expanding contractions
    text = contractions.fix(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))

    # Remove single letters that are surrounded by whitespace
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    text = re.sub(pattern=single_char_pattern, repl=" ", string=text)

    # Collapse runs of whitespace into one space
    space_pattern = r'\s+'
    text = re.sub(pattern=space_pattern, repl=" ", string=text)

    # Tokenize and remove stopwords
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Importing model and tokenizer

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Define the path to the saved transformer model on Google Drive
model_path = '/content/drive/MyDrive/Depression Dataset/Bert_model'
# Load the fine-tuned model only; the tokenizer is created in a later cell
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Example message to classify; it is cleaned with preprocess_text before
# being tokenized
user_input = "I am feeling lonely and sad "
user_input=preprocess_text(user_input)

In [None]:
# Pick the device, create the tokenizer and move the loaded model onto the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model.to(device)

# Encode the cleaned user text and place every tensor on the model's device
encoded_input = tokenizer(user_input, padding=True, truncation=True, return_tensors='pt', max_length=100)
user_tokens = {name: tensor.to(device) for name, tensor in encoded_input.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    logits = model(
        input_ids=user_tokens['input_ids'],
        attention_mask=user_tokens['attention_mask'],
    ).logits

# The predicted class is the index of the largest logit
predicted_class = logits.argmax(dim=1).item()

# Class index 0 is reported as "depressed", any other index as "not depressed"
print("depressed" if predicted_class == 0 else "not depressed")


not depressed


**END**