<a href="https://colab.research.google.com/github/GeorgeM2000/CDC-SMART-BRFSS-City-and-County-Data-Analysis/blob/main/BERT_Sequence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries and Tools***

In [87]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# ***BERT for Sequence Classification***

In [51]:
# Load the labelled sentences from the Excel workbook (path is relative to the working directory).
dataset_path = "Text_Sequence_Dataset.xlsx"
text_data = pd.read_excel(dataset_path)

In [52]:
# Preview the first five rows of the freshly loaded data.
text_data.head(n=5)

Unnamed: 0,Sentence,Classification
0,"Good or Better Health, Zero days when physical...",2
1,"Good or Better Health, Zero days when physical...",2
2,"Good or Better Health, Zero days when physical...",2
3,"Good or Better Health, Zero days when physical...",2
4,"Good or Better Health, Zero days when physical...",2


In [53]:
# Print column dtypes and non-null counts to check for missing values.
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167583 entries, 0 to 167582
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Sentence        167583 non-null  object
 1   Classification  167583 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


In [54]:
# Swap the numeric codes for readable labels: 1 -> "yes", 2 -> "no".
label_names = {1: "yes", 2: "no"}
text_data['Classification'] = text_data['Classification'].replace(label_names)

In [55]:
# Preview the first five rows after replacing the numeric codes with labels.
text_data.head(n=5)

Unnamed: 0,Sentence,Classification
0,"Good or Better Health, Zero days when physical...",no
1,"Good or Better Health, Zero days when physical...",no
2,"Good or Better Health, Zero days when physical...",no
3,"Good or Better Health, Zero days when physical...",no
4,"Good or Better Health, Zero days when physical...",no


In [56]:
def preprocess_text(text):
    """Return ``text`` converted to lower case.

    Args:
        text: The string to normalise.

    Returns:
        The result of calling ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

In [57]:
# Lower-case every sentence by applying preprocess_text element-wise.
lowercased_sentences = text_data['Sentence'].apply(preprocess_text)
text_data['Sentence'] = lowercased_sentences

In [58]:
# Preview the first five rows after lower-casing the sentences.
text_data.head(n=5)

Unnamed: 0,Sentence,Classification
0,"good or better health, zero days when physical...",no
1,"good or better health, zero days when physical...",no
2,"good or better health, zero days when physical...",no
3,"good or better health, zero days when physical...",no
4,"good or better health, zero days when physical...",no


The dataset is large (167,583 rows). Random undersampling keeps every minority-class row and randomly drops majority-class rows until the classes are balanced, which also reduces the dataset size.

In [64]:
# Randomly undersample with a fixed seed so the result is reproducible.
undersampler = RandomUnderSampler(random_state=42)
sentences = text_data.drop(columns=['Classification'])
classes = text_data['Classification']

sentences, classes = undersampler.fit_resample(sentences, classes)

# Reset BOTH indexes before concatenating: pd.concat(axis=1) aligns rows on index
# labels, so if only one side were renumbered the sentences and labels could be
# paired incorrectly (and padded with NaN) depending on what index the sampler returns.
undersampled_text_data = pd.concat(
    [sentences.reset_index(drop=True), classes.reset_index(drop=True)],
    axis=1,
)

If you skip undersampling, pass ***stratify=text_data['Classification']*** to `train_test_split` so that both splits keep the original class proportions.

In [80]:
# Hold out 20% of the undersampled rows as the test set; the fixed seed makes
# the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    undersampled_text_data['Sentence'],
    undersampled_text_data['Classification'],
    test_size=0.20,
    random_state=42,
)

In [81]:
# Tokenize both splits with the uncased BERT tokenizer. Each call truncates
# over-long inputs, pads to the longest sequence in that call, and returns
# PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(train_texts.tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_texts.tolist(), **tokenizer_kwargs)

***train_labels*** and ***test_labels*** must be converted from pandas Series of strings into NumPy arrays of integer class ids. `LabelEncoder` sorts the class names, so "no" becomes 0 and "yes" becomes 1.

In [88]:
# Encode the string labels as integer class ids. LabelEncoder sorts the class
# names, so "no" -> 0 and "yes" -> 1.
label_encoder = LabelEncoder()

# Fit on the training labels only, then reuse that exact mapping for the test
# labels. Re-fitting on the test split would build an independent mapping that
# could assign different ids if a class were missing from it.
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

In [None]:
# Bundle token ids, attention masks and integer labels so that every dataset
# item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels),
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(test_labels),
)

# Batches of 8; only the training data is shuffled, the test order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

In [None]:
# Load pretrained uncased BERT with a two-label classification head and place it
# on the GPU when one is available, otherwise on the CPU.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [98]:
# Per-class loss weights, indexed by class id: class 0 -> 1.0, class 1 -> 2.0
# (class 1 is assumed to be the more important one).
class_weights = torch.tensor([1.0, 2.0], dtype=torch.float32)

In [99]:
# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Class-weighted cross-entropy. The weight tensor is moved to `device` because
# the loss requires its weights on the same device as the logits it scores;
# a CPU weight tensor would fail against logits produced on the GPU.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

In [None]:
# Fine-tune BERT on the training set.
#
# The loss is computed with `criterion` so that the class weights configured for
# it are actually applied. Previously the loop used `outputs.loss`, which the
# model computes itself (without those weights) when `labels` is passed, so the
# weighted criterion was defined but never used.
criterion.to(device)  # keep the loss weights on the same device as the logits
model.train()
for epoch in range(3):  # Three epochs for faster training
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

In [None]:
# Measure plain accuracy on the held-out test split.
model.eval()  # switch the model to evaluation mode
total = 0
correct = 0
with torch.no_grad():  # no gradients are needed for scoring
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit in each row.
        predicted = torch.max(outputs.logits, 1).indices

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.2f}')