**Checking Which Class Type has how Many Aspects**

In [14]:

import pandas as pd

df = pd.read_csv('/content/final_withaspects.csv')
print(df.head())


   class  id                                               text  text_length  \
0      0   1  thats true freedom speech doomed harassment su...           92   
1      0   2               neener neener time go playground yet           36   
2      0   3  like plastic gun fear armour piercing bullet f...           83   
3      0   4          geology religion werent see rock x formed           41   
4      0   5  well done monty mark first ever honest accurat...           52   

   words_per_sentence  sentiment  polarity  subjectivity  \
0                  13   0.350000  0.350000      0.650000   
1                   6   0.000000  0.000000      0.000000   
2                  13   0.000000  0.000000      0.000000   
3                   7   0.000000  0.000000      0.000000   
4                   9   0.416667  0.416667      0.622222   

                    aspects  
0                        []  
1  ['neener', 'playground']  
2                ['armour']  
3                  ['rock']  
4       

In [15]:
# Function to convert aspect strings to lists
import ast
def convert_aspects(aspect_string):
    if aspect_string == '[]':
        return []
    else:
        return ast.literal_eval(aspect_string)

# Apply the conversion function to the aspects column
df['aspects'] = df['aspects'].apply(convert_aspects)

# Analyze the classes
class_with_aspects = df[df['aspects'].apply(lambda x: len(x) > 0)]['class'].value_counts()
class_without_aspects = df[df['aspects'].apply(lambda x: len(x) == 0)]['class'].value_counts()

# Print the results
print("Classes with aspects:")
print(class_with_aspects)

print("\nClasses without aspects:")
print(class_without_aspects)

Classes with aspects:
class
1    2371
0    1822
Name: count, dtype: int64

Classes without aspects:
class
0    2871
1    2322
Name: count, dtype: int64


In [16]:
df.dtypes

class                   int64
id                      int64
text                   object
text_length             int64
words_per_sentence      int64
sentiment             float64
polarity              float64
subjectivity          float64
aspects                object
dtype: object

In [17]:
# Ensure there are no missing values in 'text' and 'aspect' columns
df = df.dropna(subset=['text', 'aspects'])

# Convert to string type
df['text'] = df['text'].astype(str)
df['aspects'] = df['aspects'].astype(str)

In [18]:
df

Unnamed: 0,class,id,text,text_length,words_per_sentence,sentiment,polarity,subjectivity,aspects
0,0,1,thats true freedom speech doomed harassment su...,92,13,0.350000,0.350000,0.650000,[]
1,0,2,neener neener time go playground yet,36,6,0.000000,0.000000,0.000000,"['neener', 'playground']"
2,0,3,like plastic gun fear armour piercing bullet f...,83,13,0.000000,0.000000,0.000000,['armour']
3,0,4,geology religion werent see rock x formed,41,7,0.000000,0.000000,0.000000,['rock']
4,0,5,well done monty mark first ever honest accurat...,52,9,0.416667,0.416667,0.622222,['post']
...,...,...,...,...,...,...,...,...,...
9381,1,1698,tell genius accurately correctly pointing mist...,195,24,0.050000,0.050000,0.566667,[]
9382,1,1699,think good idea public school assume role pare...,446,62,0.068636,0.068636,0.373788,['administrator']
9383,1,1700,settle charlie try think rationally second eve...,180,26,0.000000,0.000000,0.000000,['charlie']
9384,1,1701,vpc ha political agenda fbi like saying believ...,92,15,0.250000,0.250000,0.300000,['coke']


In [19]:
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split


# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and input formatting
def preprocess_data(texts, aspects, tokenizer, max_len=128):
    input_ids, attention_masks = [], []

    for text, aspect in zip(texts, aspects):
        encoded = tokenizer.encode_plus(
            text,
            aspect,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Assuming 'text', 'aspect', and 'sentiment' columns in the dataframe
texts = df['text'].values
aspects = df['aspects'].values
labels = df['class'].values

# Split the data
train_texts, val_texts, train_aspects, val_aspects, train_labels, val_labels = train_test_split(texts, aspects, labels, test_size=0.2, random_state=42)

train_input_ids, train_attention_masks = preprocess_data(train_texts, train_aspects, tokenizer)
val_input_ids, val_attention_masks = preprocess_data(val_texts, val_aspects, tokenizer)

train_labels = torch.tensor(train_labels, dtype=torch.float32)
val_labels = torch.tensor(val_labels, dtype=torch.float32)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [20]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

batch_size = 32

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


In [21]:
import torch.nn as nn
from transformers import BertModel

class BERT_RNN(nn.Module):
    def __init__(self, bert, hidden_dim=128, output_dim=1, n_layers=1, bidirectional=True):
        super(BERT_RNN, self).__init__()
        self.bert = bert
        self.rnn = nn.LSTM(bert.config.hidden_size, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_output = outputs[0]  # BERT's output

        rnn_output, _ = self.rnn(cls_output)
        final_output = self.fc(rnn_output[:, -1, :])
        return self.sigmoid(final_output)

# Load pre-trained BERT
bert = BertModel.from_pretrained('bert-base-uncased')
model = BERT_RNN(bert)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



BERT_RNN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [22]:
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()

        outputs = model(b_input_ids, b_attention_mask)
        loss = criterion(outputs.view(-1), b_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

            outputs = model(b_input_ids, b_attention_mask)
            loss = criterion(outputs.view(-1), b_labels)

            total_loss += loss.item()

            all_preds.append(outputs.cpu().numpy())
            all_labels.append(b_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return avg_loss, all_preds, all_labels


In [23]:
epochs = 3

for epoch in range(epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')


Epoch 1, Train Loss: 0.573321167078424
Epoch 2, Train Loss: 0.4417736361635492
Epoch 3, Train Loss: 0.28004726813511643


In [24]:
model_save_path = 'bert_rnn_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to bert_rnn_model.pth


In [27]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Evaluate the Model and Compute Metrics
val_loss, val_preds, val_labels = evaluate(model, val_dataloader, criterion, device)

def compute_metrics(preds, labels):
    pred_flat = np.round(preds).flatten()
    labels_flat = labels.flatten()
    accuracy = accuracy_score(labels_flat, pred_flat)
    precision = precision_score(labels_flat, pred_flat)
    recall = recall_score(labels_flat, pred_flat)
    f1 = f1_score(labels_flat, pred_flat)
    roc_auc = roc_auc_score(labels_flat, preds)
    conf_matrix = confusion_matrix(labels_flat, pred_flat)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'conf_matrix': conf_matrix
    }

metrics = compute_metrics(val_preds, val_labels)

print(f"Validation Loss: {val_loss}")
print(f"Accuracy: {metrics['accuracy']}")
print(f"Precision: {metrics['precision']}")
print(f"Recall: {metrics['recall']}")
print(f"F1 Score: {metrics['f1']}")
print(f"ROC AUC: {metrics['roc_auc']}")
print(f"Confusion Matrix:\n {metrics['conf_matrix']}")

Validation Loss: 0.6105527928320028
Accuracy: 0.7438764643237487
Precision: 0.7931034482758621
Recall: 0.6606382978723404
F1 Score: 0.7208357515960535
ROC AUC: 0.8289559043687339
Confusion Matrix:
 [[776 162]
 [319 621]]
