In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [34]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Load your data
# Assuming you have a DataFrame called 'data' with columns: 'text', 'Class A', 'Class B', 'Class C', ...
data = pd.read_excel('/kaggle/input/prepro/manipulated_data.xlsx')

# Define the model and tokenizer
model_name = 'sampathlonka/San-BERT'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(data.columns) - 1)

# Split the data into texts and labels
texts = data['Text'].tolist()
labels = data.iloc[:, 1:].values.tolist()

# Define dataset and dataloader
dataset = CustomDataset(texts, labels, tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
model.train()
epochs = 3
for epoch in range(epochs):
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        predictions.extend(torch.sigmoid(logits).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert probabilities to binary predictions
threshold = 0.5
binary_predictions = [[1 if p >= threshold else 0 for p in pred] for pred in predictions]

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, binary_predictions)
precision = precision_score(true_labels, binary_predictions, average='micro')
recall = recall_score(true_labels, binary_predictions, average='micro')
f1 = f1_score(true_labels, binary_predictions, average='micro')
classification_rep = classification_report(true_labels, binary_predictions, target_names=data.columns[1:])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Classification Report:\n", classification_rep)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sampathlonka/San-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.0009699321047526673
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Classification Report:
                                                                                                            precision    recall  f1-score   support

                                                    nirveda - weeping, sighing,indifference,dicouragement       0.00      0.00      0.00        47
                                                                                           glani - guilty       0.00      0.00      0.00         3
                                                                             sanka - doubt (apprehension)       0.00      0.00      0.00        20
                                                                            asuya/irsya - jealousy (envy)       0.00      0.00      0.00        10
                                                                            mada - madness (intoxication)       0.00      0.00      0.00         3
                    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
next(iter(dataloader))

{'input_ids': tensor([[    2,  2305, 11716,  ...,     0,     0,     0],
         [    2, 25308,  3085,  ...,     0,     0,     0],
         [    2, 15001, 15014,  ...,     0,     0,     0],
         ...,
         [    2,  2798, 11073,  ...,     0,     0,     0],
         [    2,  2068,  8354,  ...,     0,     0,     0],
         [    2,  2296,  6346,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [41]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load your data
# Assuming you have a DataFrame called 'data' with columns: 'text', 'Class A', 'Class B', 'Class C', ...
data = pd.read_excel('/kaggle/input/prepro/manipulated_data.xlsx')

# Define the model and tokenizer
model_name = 'sampathlonka/San-BERT'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(data.columns) - 1)

# Split the data into texts and labels
texts = data['Text'].tolist()
labels = data.iloc[:, 1:].values.argmax(axis=1)  # Convert one-hot encoded labels to single integers

# Define dataset and dataloader
dataset = CustomDataset(texts, labels, tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
epochs = 20
for epoch in range(epochs):
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='micro')
recall = recall_score(true_labels, predictions, average='micro')
f1 = f1_score(true_labels, predictions, average='micro')
classification_rep = classification_report(true_labels, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Classification Report:\n", classification_rep)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sampathlonka/San-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.18913676042677013
Precision: 0.18913676042677013
Recall: 0.18913676042677013
F1-Score: 0.1891367604267701
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        26
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00        23
          10       0.00      0.00      0.00         6
          11       0.00      0.00      0.00        12
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
tokenizer = AutoTokenizer.from_pretrained("sampathlonka/San-BERT")

In [11]:
def train(model, dataloader):
    #class_weights = [1.0, 10.0]
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()
    model.train()
    total_loss = 0.0
    num_batches = len(dataloader)
    
    progress_bar = tqdm(enumerate(dataloader), total=num_batches, desc="Training")
    
    for batch_idx, (input_ids, attention_mask, labels) in progress_bar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        # Update the progress bar description with current loss
        progress_bar.set_description(f"Training Loss: {loss.item():.4f}")
        
    return total_loss / num_batches

In [12]:
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)

            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    confusion = confusion_matrix(true_labels, predictions)

    # Calculate classification report
    report = classification_report(true_labels, predictions)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(report)

    return accuracy, precision, recall, f1, confusion

In [5]:
'''def max_len_df(df):
    #maximum length of the sentence
    max_len = 0
    num_tokens = 0

    # For every sentence...
    for sent in df.data:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        
        input_ids = tokenizer.encode(sent, add_special_tokens=True)
        num_tokens += (len(input_ids)-2)

        # Update the maximum sentence length.
        max_len = max(max_len, len(input_ids))

    return max_len'''

'def max_len_df(df):\n    #maximum length of the sentence\n    max_len = 0\n    num_tokens = 0\n\n    # For every sentence...\n    for sent in df.data:\n\n    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n        \n        input_ids = tokenizer.encode(sent, add_special_tokens=True)\n        num_tokens += (len(input_ids)-2)\n\n        # Update the maximum sentence length.\n        max_len = max(max_len, len(input_ids))\n\n    return max_len'

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
# Read the Excel file into a DataFrame
df = pd.read_excel('/kaggle/input/prepro/manipulated_data.xlsx')

not_chosen_columns = ['Text','vyadhi - disease (sickness)','apasmara - forgetfulness (epilepsy/dementedness)']

# Select label columns that are not in the list of not chosen columns
label_columns = [col for col in df.columns if col not in not_chosen_columns]

In [15]:
test_split = 0.2

# Initial train and test split.
train_df, test_df = train_test_split(
    df,
    test_size=test_split,
)
print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 824
Number of rows in test set: 207


In [9]:
df_labels_train = train_df[label_columns]
df_labels_eval = test_df[label_columns]

In [21]:
train_texts=train_df['Text'].tolist()
test_texts=test_df['Text'].tolist()

In [22]:
next(iter(train_texts))

'सययौप्रथमंप्राचींतुल्यःप्राचीनबर्हिषा|अहिताननिलोद्धूतैस्तर्जयन्निवकेतुभिः'

In [25]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame with instances as rows and classes as columns

# Extract features (texts) and labels from the DataFrame
instances = df.iloc[:, 0]  # Assuming the first column contains instance names
features = df.iloc[:, 1:]  # Assuming the rest of the columns are class labels

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(instances,features, test_size=0.2, random_state=42)

# Optionally, you can convert train_labels and test_labels to one-dimensional arrays
train_labels = train_labels.values
test_labels = test_labels.values

# Now you have train_features, test_features, train_labels, and test_labels
# which you can use for training and testing your model

In [26]:
next(iter(train_labels))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0])

In [31]:
def bert_classification(train_df, test_df):
    
    #max_len_train = max_len_df(train_df)
    #max_len_test = max_len_df(test_df)
    
    #max_len = max(max_len_train, max_len_test)

    # Step 2: Build the BERT Model
    model = BertForSequenceClassification.from_pretrained('sampathlonka/San-BERT', num_labels=49)
    model.to(device)

    train_encodings = tokenizer(train_texts, padding=True, return_tensors='pt',max_length=512)# max_length= 'Tokeniser max length'
    train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'],
                                                   torch.tensor(train_labels))
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    test_encodings = tokenizer(test_texts, padding=True, return_tensors='pt',max_length=512)
    test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'],
                                                  torch.tensor(test_labels))
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Step 4: Train the Model
    for epoch in range(15):
        loss = train(model, train_dataloader)
        print(f"Epoch {epoch + 1} - Average Loss: {loss}")

    # Step 5: Evaluate the Model
    evaluate(model, test_dataloader)

In [32]:
bert_classification(train_df, test_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sampathlonka/San-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 0/52 [00:00<?, ?it/s]


ValueError: Expected input batch_size (16) to match target batch_size (784).