In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import torch.nn as nn
import logging
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Setting up logging
logging.basicConfig(filename='training_logs_model_T5.txt', level=logging.INFO, format='%(asctime)s - %(message)s')
logging.info("Training started.")

In [4]:
# Dataset Class
class ConversationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)  # Use float for binary classification
        }

In [5]:
# Load Data
data = pd.read_csv("chat_summary_labels.csv")  # Load your CSV file with data
test_texts = data['T5'][:150]  
test_labels = data['labels'][:150]
train_texts = data['T5'][150:]  
train_labels = data['labels'][150:]
# Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = ConversationDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_length=128)
test_dataset = ConversationDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, max_length=128)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [7]:
print(test_texts, test_labels)

0      Customer is looking for a laptop that can hand...
1      Customers of their digestive health supplement...
2      Sellerman is interested in investing in the st...
3      Salesman, managing director of Financial Plann...
4      Customer is looking for a smartphone with a go...
                             ...                        
145    Salesman is consulting customer about buying a...
146    Salesman, a company that provides a range of h...
147    Salesman, who has been in the tech industry fo...
148    Sellerman is interested in buying a new laptop...
149    Salesman, who has been researching other tech ...
Name: T5, Length: 150, dtype: object 0      0.0
1      1.0
2      0.0
3      0.0
4      0.0
      ... 
145    1.0
146    0.0
147    1.0
148    0.0
149    0.0
Name: labels, Length: 150, dtype: float64


In [11]:
# print(type(train_labels))

In [21]:
class BertWithDropoutAndBatchNorm(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(BertWithDropoutAndBatchNorm, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

        # Batch Normalization and Dropout layers
        self.batch_norm = nn.BatchNorm1d(768)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        # Get BERT outputs
        bert_outputs = self.bert.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_outputs.last_hidden_state  # Shape: (batch_size, seq_len, 768)
        pooled_output = last_hidden_state[:, 0, :]  # Shape: (batch_size, 768)
        normalized_output = self.batch_norm(pooled_output)
        dropped_out_output = self.dropout(normalized_output)

        # Pass through the classification head
        logits = self.bert.classifier(dropped_out_output)  # Use the existing classification layer
        probs = self.sigmoid(logits)  # Apply sigmoid for binary classification
        return probs
model = BertWithDropoutAndBatchNorm(dropout_rate=0.3).to(device)


# class BertForBinaryClassification(nn.Module):
#     def __init__(self):
#         super(BertForBinaryClassification, self).__init__()
#         self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)  # Binary classification, single output
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
#         probs = self.sigmoid(logits)  # Apply Sigmoid to the logits
#         return probs

# # Initialize model
# model = BertForBinaryClassification()
# model=model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
optimizer = AdamW(model.parameters(), lr=1e-5)
num_training_steps = len(train_loader) * 20  # 20 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
criterion = nn.BCELoss()



In [23]:
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    for batch in tqdm(dataloader,"Training"):
        optimizer.zero_grad()
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            # 'labels': batch['label'].to(device)
        }
        outputs = model(**inputs)
        loss = criterion(outputs.squeeze(), batch['label'].to(device))  # Squeeze to match label shape
        loss.backward()
        # torch.nn.utils.clip_grad_form_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Calculate accuracy
        preds = (outputs.squeeze() > 0.5).float()  # Get predictions as binary 0 or 1
        correct_preds += (preds == batch['label'].to(device)).sum().item()
        total_preds += len(batch['label'].to(device))

    epoch_loss = total_loss / len(dataloader)
    epoch_acc = correct_preds / total_preds
    return epoch_loss, epoch_acc

In [24]:
# Evaluation Function
from tqdm import tqdm
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in tqdm(dataloader,"Validation"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            outputs = model(**inputs)
            logits = outputs.squeeze()  # Squeeze for binary classification
            val_loss = criterion(outputs.squeeze(), batch['label'].to(device)) 
            preds = (logits > 0.5).cpu().numpy()  # Convert logits to binary (0 or 1)
            predictions.extend(preds)
            true_labels.extend(batch['label'].cpu().numpy())

            # Calculate accuracy
            correct_preds += (preds == batch['label'].cpu().numpy()).sum()
            total_preds += len(batch['label'])

    accuracy = correct_preds / total_preds
    return predictions, true_labels, val_loss, accuracy

In [25]:
# Training Loop
epochs = 15
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
    test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")
    logging.info(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")

Training: 100%|██████████| 50/50 [00:06<00:00,  8.00it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.64it/s]


Epoch 1: Train Loss = 0.7159, Train Accuracy = 0.5275, Val Loss = 0.7565791606903076, Val Acc = 0.44666666666666666


Training: 100%|██████████| 50/50 [00:06<00:00,  8.00it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.54it/s]


Epoch 2: Train Loss = 0.6648, Train Accuracy = 0.5900, Val Loss = 0.8136405944824219, Val Acc = 0.4866666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.99it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.49it/s]


Epoch 3: Train Loss = 0.6212, Train Accuracy = 0.6725, Val Loss = 0.8064589500427246, Val Acc = 0.47333333333333333


Training: 100%|██████████| 50/50 [00:06<00:00,  7.96it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.38it/s]


Epoch 4: Train Loss = 0.5633, Train Accuracy = 0.7225, Val Loss = 1.104245662689209, Val Acc = 0.4866666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.96it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.41it/s]


Epoch 5: Train Loss = 0.5213, Train Accuracy = 0.7625, Val Loss = 0.896377444267273, Val Acc = 0.5


Training: 100%|██████████| 50/50 [00:06<00:00,  7.93it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.00it/s]


Epoch 6: Train Loss = 0.4855, Train Accuracy = 0.7875, Val Loss = 1.1165916919708252, Val Acc = 0.4866666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.92it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.34it/s]


Epoch 7: Train Loss = 0.4369, Train Accuracy = 0.8250, Val Loss = 1.1096055507659912, Val Acc = 0.49333333333333335


Training: 100%|██████████| 50/50 [00:06<00:00,  7.92it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.21it/s]


Epoch 8: Train Loss = 0.3670, Train Accuracy = 0.8375, Val Loss = 1.1533582210540771, Val Acc = 0.5333333333333333


Training: 100%|██████████| 50/50 [00:06<00:00,  7.91it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.11it/s]


Epoch 9: Train Loss = 0.2995, Train Accuracy = 0.8825, Val Loss = 1.590253233909607, Val Acc = 0.5


Training: 100%|██████████| 50/50 [00:06<00:00,  7.89it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 28.01it/s]


Epoch 10: Train Loss = 0.2573, Train Accuracy = 0.9075, Val Loss = 1.5497071743011475, Val Acc = 0.5066666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.89it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 27.89it/s]


Epoch 11: Train Loss = 0.2139, Train Accuracy = 0.9525, Val Loss = 1.7394745349884033, Val Acc = 0.4533333333333333


Training: 100%|██████████| 50/50 [00:06<00:00,  7.88it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 27.95it/s]


Epoch 12: Train Loss = 0.2158, Train Accuracy = 0.9375, Val Loss = 1.323854923248291, Val Acc = 0.5


Training: 100%|██████████| 50/50 [00:06<00:00,  7.89it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 27.82it/s]


Epoch 13: Train Loss = 0.1479, Train Accuracy = 0.9550, Val Loss = 1.6653919219970703, Val Acc = 0.4866666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.89it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 27.85it/s]


Epoch 14: Train Loss = 0.1639, Train Accuracy = 0.9400, Val Loss = 2.0375962257385254, Val Acc = 0.4666666666666667


Training: 100%|██████████| 50/50 [00:06<00:00,  7.88it/s]
Validation: 100%|██████████| 19/19 [00:00<00:00, 27.96it/s]

Epoch 15: Train Loss = 0.1284, Train Accuracy = 0.9600, Val Loss = 2.17820405960083, Val Acc = 0.4866666666666667





In [27]:
test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)

Validation: 100%|██████████| 19/19 [00:00<00:00, 28.33it/s]


In [28]:
print(test_acc)

0.4866666666666667


In [35]:
# # Evaluation
# logging.info(f"Test Accuracy: {test_acc:.4f}")
# logging.info("Classification Report:")
# logging.info(classification_report(test_labels, test_preds))
# logging.info("ROC-AUC Score: %.4f" % roc_auc_score(test_labels, test_preds))

# # Save Model
# model.save_pretrained("bert_intent_classifier")
# tokenizer.save_pretrained("bert_intent_classifier")

# logging.info("Model saved successfully.")

# # Testing New Inputs
# def predict(model, tokenizer, texts, device):
#     model.eval()
#     encodings = tokenizer(texts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
#     input_ids = encodings['input_ids'].to(device)
#     attention_mask = encodings['attention_mask'].to(device)
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
#         probs = outputs.squeeze()  # Apply sigmoid during evaluation
#     return probs.cpu().numpy()  # Return raw probability of class 1 (satisfied)

# new_texts = ["Customer is happy with the support provided."]
# probabilities = predict(model, tokenizer, new_texts, device)
# logging.info("Satisfaction Probabilities: %s" % str(probabilities))

# print("Training complete. Check 'training_logs_model_T5.txt' for detailed logs.")

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import torch.nn as nn
import logging
import numpy as np
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Setting up logging
logging.basicConfig(filename='training_logs.txt', level=logging.INFO, format='%(asctime)s - %(message)s')
logging.info("Training started.")
# Dataset Class
class ConversationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)  # Use float for binary classification
        }
# Load Data
data = pd.read_csv("chat_summary_labels.csv")  # Load your CSV file with data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['T5'], data['labels'], test_size=0.2, random_state=42
)

# Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = ConversationDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_length=128)
test_dataset = ConversationDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, max_length=128)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)
# Model Class
class BertForBinaryClassification(nn.Module):
    def __init__(self):
        super(BertForBinaryClassification, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)  # Binary classification, single output
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = self.sigmoid(logits)  # Apply Sigmoid to the logits
        return probs

# Initialize model
model = BertForBinaryClassification()
model=model.to(device)
# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 20  # 20 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Binary Cross-Entropy Loss
criterion = nn.BCELoss()
# Training Function
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    for batch in tqdm(dataloader,"Training"):
        optimizer.zero_grad()
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            # 'labels': batch['label'].to(device)
        }
        outputs = model(**inputs)
        loss = criterion(outputs.squeeze(), batch['label'].to(device))  # Squeeze to match label shape
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Calculate accuracy
        preds = (outputs.squeeze() > 0.5).float()  # Get predictions as binary 0 or 1
        correct_preds += (preds == batch['label'].to(device)).sum().item()
        total_preds += len(batch['label'].to(device))

    epoch_loss = total_loss / len(dataloader)
    epoch_acc = correct_preds / total_preds
    return epoch_loss, epoch_acc
# Evaluation Function
from tqdm import tqdm
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in tqdm(dataloader,"Validation"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            outputs = model(**inputs)
            logits = outputs.squeeze()  # Squeeze for binary classification
            val_loss = criterion(outputs.squeeze(), batch['label'].to(device)) 
            preds = (logits > 0.5).cpu().numpy()  # Convert logits to binary (0 or 1)
            predictions.extend(preds)
            true_labels.extend(batch['label'].cpu().numpy())

            # Calculate accuracy
            correct_preds += (preds == batch['label'].cpu().numpy()).sum()
            total_preds += len(batch['label'])

    accuracy = correct_preds / total_preds
    return predictions, true_labels, val_loss, accuracy
# Training Loop
epochs = 10
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
    test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")
    logging.info(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")
test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)
print(test_acc)

In [26]:
# import pandas as pd
# import torch
# from torch.utils.data import DataLoader, Dataset
# from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import AdamW, get_scheduler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
# import torch.nn as nn
# import logging
# import numpy as np
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Setting up logging
# logging.basicConfig(filename='training_logs.txt', level=logging.INFO, format='%(asctime)s - %(message)s')
# logging.info("Training started.")
# # Dataset Class
# class ConversationDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]
#         encoding = self.tokenizer(
#             text, 
#             max_length=self.max_length, 
#             padding='max_length', 
#             truncation=True, 
#             return_tensors="pt"
#         )
#         return {
#             'input_ids': encoding['input_ids'].squeeze(0),
#             'attention_mask': encoding['attention_mask'].squeeze(0),
#             'label': torch.tensor(label, dtype=torch.float)  # Use float for binary classification
#         }
# # Load Data
# data = pd.read_csv("chat_summary_labels.csv")  # Load your CSV file with data
# train_texts, test_texts, train_labels, test_labels = train_test_split(
#     data['T5'], data['labels'], test_size=0.2, random_state=42
# )

# # Tokenizer and Dataset
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# train_dataset = ConversationDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_length=128)
# test_dataset = ConversationDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, max_length=128)

# # DataLoader
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=8)
# # Model Class
# class BertForBinaryClassification(nn.Module):
#     def __init__(self):
#         super(BertForBinaryClassification, self).__init__()
#         self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)  # Binary classification, single output
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
#         probs = self.sigmoid(logits)  # Apply Sigmoid to the logits
#         return probs

# # Initialize model
# model = BertForBinaryClassification()
# model=model.to(device)
# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=5e-5)
# num_training_steps = len(train_loader) * 20  # 20 epochs
# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# # Binary Cross-Entropy Loss
# criterion = nn.BCELoss()
# # Training Function
# from tqdm import tqdm

# def train_epoch(model, dataloader, optimizer, scheduler, device):
#     model.train()
#     total_loss = 0
#     correct_preds = 0
#     total_preds = 0
#     for batch in tqdm(dataloader,"Training"):
#         optimizer.zero_grad()
#         inputs = {
#             'input_ids': batch['input_ids'].to(device),
#             'attention_mask': batch['attention_mask'].to(device),
#             # 'labels': batch['label'].to(device)
#         }
#         outputs = model(**inputs)
#         loss = criterion(outputs.squeeze(), batch['label'].to(device))  # Squeeze to match label shape
#         loss.backward()
#         optimizer.step()
#         scheduler.step()
#         total_loss += loss.item()

#         # Calculate accuracy
#         preds = (outputs.squeeze() > 0.5).float()  # Get predictions as binary 0 or 1
#         correct_preds += (preds == batch['label'].to(device)).sum().item()
#         total_preds += len(batch['label'].to(device))

#     epoch_loss = total_loss / len(dataloader)
#     epoch_acc = correct_preds / total_preds
#     return epoch_loss, epoch_acc
# # Evaluation Function
# from tqdm import tqdm
# def evaluate(model, dataloader, device):
#     model.eval()
#     predictions, true_labels = [], []
#     correct_preds = 0
#     total_preds = 0
#     with torch.no_grad():
#         for batch in tqdm(dataloader,"Validation"):
#             inputs = {
#                 'input_ids': batch['input_ids'].to(device),
#                 'attention_mask': batch['attention_mask'].to(device)
#             }
#             outputs = model(**inputs)
#             logits = outputs.squeeze()  # Squeeze for binary classification
#             val_loss = criterion(outputs.squeeze(), batch['label'].to(device)) 
#             preds = (logits > 0.5).cpu().numpy()  # Convert logits to binary (0 or 1)
#             predictions.extend(preds)
#             true_labels.extend(batch['label'].cpu().numpy())

#             # Calculate accuracy
#             correct_preds += (preds == batch['label'].cpu().numpy()).sum()
#             total_preds += len(batch['label'])

#     accuracy = correct_preds / total_preds
#     return predictions, true_labels, val_loss, accuracy
# # Training Loop
# epochs = 10
# for epoch in range(epochs):
#     train_loss, train_acc = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
#     test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)
#     print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")
#     logging.info(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}, Val Loss = {test_loss}, Val Acc = {test_acc}")
# test_preds, test_labels, test_loss, test_acc = evaluate(model, test_loader, device)

In [30]:
import pandas as pd
data = pd.read_csv("chat_summary_labels.csv")  # Load your CSV file with data
data_labels = data['labels'].to_list()

In [31]:
import json
def get_data_in_list(path):
    data_entries = []
    with open(path, 'r') as file:
        data = json.load(file)
        for i in range(len(data['data'])):
            row_entry = data['data'][i]
            try:
                row_specific_list = []
                for dict in row_entry:
                    for key in dict.keys():
                        statement = str(key) +" : " + str(dict[key])
                        row_specific_list.append(statement)
                data_entries.append(row_specific_list)
            except Exception as err:
                new_row_entry = row_entry["conversations"]
                row_specific_list = []
                for dict in new_row_entry:
                    for key in dict.keys():
                        statement = str(key) +" : " + str(dict[key])
                        row_specific_list.append(statement)
                data_entries.append(row_specific_list)
    return data_entries
data_entries = get_data_in_list(path = "sales_conversations.json")

In [None]:
YAHAN SE HAI CODE :::::

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import logging
from tqdm import tqdm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Logging setup
logging.basicConfig(filename='custom_transformer_training_logs.txt', level=logging.INFO, format='%(asctime)s - %(message)s')

# Example Conversations and Labels (List of Lists)
conversations = data_entries[0:550]
labels = data_labels # Extend to match the number of conversation lists

# Tokenization and Vocabulary
class Tokenizer:
    def __init__(self):
        self.word2idx = {"[PAD]": 0, "[UNK]": 1}
        self.idx2word = {0: "[PAD]", 1: "[UNK]"}
        self.word_count = {}

    def fit(self, texts):
        for text_list in texts:
            for text in text_list:  # Iterate over each conversation in the list
                for word in text.split():
                    if word not in self.word2idx:
                        idx = len(self.word2idx)
                        self.word2idx[word] = idx
                        self.idx2word[idx] = word
                    self.word_count[word] = self.word_count.get(word, 0) + 1

    def encode(self, text, max_len):
        tokens = [self.word2idx.get(word, 1) for word in text.split()]
        tokens = tokens[:max_len]
        return tokens + [0] * (max_len - len(tokens))

    def vocab_size(self):
        return len(self.word2idx)

# Instantiate and fit tokenizer
tokenizer = Tokenizer()
tokenizer.fit(conversations)

# Dataset Class
class ConversationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        # Flatten the list of lists into a single list of conversations
        self.texts = [item for sublist in texts for item in sublist]
        self.labels = labels * len(texts)  # Duplicate the labels for each sublist
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode(text, self.max_len)
        return {
            "input_ids": torch.tensor(encoding, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.float)
        }

# Split Data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    conversations, labels, test_size=0.2, random_state=42
)

# Create Datasets and Loaders
max_len = 50
train_dataset = ConversationDataset(train_texts, train_labels, tokenizer, max_len)
test_dataset = ConversationDataset(test_texts, test_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Transformer Components
class ScaledDotProductAttention(nn.Module):
    def forward(self, query, key, value, mask=None):
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = torch.softmax(scores, dim=-1)
        return torch.matmul(attention, value), attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        query = self.query(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        key = self.key(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        value = self.value(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        attn_output, _ = ScaledDotProductAttention()(query, key, value, mask)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.fc(attn_output)

class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, dropout_rate):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

# Transformer Model
class CustomTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers, max_len, num_classes, dropout_rate=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, d_model))
        self.layers = nn.ModuleList([TransformerBlock(d_model, n_heads, dropout_rate) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        x = self.embedding(input_ids) + self.positional_encoding[:, :input_ids.size(1), :]
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)  # Global average pooling
        return self.sigmoid(self.fc(x))

# Initialize Model
vocab_size = tokenizer.vocab_size()
d_model = 128
n_heads = 4
n_layers = 2
num_classes = 1
model = CustomTransformer(vocab_size, d_model, n_heads, n_layers, max_len, num_classes).to(device)

# Training Setup
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCELoss()

# Training and Evaluation Functions
def train_model(model, loader):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_model(model, loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids).squeeze()
            preds = (outputs > 0.5).float()

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions)

# Training Loop
epochs = 10
for epoch in range(epochs):
    train_loss = train_model(model, train_loader)
    val_acc = evaluate_model(model, test_loader)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Validation Accuracy = {val_acc:.4f}")
    logging.info(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Validation Accuracy = {val_acc:.4f}")
    torch.save(model.state_dict(), f"model_epoch_{epoch+1}.pth")

torch.save(model.state_dict(), f"final_model.pth")

Using device: cuda


Training: 100%|██████████| 644/644 [00:02<00:00, 289.80it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1211.09it/s]


Epoch 1: Train Loss = 0.7029, Validation Accuracy = 0.4450


Training: 100%|██████████| 644/644 [00:02<00:00, 283.73it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1223.24it/s]


Epoch 2: Train Loss = 0.6957, Validation Accuracy = 0.5550


Training: 100%|██████████| 644/644 [00:02<00:00, 293.42it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1121.59it/s]


Epoch 3: Train Loss = 0.6924, Validation Accuracy = 0.4870


Training: 100%|██████████| 644/644 [00:02<00:00, 315.33it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1172.13it/s]


Epoch 4: Train Loss = 0.6874, Validation Accuracy = 0.5420


Training: 100%|██████████| 644/644 [00:02<00:00, 300.98it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1232.39it/s]


Epoch 5: Train Loss = 0.6651, Validation Accuracy = 0.5428


Training: 100%|██████████| 644/644 [00:02<00:00, 273.74it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1101.06it/s]


Epoch 6: Train Loss = 0.6317, Validation Accuracy = 0.4824


Training: 100%|██████████| 644/644 [00:02<00:00, 249.91it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1200.61it/s]


Epoch 7: Train Loss = 0.5890, Validation Accuracy = 0.5054


Training: 100%|██████████| 644/644 [00:02<00:00, 293.96it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1235.83it/s]


Epoch 8: Train Loss = 0.5538, Validation Accuracy = 0.4939


Training: 100%|██████████| 644/644 [00:02<00:00, 316.14it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1190.42it/s]


Epoch 9: Train Loss = 0.5148, Validation Accuracy = 0.5023


Training: 100%|██████████| 644/644 [00:02<00:00, 292.08it/s]
Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1229.78it/s]


Epoch 10: Train Loss = 0.4788, Validation Accuracy = 0.4946


In [37]:
s = evaluate_model(model, test_loader)
print(s)

Evaluating: 100%|██████████| 164/164 [00:00<00:00, 1140.93it/s]

0.49464831804281345





In [2]:
print(tokenizer.vocab_size())

NameError: name 'tokenizer' is not defined

In [None]:
# Example conversation
example_conversation = [
    "Hi, I'm looking to buy a new phone. Can you help me with that?",
    "Sure! What features are you looking for?"
]

# Tokenize and encode the example conversation
encoded_example = tokenizer.encode(" ".join(example_conversation), max_len)

# Convert to tensor and send to the device
input_tensor = torch.tensor([encoded_example], dtype=torch.long).to(device)

# Set the model to evaluation mode
model.eval()

# Get the prediction (probability of customer satisfaction)
with torch.no_grad():
    output = model(input_tensor).squeeze()  # Squeeze to get a single value
    prediction = (output > 0.5).float()  # Convert to 0 or 1 based on threshold
    predicted_score = output.item()  # Convert the output tensor to a scalar

# Print the prediction
print(f"Predicted Customer Satisfaction Probability: {predicted_score:.4f}")
print(f"Predicted Class (Satisfaction): {'Satisfied' if prediction.item() == 1 else 'Not Satisfied'}")


Predicted Customer Satisfaction Probability: 0.1716
Predicted Class (Satisfaction): Not Satisfied


In [None]:
def string_conversation_to_list():

In [None]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)



In [None]:
from tqdm import tqdm
def get_summary_list(input_text, tokenizer, model):
    conversations
    input_text = " ".join(conversations)
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=350,  # Adjust depending on the desired summary length
        min_length=40,   # Minimum length of the summary
        num_beams=4,     # Number of beams for beam search
        length_penalty=2.0
        
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer_1 = AutoTokenizer.from_pretrained("kabita-choudhary/finetuned-bart-for-conversation-summary")
model_1 = AutoModelForSeq2SeqLM.from_pretrained("kabita-choudhary/finetuned-bart-for-conversation-summary").to(device)
print(model_1.config)

