<a href="https://colab.research.google.com/github/MF884/Machine-Learning-Projects/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.optim import AdamW  # Correct import location
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup


# Run-wide configuration.
# The optimizer, scheduler and total step count are built inside main(), once
# the model and data loaders exist; constructing them at module level would
# reference names that are not defined yet and fail with NameError.
warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Constants
MAX_LEN = 256  # tokenizer max_length: sequences are truncated/padded to this
BATCH_SIZE = 16 if torch.cuda.is_available() else 8  # smaller batches on CPU
EPOCHS = 20  # full passes over the training loader
LEARNING_RATE = 1e-5  # AdamW learning rate
THRESHOLD = 0.5  # sigmoid probability above which a label is predicted

# 1. Enhanced Data Processor with Shape Validation
class DataProcessor:
    """Load tweets from a CSV file and attach keyword-derived labels.

    Labels come from case-insensitive substring matching against the keyword
    tables built in ``__init__``: exactly one sentiment label ('positive',
    'negative' or 'neutral') followed by zero or more issue labels
    ('technical', 'service', 'performance').
    """

    def __init__(self):
        # keyword -> weight. The weights of all keywords found in a text are
        # summed per polarity and the larger total decides the sentiment.
        self.sentiment_keywords = {
            'positive': {'thanks': 2, 'great': 2, 'solved': 3, 'love': 2, '😊': 3},
            'negative': {'😡': 3, 'sucks': 3, 'hate': 3, 'broken': 2, 'painful': 2}
        }
        # issue label -> keywords. A single hit is enough to assign the label.
        self.issue_keywords = {
            'technical': ['broken', 'crash', 'freezes', 'bug', 'error'],
            'service': ['delay', 'refund', 'callback', 'support'],
            'performance': ['battery', 'slow', 'speed', 'drains']
        }

    def load_and_validate_data(self, file_path):
        """Read the CSV at *file_path*, clean its 'text' column and add 'labels'.

        Rows with missing text are dropped, URLs (``http`` up to the next
        whitespace) are removed, and rows left with no visible text afterwards
        are dropped too, so a URL-only tweet does not turn into an empty
        sample labelled 'neutral'.

        Args:
            file_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
                file with a 'text' column.

        Returns:
            DataFrame with the cleaned 'text' column and a 'labels' column
            holding one list of label strings per row. The surviving rows keep
            their original index values.

        Raises:
            ValueError: If the text and label counts differ.
            Errors raised by pandas (unreadable file, missing 'text' column)
            propagate unchanged.
        """
        df = pd.read_csv(file_path)

        # Drop missing values before touching the .str accessor: a column that
        # is entirely NaN is float-typed and would make .str raise.
        df = df.dropna(subset=['text'])
        df['text'] = df['text'].astype(str).str.replace(r'http\S+', '', regex=True)

        # URL stripping can leave blank strings behind; remove those rows.
        # .copy() so the column assignment below writes to an independent frame.
        df = df[df['text'].str.strip() != ''].copy()

        # Generate labels
        df['labels'] = df['text'].apply(self.generate_labels)

        # Verify all samples got labels (explicit raise: asserts vanish under -O)
        if len(df['text']) != len(df['labels']):
            raise ValueError("Text and labels length mismatch")

        return df

    def generate_labels(self, text):
        """Return the label list for *text*: one sentiment, then matching issues.

        Matching is by substring on the lower-cased text, so a keyword also
        matches inside a longer word. Ties between the positive and negative
        scores (including 0 vs 0) yield 'neutral'.

        Args:
            text: The tweet text as a ``str``.

        Returns:
            A list whose first element is the sentiment label, followed by
            issue labels in the order they are declared in ``issue_keywords``.
        """
        text_lower = text.lower()

        def polarity_score(polarity):
            # Sum of the weights of every keyword of this polarity in the text.
            return sum(
                weight
                for word, weight in self.sentiment_keywords[polarity].items()
                if word in text_lower
            )

        # Sentiment
        pos_score = polarity_score('positive')
        neg_score = polarity_score('negative')
        if pos_score > neg_score:
            sentiment = 'positive'
        elif neg_score > pos_score:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'
        labels = [sentiment]

        # Issues
        labels.extend(
            issue_type
            for issue_type, keywords in self.issue_keywords.items()
            if any(keyword in text_lower for keyword in keywords)
        )

        return labels

# 2. Dataset Class with Shape Validation
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its labels.

    Each item is a dict with 'input_ids' and 'attention_mask' (1-D tensors as
    returned by the tokenizer, flattened) and 'labels' (a float32 tensor).

    Args:
        texts: Sequence of texts; a pandas Series (any index) or a plain
            list-like indexable by position.
        labels: Per-text label vectors; a NumPy array, a list, or a pandas
            Series/DataFrame (any index).
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        max_len: Length every sequence is truncated/padded to.

    Raises:
        AssertionError: If *texts* and *labels* differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        assert len(texts) == len(labels), f"Texts ({len(texts)}) and labels ({len(labels)}) length mismatch"
        # __getitem__ addresses samples by position 0..len-1. Inputs that come
        # out of a shuffled split carry arbitrary pandas index values, so both
        # containers are normalised to positional access here.
        self.texts = texts.reset_index(drop=True) if hasattr(texts, 'reset_index') else texts
        self.labels = labels.to_numpy() if hasattr(labels, 'to_numpy') else labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position *idx* and return it with its label tensor."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor always treats its argument as data, whereas the
            # legacy torch.FloatTensor(x) treats a scalar x as a size.
            'labels': torch.tensor(label, dtype=torch.float32)
        }

# 3. Model Training with Safe Splitting
def safe_train_test_split(texts, labels, test_size=0.2):
    """Split texts and labels together after checking they have equal length.

    The split is seeded (random_state=42) and not stratified. The result is
    whatever ``train_test_split`` returns for the two inputs, i.e.
    texts-train, texts-test, labels-train, labels-test.

    Raises:
        AssertionError: If *texts* and *labels* differ in length.
    """
    n_texts, n_labels = len(texts), len(labels)
    assert n_texts == n_labels, f"Mismatched lengths: texts {n_texts}, labels {n_labels}"

    split_options = {
        'test_size': test_size,
        'random_state': 42,
        'stratify': None,  # Removing stratification to ensure it works
    }
    return train_test_split(texts, labels, **split_options)

def _model_inputs(batch):
    """Return the tensors the model consumes from *batch*, moved to ``device``."""
    return {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }


def main(data_path='/content/drive/My Drive/Colab Notebooks/sample.csv'):
    """Train and evaluate a multi-label BERT classifier on keyword-labelled tweets.

    Steps: load and label the CSV, binarize the label lists, split
    60/20/20 into train/validation/test, fine-tune 'bert-base-uncased' for
    EPOCHS epochs while printing train/validation loss, then print a
    classification report on the test split.

    Loading, splitting and dataset-construction failures are reported with a
    printed message and end the run early; nothing is returned.

    Args:
        data_path: CSV file with a 'text' column. Defaults to the Colab Drive
            sample file.
    """
    # Initialize components
    processor = DataProcessor()

    # Load and validate data
    try:
        df = processor.load_and_validate_data(data_path)
    except Exception as e:
        print(f"Data loading failed: {e}")
        return

    # Prepare labels
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['labels'])
    print(f"Data shapes - X: {len(df)}, y: {len(y)}")

    # Safe splitting: 20% test, then 25% of the remainder (= 20% overall) for
    # validation. train_test_split reports a too-small dataset with
    # ValueError, so that is handled alongside the length assertion.
    try:
        X_train, X_test, y_train, y_test = safe_train_test_split(df['text'], y)
        X_train, X_val, y_train, y_val = safe_train_test_split(X_train, y_train, test_size=0.25)
        print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
    except (AssertionError, ValueError) as e:
        print(f"Splitting failed: {e}")
        return

    # Initialize tokenizer and datasets
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    try:
        train_dataset = TweetDataset(X_train, y_train, tokenizer, MAX_LEN)
        val_dataset = TweetDataset(X_val, y_val, tokenizer, MAX_LEN)
        test_dataset = TweetDataset(X_test, y_test, tokenizer, MAX_LEN)
    except AssertionError as e:
        print(f"Dataset creation failed: {e}")
        return

    # Data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Model setup: one output logit per binarized label
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(mlb.classes_),
        problem_type="multi_label_classification"
    ).to(device)

    # Optimizer and scheduler: linear warm-up over the first 10% of steps,
    # then linear decay; the scheduler is stepped once per batch.
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )
    loss_fn = nn.BCEWithLogitsLoss().to(device)

    # Training loop
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            optimizer.zero_grad()

            # Labels are deliberately not passed to the model: the loss is
            # computed once, here, from the logits.
            logits = model(**_model_inputs(batch)).logits
            loss = loss_fn(logits, batch['labels'].to(device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                logits = model(**_model_inputs(batch)).logits
                val_loss += loss_fn(logits, batch['labels'].to(device)).item()

        print(f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            logits = model(**_model_inputs(batch)).logits
            # A label is predicted when its sigmoid probability exceeds THRESHOLD.
            preds = (torch.sigmoid(logits) > THRESHOLD).int().cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(batch['labels'].numpy())

    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=mlb.classes_, zero_division=0))

if __name__ == "__main__":
    main()