In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup

warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV into a DataFrame used by the exploratory cells below.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the
# drive is mounted before running this cell.
df = pd.read_csv("/content/drive/MyDrive/train_data.csv")

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# List the column labels available in the training data.
df.columns

In [None]:
# Count rows per distinct 'is_recommended' value (the column later used as
# the training target by load_and_prepare_data).
df['is_recommended'].value_counts()

In [None]:
# Count rows per distinct 'sentiment' value.
df['sentiment'].value_counts()

In [None]:
# Show 'is_recommended' next to 'sentiment' for the first rows to compare
# the two label columns.
df[['is_recommended', 'sentiment']].head()

In [None]:
# Load and prepare the dataset
def load_and_prepare_data(df):
    """
    Prepare dataframe for sentiment analysis.

    Rows missing 'review_text' or 'is_recommended' are dropped, a
    'full_review' text column and an integer 'target' column are added, and
    basic class statistics are printed. The dataframe passed in is left
    unmodified; a new dataframe is returned.

    Args:
        df: DataFrame with 'review_text' and 'is_recommended' columns and an
            optional 'review_title' column.

    Returns:
        A new DataFrame with the retained rows plus the 'full_review' and
        'target' columns.
    """
    print("Preparing data...")

    # Check missing values in important columns
    print("Missing values:")
    print(df[['review_text', 'is_recommended']].isnull().sum())

    # Drop rows with missing reviews or target values. The explicit copy
    # makes the column assignments below operate on an independent frame
    # instead of on a possible view of the caller's data.
    df = df.dropna(subset=['review_text', 'is_recommended']).copy()

    # Cast to str so non-string cells (e.g. numeric reviews) can be joined.
    review_text = df['review_text'].astype(str)

    # Combine review title and text if title exists
    if 'review_title' in df.columns:
        title = df['review_title'].fillna('').astype(str)
        has_title = title.str.strip().ne('')
        # Only prepend "<title> " where a title is present, so rows without
        # one do not start with a stray space.
        df['full_review'] = review_text.where(~has_title, title + " " + review_text)
    else:
        df['full_review'] = review_text

    # Convert target to binary (0 or 1)
    df['target'] = df['is_recommended'].astype(int)

    # Print basic statistics
    total = len(df)
    recommended = df['target'].sum()
    recommended_share = df['target'].mean()
    print(f"Total samples: {total}")
    print(f"Recommended count: {recommended} ({recommended_share*100:.1f}%)")
    print(f"Not recommended count: {total - recommended} ({(1-recommended_share)*100:.1f}%)")

    return df

# Create PyTorch dataset
class ReviewDataset(Dataset):
    """
    Dataset that tokenizes one review per item.

    Args:
        reviews: Sequence of review texts (list, pandas Series, ...).
        targets: Sequence of integer labels, same length as ``reviews``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: Length every encoded review is padded/truncated to.

    Raises:
        ValueError: If ``reviews`` and ``targets`` differ in length.
    """

    def __init__(self, reviews, targets, tokenizer, max_len=128):
        # Materialize as lists so that __getitem__ indexes by position even
        # when a pandas Series with a non-default index is passed in.
        self.reviews = list(reviews)
        self.targets = list(targets)
        if len(self.reviews) != len(self.targets):
            raise ValueError(
                f"reviews and targets must have the same length, "
                f"got {len(self.reviews)} and {len(self.targets)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the encoded review and its label at position ``idx``."""
        review = str(self.reviews[idx])
        target = self.targets[idx]

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch dimension of 1;
        # flatten it so the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.long)
        }

# Training function with progress bar
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """
    Run one training pass over ``data_loader``.

    For every batch the model is called with 'input_ids', 'attention_mask'
    and labels, gradients are clipped to a max norm of 1.0, and both the
    optimizer and the scheduler are stepped.

    Args:
        model: Model whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'target' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)`` where ``accuracy`` is a 0-dim double
        tensor. For an empty loader the result is ``(nan, tensor(0.))``.
    """
    model.train()
    losses = []
    # Tensor accumulator so the final `.double()` works even if the loader
    # yields no batches (an int accumulator would raise AttributeError).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0

    progress_bar = tqdm(data_loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        preds = outputs.logits.argmax(dim=1)

        batch_size = len(targets)
        batch_correct = (preds == targets).sum()
        correct_predictions += batch_correct
        total_predictions += batch_size

        loss_value = loss.item()
        losses.append(loss_value)

        # Update progress bar with current loss and accuracy
        current_acc = batch_correct.double() / batch_size
        progress_bar.set_postfix({'loss': f"{loss_value:.4f}", 'acc': f"{current_acc:.4f}"})

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    if total_predictions == 0:
        # Nothing was processed: report an undefined loss and zero accuracy
        # instead of dividing by zero.
        return float('nan'), torch.tensor(0.0, dtype=torch.double)

    return np.mean(losses), correct_predictions.double() / total_predictions

# Evaluation function
def eval_model(model, data_loader, device):
    """
    Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Model whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'target' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, predictions, actual_labels)`` where
        ``accuracy`` is a 0-dim double tensor and the last two items are
        plain Python lists of class indices. For an empty loader the result
        is ``(nan, tensor(0.), [], [])``.
    """
    model.eval()
    losses = []
    # Tensor accumulator so the final `.double()` works even if the loader
    # yields no batches (an int accumulator would raise AttributeError).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    total_predictions = 0

    predictions = []
    actual_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            preds = outputs.logits.argmax(dim=1)

            correct_predictions += (preds == targets).sum()
            total_predictions += len(targets)

            losses.append(outputs.loss.item())

            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(targets.cpu().tolist())

    if total_predictions == 0:
        # Nothing was evaluated: report an undefined loss and zero accuracy
        # instead of dividing by zero.
        return float('nan'), torch.tensor(0.0, dtype=torch.double), predictions, actual_labels

    accuracy = correct_predictions.double() / total_predictions
    return np.mean(losses), accuracy, predictions, actual_labels

# Model training function
def _build_loader(frame, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a prepared dataframe in a ReviewDataset served by a DataLoader."""
    dataset = ReviewDataset(
        reviews=frame['full_review'].to_list(),
        targets=frame['target'].to_list(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2
    )


def _plot_confusion_matrix(actual_labels, predictions):
    """Draw the confusion matrix heatmap and save it to confusion_matrix.png."""
    cm = confusion_matrix(actual_labels, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')


def _plot_training_metrics(training_stats):
    """Draw per-epoch loss/accuracy curves and save them to training_metrics.png."""
    stats_df = pd.DataFrame(training_stats)
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(stats_df['epoch'], stats_df['train_loss'], label='Train')
    plt.plot(stats_df['epoch'], stats_df['val_loss'], label='Validation')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(stats_df['epoch'], stats_df['train_acc'], label='Train')
    plt.plot(stats_df['epoch'], stats_df['val_acc'], label='Validation')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    plt.tight_layout()
    plt.savefig('training_metrics.png')


def train_sentiment_model(train_file, val_file, test_file, model_save_path='best_model.bin',
                          *, max_len=128, batch_size=16, epochs=3, learning_rate=3e-5,
                          patience=2, random_seed=42):
    """
    Fine-tune 'distilbert-base-uncased' on review CSVs and evaluate it.

    The three CSV files are loaded and passed through load_and_prepare_data.
    The model is trained with AdamW and a linear schedule without warmup;
    the weights with the best validation accuracy are written to
    ``model_save_path`` and reloaded before the test evaluation. A
    classification report is printed, and 'confusion_matrix.png' and
    'training_metrics.png' are saved in the working directory.

    Args:
        train_file: Path of the training CSV.
        val_file: Path of the validation CSV.
        test_file: Path of the test CSV.
        model_save_path: Where the best model's state dict is saved.
        max_len: Token length reviews are padded/truncated to.
        batch_size: Batch size for all loaders (adjust to GPU memory).
        epochs: Maximum number of training epochs.
        learning_rate: AdamW learning rate.
        patience: Epochs without validation improvement before stopping.
        random_seed: Seed applied to NumPy and PyTorch.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when one of
        the CSV files cannot be found.
    """
    # Set random seeds
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    # Resolve the device locally; the function previously relied on a global
    # `device` that this file never defines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load data files
    try:
        print(f"Loading training data from {train_file}...")
        train_df_raw = pd.read_csv(train_file)

        print(f"Loading validation data from {val_file}...")
        val_df_raw = pd.read_csv(val_file)

        print(f"Loading test data from {test_file}...")
        test_df_raw = pd.read_csv(test_file)
    except FileNotFoundError as e:
        print(f"File not found: {str(e)}")
        return None, None

    # Apply preprocessing to each dataset
    train_df = load_and_prepare_data(train_df_raw)
    val_df = load_and_prepare_data(val_df_raw)
    test_df = load_and_prepare_data(test_df_raw)

    print(f"Training set: {len(train_df)} samples")
    print(f"Validation set: {len(val_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # Load tokenizer and model
    print("Loading DistilBERT model and tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    model = model.to(device)

    # Create data loaders; only the training data is shuffled
    train_loader = _build_loader(train_df, tokenizer, max_len, batch_size, shuffle=True)
    val_loader = _build_loader(val_df, tokenizer, max_len, batch_size)
    test_loader = _build_loader(test_df, tokenizer, max_len, batch_size)

    # Optimization settings
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop with early stopping. Starting from -inf guarantees the
    # first evaluated epoch is checkpointed, so a checkpoint exists even if
    # validation accuracy is 0.
    best_accuracy = float('-inf')
    checkpoint_saved = False
    training_stats = []
    no_improvement_count = 0

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        print('-' * 30)

        # Training phase
        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, device
        )

        print(f'Train loss: {train_loss:.4f}, accuracy: {train_acc:.4f}')

        # Validation phase
        val_loss, val_acc, _, _ = eval_model(
            model, val_loader, device
        )

        print(f'Val loss: {val_loss:.4f}, accuracy: {val_acc:.4f}')

        val_acc_value = val_acc.item()

        # Save training statistics
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc.item(),
            'val_loss': val_loss,
            'val_acc': val_acc_value
        })

        # Save best model
        if val_acc_value > best_accuracy:
            torch.save(model.state_dict(), model_save_path)
            best_accuracy = val_acc_value
            checkpoint_saved = True
            no_improvement_count = 0
            print(f"New best model saved to {model_save_path}!")
        else:
            no_improvement_count += 1
            print(f"No improvement for {no_improvement_count} epochs")

        # Early stopping
        if no_improvement_count >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break

    # Load best model for final evaluation. Skip the reload when this run
    # wrote no checkpoint, so a stale file at the same path is never loaded.
    if checkpoint_saved:
        print("\nLoading best model for evaluation...")
        # map_location keeps the load working when the checkpoint device
        # differs from the current one.
        model.load_state_dict(torch.load(model_save_path, map_location=device))
    else:
        print("\nNo checkpoint was saved in this run; evaluating current weights...")

    test_loss, test_acc, predictions, actual_labels = eval_model(
        model, test_loader, device
    )

    print(f'\nTest accuracy: {test_acc:.4f}')
    print('\nClassification Report:')
    print(classification_report(actual_labels, predictions))

    # Plot confusion matrix
    _plot_confusion_matrix(actual_labels, predictions)

    # Plot training and validation metrics (nothing to plot without epochs)
    if training_stats:
        _plot_training_metrics(training_stats)

    return model, tokenizer

# Create a prediction function
def create_prediction_function(model, tokenizer, max_len=128):
    """
    Build a single-review predictor bound to ``model`` and ``tokenizer``.

    Args:
        model: Classifier whose output exposes ``.logits``; class index 1 is
            reported as 'Recommended'.
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask' tensors.
        max_len: Token length the review is padded/truncated to.

    Returns:
        A function mapping a review text to 'Recommended' or
        'Not Recommended'.
    """
    def predict_recommendation(review_text):
        model.eval()

        # Run on whatever device the model lives on rather than on a global
        # `device`, which this file never defines. A model without
        # parameters falls back to the CPU.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoded_review = tokenizer(
            str(review_text),
            max_length=max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

        return 'Recommended' if preds.item() == 1 else 'Not Recommended'

    return predict_recommendation

# Run the full pipeline
if __name__ == "__main__":
    # Configuration - update with your file paths
    train_file = '/content/drive/MyDrive/train_data.csv'
    val_file = '/content/drive/MyDrive/val_data.csv'
    test_file = '/content/drive/MyDrive/test_data.csv'
    model_save_path = '/content/drive/MyDrive/best_sentiment_model.bin'

    # Train model with all data files
    model, tokenizer = train_sentiment_model(train_file, val_file, test_file, model_save_path)

    # train_sentiment_model returns (None, None) when a data file is missing.
    # Compare against None explicitly instead of relying on the truthiness
    # of the model/tokenizer objects.
    if model is not None and tokenizer is not None:
        # Create prediction function
        predict = create_prediction_function(model, tokenizer)

        # Test with some examples
        examples = [
            "This product is amazing! I love it so much. It works perfectly.",
            "Waste of money. Broke after a week and customer service was terrible.",
            "It's okay, not the best but not the worst either."
        ]

        print("\nPrediction Examples:")
        for example in examples:
            print(f"Review: {example}")
            print(f"Prediction: {predict(example)}\n")