In [None]:
# Setup and verification: announce project start-up, followed by a 50-char rule.
print("Intializing sentiment analysis project", "=" * 50, sep="\n")
#  import packages
import torch
import numpy as np
from datasets import load_dataset
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score
import gradio as gr
import os
from google.colab import drive
# Verify the environment: report the PyTorch version and GPU status.
print("Enviroment checked")
print(f"pytorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
  print(f"GPU device: {torch.cuda.get_device_name(0)}")
  print(f"Gpu memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Create the project structure unconditionally. These directories are needed
# whether or not a GPU is present, so they must not live inside the GPU branch.
os.makedirs('/content/models', exist_ok=True)
os.makedirs('/content/results', exist_ok=True)
print("Project directories created")

print("Ready to start project")
print("=" *50)

In [None]:
# Data loading and exploration: section title followed by a 50-char rule.
print("Loading and exploring dataset", "=" * 50, sep="\n")
def load_and_explore_data():
  """Load the IMDB dataset and print a short exploratory summary.

  Prints the size of the ``train``, ``test`` and ``unsupervised`` splits, a
  100-character preview of the first three training examples, and the
  positive/negative label distribution of the training split (label ``1`` is
  reported as Positive, anything else as Negative).

  Returns:
    The object returned by ``load_dataset("imdb")``.
  """
  print("1. Downloading IMDB dataset...")
  dataset = load_dataset("imdb")

  # Report every split under its own name.
  print("2. Dataset structure:")
  print(f" -Train samples: {len(dataset['train']):,}")
  print(f" -Test samples: {len(dataset['test']):,}")
  print(f" -Unsupervised samples: {len(dataset['unsupervised']):,}")

  print("3. Sample data preview:")
  sample_data = dataset['train'].select(range(3))
  for i, example in enumerate(sample_data):
    sentiment = 'Positive' if example['label'] == 1 else 'Negative'
    print(f" Sample{i+1}:")
    print(f" Text: {example['text'][:100]}...")
    print(f" label: {example['label']} ({sentiment})")
    print()

  # Label distribution of the training split. Labels are summed, so the
  # positive count is the number of 1-labels when labels are 0/1.
  train_labels = dataset['train']['label']
  total = len(train_labels)
  positive_count = sum(train_labels)
  negative_count = total - positive_count
  # Guard the percentage against an empty split instead of dividing by zero.
  positive_pct = positive_count / total * 100 if total else 0.0
  negative_pct = negative_count / total * 100 if total else 0.0
  print("4. Label distribution:")
  print(f"   - Positive reviews: {positive_count:,} ({positive_pct:.1f}%)")
  print(f"   - Negative reviews: {negative_count:,} ({negative_pct:.1f}%)")

  return dataset

# Run the loader once at module level; the returned `dataset` is what the
# later preprocessing cell passes to `preprocess_data`.
dataset = load_and_explore_data()
print("Data set loaded successfully")

In [None]:
# Model initialization: section title followed by a 35-char rule.
print("Intialize BERT model", "=" * 35, sep="\n")

def initialize_bert_model(model_name="distilbert-base-uncased"):
  """Load a tokenizer and a two-label sequence-classification model.

  The model is created with ``num_labels=2`` and the label mapping
  ``0 -> "Negative"``, ``1 -> "Positive"``. It is moved to the GPU when CUDA is
  available and left on the CPU otherwise.

  Args:
    model_name: Checkpoint name passed to both ``from_pretrained`` calls.

  Returns:
    tuple: ``(tokenizer, model)``.
  """
  print(f"1. Loading tokenizer: {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  print("2. Loading pre-trained model...")
  model = AutoModelForSequenceClassification.from_pretrained(
      model_name,
      num_labels=2,
      id2label={0: "Negative", 1: "Positive"},
      label2id={"Negative": 0, "Positive": 1},
  )

  print("3. Model architecture overview:")
  print(f" Model type: {model.__class__.__name__}")
  print(f" Number of parameters: {model.num_parameters():,}")
  print(f" Number of labels: {model.config.num_labels}")

  print("4. Moving model to GPU...")
  # is_available must be *called*; the bare function object is always truthy.
  if torch.cuda.is_available():
    model = model.to('cuda')
    print(" Model succesfully moved to GPU")
  else:
    print(" No GPU available; model stays on CPU")

  # Return on every path so CPU-only runs also get the tokenizer and model.
  return tokenizer, model


# Initialize the model at module level so later cells can use `tokenizer` and `model`.
tokenizer, model = initialize_bert_model()
print("BERT model intialized successfully")

In [None]:
# Cell 4: Data Preprocessing & Tokenization (section title plus a 45-char rule).
print("DATA PREPROCESSING & TOKENIZATION", "=" * 45, sep="\n")

def _describe_input_ids(input_ids):
    """Summarize one example's ``input_ids`` as a type plus its length or shape."""
    shape = getattr(input_ids, "shape", None)
    if shape is not None:
        return f"{type(input_ids).__name__} with shape {tuple(shape)}"
    return f"{type(input_ids).__name__} of length {len(input_ids)}"


def preprocess_data(tokenizer, dataset, max_length=256):
    """Tokenize the ``train`` and ``test`` splits of ``dataset``.

    Each split is mapped in batches of 1000 through ``tokenizer`` applied to the
    ``"text"`` column, padding and truncating every example to ``max_length``.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=...)``.
        dataset: Mapping with ``"train"`` and ``"test"`` entries that provide
            ``.map(fn, batched=True, batch_size=...)``.
        max_length: Sequence length to pad/truncate to (default 256).

    Returns:
        tuple: ``(tokenized_train, tokenized_test)``.
    """
    print("1. Defining tokenization function...")

    def tokenize_function(examples):
        # Tensors are deliberately not requested here: rows read back from the
        # mapped dataset were not tensors anyway (calling `.shape` on them
        # failed), so per-batch tensor construction was wasted work.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    print("2. Tokenizing training dataset...")
    tokenized_train = dataset["train"].map(
        tokenize_function,
        batched=True,
        batch_size=1000
    )

    print("3. Tokenizing test dataset...")
    tokenized_test = dataset["test"].map(
        tokenize_function,
        batched=True,
        batch_size=1000
    )

    print("4. Dataset overview after tokenization:")
    print(f"   - Training features: {list(tokenized_train.features.keys())}")
    # Only peek at the first example when there is one.
    if len(tokenized_train) > 0:
        summary = _describe_input_ids(tokenized_train[0]["input_ids"])
        print(f"   - Input ids: {summary}")

    return tokenized_train, tokenized_test

# Preprocess data: tokenize the train and test splits.
# Depends on the module-level `tokenizer` and `dataset` bound by earlier cells.
tokenized_train, tokenized_test = preprocess_data(tokenizer, dataset)
print("DATA PREPROCESSING COMPLETED")