In [None]:
# Setup and verification: announce start-up before the (slow) library imports run
print("Intializing sentiment analysis project", "=" * 50, sep="\n")
#  import packages
import torch
import numpy as np
from datasets import load_dataset
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score
import gradio as gr
import os
from google.colab import drive
# Verify the runtime environment and report which accelerator (if any) is present
print("Enviroment checked")
print(f"pytorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
  print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Create the project structure unconditionally: the output folders are needed
# on CPU-only runtimes too, so this must not be nested under the GPU check.
os.makedirs('/content/models', exist_ok=True)
os.makedirs('/content/results', exist_ok=True)
print("Project directories created")

print("Ready to start project")
print("=" *50)

In [None]:
# data loading and exploration
print("Loading and exploring dataset")
print("=" *50)
def load_and_explore_data():
  """Load the IMDB dataset and print a short exploratory summary.

  Prints the size of every split, a preview of the first training samples
  and the positive/negative label balance of the training split.

  Returns:
    The object returned by ``load_dataset("imdb")``; it is indexed here by
    the split names ``'train'``, ``'test'`` and ``'unsupervised'``.
  """
  print("1. Downloading IMDB dataset...")
  dataset = load_dataset("imdb")

  print("2. Dataset structure:")
  print(f" -Train samples: {len(dataset['train']):,}")
  print(f" -Test samples: {len(dataset['test']):,}")
  # The 'unsupervised' split is not a validation set, so label it by its own name.
  print(f" -Unsupervised samples: {len(dataset['unsupervised']):,}")

  print("3. Sample data preview:")
  # Never ask for more rows than the split holds
  preview_size = min(3, len(dataset['train']))
  sample_data = dataset['train'].select(range(preview_size))
  for i, example in enumerate(sample_data):
    print(f" Sample{i+1}:")
    print(f" Text: {example['text'][:100]}...")
    print(f" label: {example['label']} ({'Positive' if example['label'] == 1 else 'Negative'})")
    print()

  # Label distribution: labels are 0/1, so their sum is the positive count
  train_labels = dataset['train']['label']
  total = len(train_labels)
  positive_count = sum(train_labels)
  negative_count = total - positive_count
  print("4. Label distribution:")
  if total:
    print(f"   - Positive reviews: {positive_count:,} ({positive_count/total*100:.1f}%)")
    print(f"   - Negative reviews: {negative_count:,} ({negative_count/total*100:.1f}%)")
  else:
    # Avoid a ZeroDivisionError when the training split is empty
    print("   - Training split is empty")

  return dataset

# Run the loader once at cell level and bind the result to the notebook-level
# name `dataset`, which the preprocessing cell later passes to preprocess_data().
dataset = load_and_explore_data()
print("Data set loaded successfully")

In [None]:
# model intialization
print("Intialize BERT model")
print("=" *35)

def initialize_bert_model():
  """Load the DistilBERT tokenizer and a 2-label sequence-classification model.

  The model is configured with ``id2label``/``label2id`` so that class 0 is
  "Negative" and class 1 is "Positive". It is moved to CUDA when a GPU is
  available and otherwise left on the CPU.

  Returns:
    tuple: ``(tokenizer, model)``. The tuple is returned on both CPU and GPU
    runtimes.
  """
  model_name = "distilbert-base-uncased"

  print(f"1. Loading tokenizer: {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  print("2. Loading pre-trained model...")
  model = AutoModelForSequenceClassification.from_pretrained(
      model_name,
      num_labels=2,
      id2label={0: "Negative", 1: "Positive"},
      label2id={"Negative": 0, "Positive": 1},
  )
  print("3. Model architecture overview:")
  print(f" Model type: {model.__class__.__name__}")
  print(f" Number of parameters: {model.num_parameters():,}")
  print(f" Number of labels: {model.config.num_labels}")

  print("4. Moving model to GPU...")
  # is_available must be *called*; the bare function object is always truthy
  if torch.cuda.is_available():
    model = model.to('cuda')
    print(" Model succesfully moved to GPU")
  else:
    print(" No GPU detected; model stays on CPU")

  # Return outside the GPU branch so CPU runtimes also get the pair back
  return tokenizer, model

# Initialize the model at cell level so `tokenizer` and `model` become
# notebook-level names (this call must not be indented into the function body).
tokenizer, model = initialize_bert_model()
print("BERT model intialized successfully")

In [None]:
# Cell 4: Data Preprocessing & Tokenization
print("DATA PREPROCESSING & TOKENIZATION")
print("=" * 45)

def preprocess_data(tokenizer, dataset, max_length=256, batch_size=1000):
    """Tokenize the train and test splits so they can be fed to the Trainer.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a list of texts plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        dataset: Mapping with ``"train"`` and ``"test"`` splits, each exposing
            ``.map()`` and a ``"text"`` column.
        max_length: Fixed sequence length every review is padded/truncated to.
        batch_size: Number of reviews handed to the tokenizer per ``map`` batch.

    Returns:
        tuple: ``(tokenized_train, tokenized_test)``.
    """
    print("1. Defining tokenization function...")

    def tokenize_function(examples):
        # No return_tensors here: the earlier diagnostics in this cell showed the
        # mapped columns come back as plain lists, so requesting tensors gained
        # nothing and only made the ".shape" inspection fail.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    print("2. Tokenizing training dataset...")
    tokenized_train = dataset["train"].map(
        tokenize_function,
        batched=True,
        batch_size=batch_size
    )

    print("3. Tokenizing test dataset...")
    tokenized_test = dataset["test"].map(
        tokenize_function,
        batched=True,
        batch_size=batch_size
    )

    print("4. Dataset overview after tokenization:")
    print(f"   - Training features: {list(tokenized_train.features.keys())}")
    if len(tokenized_train) > 0:
        # Each row's input_ids is a flat sequence, so len() reports its size
        print(f"   - Input length: {len(tokenized_train[0]['input_ids'])} tokens per review")

    return tokenized_train, tokenized_test

# Tokenize both splits once at cell level; the resulting notebook-level names
# `tokenized_train` / `tokenized_test` are what the Trainer cell consumes.
tokenized_train, tokenized_test = preprocess_data(tokenizer, dataset)
print("DATA PREPROCESSING COMPLETED")

In [None]:
# Cell 5: Training Configuration
print(" TRAINING CONFIGURATION", "=" * 30, sep="\n")

def setup_training():
    """Build the TrainingArguments for fine-tuning and print a short summary.

    Returns:
        The constructed ``TrainingArguments`` instance.
    """
    hyperparameters = {
        # Where checkpoints go
        "output_dir": "./results",
        "overwrite_output_dir": True,
        # Core optimisation schedule
        "num_train_epochs": 3,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        # Evaluate and checkpoint once per epoch, keeping the most accurate model
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "accuracy",
        # Logging
        "logging_dir": "./logs",
        "logging_steps": 500,
        "report_to": "none",
        # Warm-up and mixed precision (fp16 only when a GPU is present)
        "warmup_steps": 500,
        "fp16": torch.cuda.is_available(),
    }
    training_args = TrainingArguments(**hyperparameters)

    print(" TRAINING CONFIGURATION:")
    reported_fields = (
        ("Epochs", "num_train_epochs"),
        ("Batch size", "per_device_train_batch_size"),
        ("Learning rate", "learning_rate"),
        ("Evaluation", "eval_strategy"),
        ("FP16 (speedup)", "fp16"),
    )
    for caption, attribute in reported_fields:
        print(f"   - {caption}: {getattr(training_args, attribute)}")

    return training_args

# Build the configuration once at cell level; the Trainer cell reads the
# notebook-level name `training_args`.
training_args = setup_training()
print(" TRAINING CONFIGURATION COMPLETE")

In [None]:
# Cell 6: Evaluation Metrics & Trainer
print("EVALUATION METRICS SETUP", "=" * 35, sep="\n")

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy, weighted F1 and error rate.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1_score"`` and ``"error_rate"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    acc = accuracy_score(references, predicted_classes)
    return {
        "accuracy": acc,
        "f1_score": f1_score(references, predicted_classes, average="weighted"),
        "error_rate": 1 - acc,
    }

print("1. Setting up Trainer...")
# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class`; switch once the minimum supported version allows it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Optimizer steps: every epoch needs ceil(samples / batch) steps because the
# last, partially filled batch still counts. Cast to int because
# num_train_epochs may be a float. Assumes a single device and no gradient
# accumulation -- TODO confirm if either changes.
steps_per_epoch = -(-len(tokenized_train) // training_args.per_device_train_batch_size)
total_steps = int(steps_per_epoch * training_args.num_train_epochs)

print("2. Trainer configuration:")
print(f"   - Training samples: {len(tokenized_train):,}")
print(f"   - Evaluation samples: {len(tokenized_test):,}")
print(f"   - Total steps: {total_steps:,}")

print("TRAINER SETUP COMPLETE")

In [None]:
# Cell 7: MODEL TRAINING EXECUTION
# (emoji below were mis-encoded in the original strings and are restored here)
print("🏋️ STARTING MODEL TRAINING")
print("=" * 35)

print("1.  TRAINING OVERVIEW:")
print("   - Model: DistilBERT (BERT's faster cousin)")
print("   - Task: Learn if movie reviews are Positive or Negative")
# Report the real size of the training split instead of a hard-coded figure
print(f"   - Data: {len(tokenized_train):,} movie reviews")
print("   - Time: 20-30 minutes with GPU")

print("\n2. TRAINING STARTED...")
print("   This is where your AI learns from the data!")
print("   The coach (Trainer) is now teaching your model...")
print("   Grab a coffee!")

# Fine-tune the model; the returned object carries the run metrics read below
training_results = trainer.train()

print("\n3. TRAINING COMPLETED!")
print("=" * 30)
print("   Your AI has finished learning!")
print("   Now let's see how well it learned...")

# Show training results
print(f"   - Total time: {training_results.metrics['train_runtime']:.0f} seconds")
print(f"   - Final loss: {training_results.metrics['train_loss']:.4f}")
print(f"   - Speed: {training_results.metrics['train_samples_per_second']:.1f} samples/second")

print("\n🎉 PHASE 1 COMPLETE: MODEL IS TRAINED!")

In [None]:
# model evaluation
print("TESTING AI KNOWLEGDE")
print("=" *35)

print("1.  FINAL EXAM TIME!")
# Use the real size of the evaluation split rather than a hard-coded number
print(f"   Now testing your model on {len(tokenized_test):,} NEW reviews")
# NOTE(review): the Trainer also evaluates on this split every epoch and uses it
# to pick the best checkpoint, so these scores are slightly optimistic.
print("   It has never seen these before!")

# test the model
eval_results = trainer.evaluate()

print("\n2.  EXAM RESULTS:")
print("   " + "="*20)
accuracy = eval_results['eval_accuracy']
f1 = eval_results['eval_f1_score']
error_rate = 1 - accuracy

print(f"   Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"    F1-Score: {f1:.4f}")
print(f"    Error Rate: {error_rate:.4f} ({error_rate*100:.2f}%)")

print("\n3.  GRADE INTERPRETATION:")
if accuracy > 0.92:
    # Accuracy measures correctly classified reviews, not a comparison with people
    print("    EXCELLENT! More than 92% of the test reviews were classified correctly!")
elif accuracy > 0.88:
    print("    VERY GOOD! Professional-level performance!")
elif accuracy > 0.85:
    print("    GOOD! Solid understanding of sentiment!")
else:
    print("    NEEDS PRACTICE! But still better than guessing!")

print("\n4. RANDOM GUESS COMPARISON:")
print("   - Random guessing: 50.00%")
print(f"   - Your AI: {accuracy*100:.2f}%")
# The "+" format flag prints the correct sign even if the model is below chance
print(f"   - Improvement: {(accuracy-0.5)*100:+.2f} percentage points")

print("\n PHASE 2 COMPLETE: MODEL EVALUATED!")

In [None]:
# Cell 9: COMPREHENSIVE ANALYSIS & MODEL SAVING
print("COMPREHENSIVE PERFORMANCE ANALYSIS", "=" * 45, sep="\n")

print("1. DETAILED PERFORMANCE BREAKDOWN:", "   " + "=" * 30, sep="\n")

# Headline metrics come from the evaluation results produced in the previous cell
accuracy, f1 = eval_results['eval_accuracy'], eval_results['eval_f1_score']
error_rate = 1 - accuracy

print(
    f"   Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
    f"   F1-Score: {f1:.4f}",
    f"    Error Rate: {error_rate:.4f} ({error_rate*100:.2f}%)",
    f"    Evaluation Loss: {eval_results['eval_loss']:.4f}",
    sep="\n",
)

print("\n2.  PERFORMANCE BENCHMARKING:")
# Tiers are checked from the strictest threshold down; the first match wins
benchmark_tiers = (
    (0.92, ("    EXCELLENT! State-of-the-art performance!",
            "   Your model outperforms most commercial systems!")),
    (0.88, ("    VERY GOOD! Professional-grade AI!",
            "   Your model understands sentiment exceptionally well!")),
    (0.85, ("    GOOD! Solid commercial performance!",
            "   Your model learned the patterns effectively!")),
)
benchmark_fallback = ("    SATISFACTORY! Good learning achieved!",
                      "   Your model significantly beats random guessing!")
print(
    *next((lines for floor, lines in benchmark_tiers if accuracy > floor), benchmark_fallback),
    sep="\n",
)

print(
    "\n3. INDUSTRY COMPARISON:",
    "   - Random Guessing: 50.00%",
    "   - Basic AI Models: 70-80%",
    "   - Commercial Systems: 85-90%",
    f"   - Your Model: {accuracy*100:.2f}%",
    "   - Research State-of-Art: 92-95%",
    sep="\n",
)

print(
    "\n4. TECHNICAL PERFORMANCE:",
    f"   - Evaluation Time: {eval_results['eval_runtime']:.2f} seconds",
    f"   - Samples Processed: {len(tokenized_test):,}",
    f"   - Processing Speed: {eval_results['eval_samples_per_second']:.1f} samples/second",
    sep="\n",
)

print("\n5.  SAVING YOUR TRAINED MODEL...")
model_save_path = "/content/models/sentiment_model"

# Persist the fine-tuned weights and the tokenizer files side by side
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("    Model saved successfully!", f"   Location: {model_save_path}", sep="\n")

print("\n CELL 9 COMPLETE: ANALYSIS DONE & MODEL SAVED!")

In [None]:
# Cell 10: COMPLETE MODEL SAVING SOLUTION
print("COMPREHENSIVE MODEL SAVING")
print("=" * 40)

print("1. SAVING MODEL FILES IN COLAB...")
model_save_path = "/content/models/sentiment_model"

# Save model components
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("2. MODEL COMPONENTS SAVED:")
# Record each saved file with its size in MB (sub-directories are skipped);
# `os` is already imported in the first cell, so no re-import is needed here.
saved_files = {}
for file_name in sorted(os.listdir(model_save_path)):
    file_path = os.path.join(model_save_path, file_name)
    if not os.path.isfile(file_path):
        continue
    saved_files[file_name] = os.path.getsize(file_path) / (1024*1024)
    print(f"    {file_name} ({saved_files[file_name]:.1f} MB)")

print("\n3. CREATING PROFESSIONAL DOCUMENTATION...")
# Derive the facts from this run instead of hard-coding them
total_size_mb = sum(saved_files.values())
training_minutes = training_results.metrics['train_runtime'] / 60
# README.md is listed separately below, so skip it when the cell is re-run
files_section = "\n".join(
    f"- `{name}` ({size_mb:.1f} MB)"
    for name, size_mb in saved_files.items()
    if name != "README.md"
)

# The usage snippet's code fence is closed so the markdown renders correctly
model_card = f"""# Custom Sentiment Analysis Model

## Quick Facts
- **Accuracy**: {accuracy*100:.2f}% on IMDB test set
- **Size**: ~{total_size_mb:.0f} MB
- **Training Time**: ~{training_minutes:.0f} minutes
- **Best For**: Movie reviews, product reviews, social media

## Files Included
{files_section}
- `README.md` - This documentation

## Usage
```python
from transformers import pipeline
model = pipeline("sentiment-analysis",
                model="path/to/sentiment_model")
result = model("This movie was amazing!")
```
"""

# Write the card next to the model files so the advertised README.md exists
readme_path = os.path.join(model_save_path, "README.md")
with open(readme_path, "w", encoding="utf-8") as readme_file:
    readme_file.write(model_card)
print(f"    README.md written to {readme_path}")

In [None]:
# Cell 11: LIVE DEMO CREATION
# (emoji below were mis-encoded in the original strings and are restored here)
print(" CREATING LIVE DEMO INTERFACE")
print("=" * 40)

print("1. 🔄 LOADING YOUR TRAINED MODEL...")

# Load the fine-tuned model saved earlier (not a stock pre-trained one).
# `pipeline` is already imported in the first cell. The device is selected
# explicitly so inference uses the GPU when one is present (0 = first CUDA
# device, -1 = CPU).
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_save_path,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

print("2. ✅ MODEL LOADED SUCCESSFULLY!")
print(f"   - Model: {model_save_path}")
print(f"   - Accuracy: {accuracy*100:.2f}%")
# Report the real size of the training split instead of a hard-coded figure
print(f"   - Training: {len(tokenized_train):,} IMDB reviews")

print("\n3. 🎪 CREATING PREDICTION FUNCTION...")
def analyze_sentiment(text):
    """Classify ``text`` with the fine-tuned pipeline and format a report.

    Args:
        text: Text entered by the user; ``None``, empty and whitespace-only
            input are rejected with a prompt message.

    Returns:
        str: A multi-line report with sentiment and confidence, or an error
        message when the input is empty or the pipeline raises.
    """
    if not text or not text.strip():
        return "❌ Please enter some text to analyze!"

    try:
        # Truncate long inputs so they fit the model's maximum sequence length
        result = sentiment_pipeline(text, truncation=True)[0]

        # The model is created with id2label {0: "Negative", 1: "Positive"}, so
        # the pipeline reports those names. The generic "LABEL_1" form is still
        # accepted for models saved without a label mapping.
        raw_label = str(result['label']).upper()
        label = "Positive" if raw_label in ("POSITIVE", "LABEL_1") else "Negative"
        confidence = result['score']
        if label == "Positive":
            emotion = "😊 Positive emotion detected"
        else:
            emotion = "😞 Negative emotion detected"

        # Create professional output
        output = f"""🎯 SENTIMENT: {label.upper()}
📊 CONFIDENCE: {confidence:.2%}
💬 ANALYSIS: {emotion}
⭐ MODEL PERFORMANCE: {accuracy*100:.1f}% accurate on test data

🔍 DETAILS:
- Model: Your Custom DistilBERT
- Training: 25,000 IMDB reviews
- Task: Binary sentiment classification"""

        return output

    except Exception as e:
        # Surface the failure in the UI instead of crashing the demo
        return f"❌ Error in analysis: {str(e)}"

# Build the Gradio UI around analyze_sentiment and launch it.
# (emoji below were mis-encoded in the original strings and are restored here)
print("4. 🖥️ BUILDING PROFESSIONAL WEB INTERFACE...")
demo = gr.Interface(
    fn=analyze_sentiment,  # Function to call
    inputs=gr.Textbox(
        lines=3,
        placeholder="Enter your movie review, product feedback, or any text here...",
        label="📝 TEXT TO ANALYZE"
    ),
    outputs=gr.Textbox(
        label="🎯 SENTIMENT ANALYSIS RESULT",
        show_copy_button=True
    ),
    title="🎬 AI SENTIMENT ANALYZER - YOUR CUSTOM MODEL",
    description=f"""**🤖 Powered by YOUR Custom AI Model**

📊 **Model Performance:** {accuracy*100:.2f}% accuracy | Trained on IMDB reviews

✨ **Features:**
- Real-time sentiment analysis
- Confidence scoring
- Professional-grade AI
- Your custom trained model

**Enter any text and see YOUR AI in action!**""",

    examples=[
        ["This movie was absolutely fantastic! The acting was superb and the story was captivating from beginning to end."],
        ["I hated this film. It was boring, poorly acted, and the plot made no sense whatsoever. Waste of time."],
        ["The cinematography was beautiful and visually stunning, but the characters were poorly developed and shallow."],
        ["One of the best movies I've ever seen! The direction, acting, and screenplay were all perfect. Highly recommended!"],
        ["Terrible movie with awful acting and a confusing storyline. I regret spending money on this."],
        ["It was okay, nothing special. Some good moments but overall pretty mediocre and forgettable."]
    ],
    theme="soft"
)

print("5. 🚀 LAUNCHING LIVE DEMO...")
print("   " + "="*30)
print("   ⭐ **IMPORTANT FOR YOUR PROJECT:** ⭐")
print("   - A new tab will open with your demo")
print("   - You will get a PUBLIC URL")
print("   - SHARE THIS URL WITH YOUR PROFESSOR!")
print("   - This demonstrates your working AI system!")
print("   " + "="*30)

# share=True requests a public URL; debug=True keeps the cell attached to the
# running server so errors are shown in the cell output.
demo.launch(share=True, debug=True)