#Yelp Restaurant Review Sentiment Analysis Advisor

##**Hugging Face Dataset:** fancyzhx/yelp_polarity

##**Hugging Face Model:** distilbert-base-uncased

##**LabeIDs:** Positive and Negative Standing

#*Fine-Tuned Sentiment Model:*

In [1]:
# First, install the libraries needed
!pip install transformers datasets accelerate -U -q

# Import libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, pipeline
import numpy as np
from sklearn.metrics import accuracy_score
import os

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Print what this code is about to do
print("="*70)
print("RESTAURANT REVIEW SENTIMENT ANALYSIS")
print("Fine-Tuning DistilBERT on Yelp Reviews")
print("="*70)

# Let's load the Yelp review dataset
print("\n[1/7] Loading Yelp review polarity dataset...")

# Load the dataset from Hugging Face
dataset = load_dataset("fancyzhx/yelp_polarity")

print(f"✅ Dataset loaded successfully!")
print(f"Training samples: {len(dataset['train']):,}")
print(f"Test samples: {len(dataset['test']):,}")
print(f"Columns: {dataset['train'].column_names}")

# Show the example
print(f"\nExample review:")
example = dataset['train'][0]
print(f"Text: {example['text'][:200]}...")
print(f"Label: {example['label']} (0=Negative, 1=Positive)")

# Sample the data
print("\n[2/7] Sampling data for faster training...")

# Sample 50,000 for training
# Sample 10,000 for testing
train_sample_size = 50000
test_sample_size = 10000

train_dataset = dataset['train'].shuffle(seed=42).select(range(train_sample_size))
test_dataset = dataset['test'].shuffle(seed=42).select(range(test_sample_size))

print(f"✅ Training samples: {len(train_dataset):,}")
print(f"✅ Test samples: {len(test_dataset):,}")

# Check labels for positive/negative distribution
from collections import Counter
train_labels = Counter(train_dataset['label'])
print(f"\nTraining label distribution:")
print(f"  Negative (0): {train_labels[0]:,} ({train_labels[0]/len(train_dataset)*100:.1f}%)")
print(f"  Positive (1): {train_labels[1]:,} ({train_labels[1]/len(train_dataset)*100:.1f}%)")

# Load the DistilBERT model
print("\n[3/7] Loading DistilBERT model and tokenizer...")

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification separation
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

print(f"✅ Model loaded with num_labels=2 (binary classification)")
print(f"   Base model: {model_name}")
print(f"   Labels: 0=Negative (Bad review), 1=Positive (Good review)")

#Tokenize the data
print("\n[4/7] Tokenizing reviews...")

def tokenize_function(examples):
    """Tokenize the review text"""
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=256
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print("✅ Tokenization complete")

# Prepare for training
print("\n[5/7] Preparing data for training...")

tokenized_train = tokenized_train.rename_column('label', 'labels')
tokenized_test = tokenized_test.rename_column('label', 'labels')

# PyTorch Format
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print(f"✅ Data prepared for training")
print(f"   Columns: {tokenized_train.column_names}")

# Training Configuration
print("\n[6/7] Setting up training configuration...")

def compute_metrics(eval_pred):
    """Compute accuracy metrics"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Training Arguments
training_args = TrainingArguments(
    output_dir="./restaurant_review_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True,
)

# Make a trainer to train the
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("✅ Training configuration ready")

# Model training
print("\n[7/7] Starting training...")
print("\n" + "="*70)
print("🚀 TRAINING IN PROGRESS")
print("="*70)
print(f"Training samples: {len(tokenized_train):,}")
print(f"Test samples: {len(tokenized_test):,}")
print(f"Epochs: 3")
print(f"Expected time: 45-60 minutes")
print(f"Expected accuracy: 85-92%")
print("="*70 + "\n")

trainer.train()

print("\n" + "="*70)
print("✅ TRAINING COMPLETE!")
print("="*70)

# Evaluate the results
print("\nEvaluating model on test set...")

results = trainer.evaluate()

print("\n" + "="*70)
print("📊 FINAL EVALUATION RESULTS:")
print("="*70)
for key, value in results.items():
    print(f"  {key}: {value:.4f}")
print("="*70)

# Save it
print("\nSaving model...")

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model.config.id2label = id2label
model.config.label2id = label2id

model_save_path = "restaurant_sentiment_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"✅ Model saved to: {model_save_path}")

# Test it out and see if it works
print("\n" + "="*70)
print("🧪 TESTING MODEL ON SAMPLE REVIEWS")
print("="*70 + "\n")

sentiment_analyzer = pipeline("sentiment-analysis", model=model_save_path)

test_reviews = [
    "This restaurant was absolutely amazing! Best food I've ever had.",
    "Terrible service and the food was cold. Never coming back.",
    "The coffee was perfect and the atmosphere was so cozy!",
    "Worst experience ever. Rude staff and overpriced food.",
    "Loved every minute of it! Will definitely return soon.",
    "Disappointed. The place was dirty and food was mediocre.",
    "Great ambiance, friendly staff, and delicious food!",
    "Waited over an hour for our food. Not worth it at all.",
]

for i, review in enumerate(test_reviews, 1):
    result = sentiment_analyzer(review)[0]
    emoji = {"NEGATIVE": "🔴", "POSITIVE": "🟢"}.get(result['label'], "⚪")

    print(f"Review {i}: {review}")
    print(f"→ Sentiment: {emoji} {result['label']} (confidence: {result['score']:.1%})")
    print()

print("="*70)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/12.0 MB[0m [31m110.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m8.3/12.0 MB[0m [31m115.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.0/12.0 MB[0m [31m114.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.0/12.0 MB[0m [31m114.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

✅ Dataset loaded successfully!
Training samples: 560,000
Test samples: 38,000
Columns: ['text', 'label']

Example review:
Text: Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply ...
Label: 0 (0=Negative, 1=Positive)

[2/7] Sampling data for faster training...
✅ Training samples: 50,000
✅ Test samples: 10,000

Training label distribution:
  Negative (0): 25,221 (50.4%)
  Positive (1): 24,779 (49.6%)

[3/7] Loading DistilBERT model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded with num_labels=2 (binary classification)
   Base model: distilbert-base-uncased
   Labels: 0=Negative (Bad review), 1=Positive (Good review)

[4/7] Tokenizing reviews...


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ Tokenization complete

[5/7] Preparing data for training...
✅ Data prepared for training
   Columns: ['text', 'labels', 'input_ids', 'attention_mask']

[6/7] Setting up training configuration...


  trainer = Trainer(


✅ Training configuration ready

[7/7] Starting training...

🚀 TRAINING IN PROGRESS
Training samples: 50,000
Test samples: 10,000
Epochs: 3
Expected time: 45-60 minutes
Expected accuracy: 85-92%



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1525,0.133506,0.9519
2,0.1027,0.15812,0.9593
3,0.0486,0.197277,0.9588



✅ TRAINING COMPLETE!

Evaluating model on test set...



📊 FINAL EVALUATION RESULTS:
  eval_loss: 0.1335
  eval_accuracy: 0.9519
  eval_runtime: 69.7992
  eval_samples_per_second: 143.2680
  eval_steps_per_second: 8.9540
  epoch: 3.0000

Saving model...


Device set to use cuda:0


✅ Model saved to: restaurant_sentiment_model

🧪 TESTING MODEL ON SAMPLE REVIEWS

Review 1: This restaurant was absolutely amazing! Best food I've ever had.
→ Sentiment: 🟢 POSITIVE (confidence: 99.9%)

Review 2: Terrible service and the food was cold. Never coming back.
→ Sentiment: 🔴 NEGATIVE (confidence: 99.9%)

Review 3: The coffee was perfect and the atmosphere was so cozy!
→ Sentiment: 🟢 POSITIVE (confidence: 99.9%)

Review 4: Worst experience ever. Rude staff and overpriced food.
→ Sentiment: 🔴 NEGATIVE (confidence: 99.9%)

Review 5: Loved every minute of it! Will definitely return soon.
→ Sentiment: 🟢 POSITIVE (confidence: 99.9%)

Review 6: Disappointed. The place was dirty and food was mediocre.
→ Sentiment: 🔴 NEGATIVE (confidence: 99.9%)

Review 7: Great ambiance, friendly staff, and delicious food!
→ Sentiment: 🟢 POSITIVE (confidence: 99.9%)

Review 8: Waited over an hour for our food. Not worth it at all.
→ Sentiment: 🔴 NEGATIVE (confidence: 99.9%)



#*RAG System*

In [2]:
# Install all of the necessary libraries
!pip install chromadb sentence-transformers transformers torch datasets -q

import chromadb
from chromadb.config import Settings
from transformers import pipeline
from datasets import load_dataset
from typing import Dict
import random

print("="*70)
print("TRUE RAG SYSTEM - Using Yelp Reviews as Knowledge Base")
print("="*70)

# Load the same Yelp review dataset as before
print("\n[1/5] Loading Yelp Review dataset...")

dataset = load_dataset("fancyzhx/yelp_polarity", split="train")

print(f"✅ Dataset loaded: {len(dataset):,} total reviews")

# Sample reviews
print("\n[2/5] Sampling reviews for RAG knowledge base...")

sample_size = 5000

sampled_reviews = dataset.shuffle(seed=42).select(range(sample_size))

print(f"✅ Sampled {sample_size:,} reviews for RAG knowledge base")

# Review positive and negative distribution
from collections import Counter
label_dist = Counter(sampled_reviews['label'])
print(f"\nReview distribution:")
print(f"  Positive reviews: {label_dist[1]:,}")
print(f"  Negative reviews: {label_dist[0]:,}")

# Create a database
print("\n[3/5] Creating ChromaDB vector database...")

# Use chroma
chroma_client = chromadb.Client(Settings(
    anonymized_telemetry=False,
    allow_reset=True
))

try:
    chroma_client.delete_collection(name="yelp_reviews")
except:
    pass

collection = chroma_client.create_collection(name="yelp_reviews")

# Add the reviews to the database created
print("\nAdding reviews to vector database...")

documents = []
metadatas = []
ids = []

for idx, review in enumerate(sampled_reviews):
    if len(review['text']) >= 50:
        documents.append(review['text'])
        metadatas.append({
            'sentiment': 'positive' if review['label'] == 1 else 'negative',
            'review_id': idx
        })
        ids.append(f"review_{idx}")

# Include the batch size
batch_size = 1000
for i in range(0, len(documents), batch_size):
    batch_docs = documents[i:i+batch_size]
    batch_metas = metadatas[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]

    collection.add(
        documents=batch_docs,
        metadatas=batch_metas,
        ids=batch_ids
    )

    if (i + batch_size) % 1000 == 0:
        print(f"  Added {min(i+batch_size, len(documents)):,}/{len(documents):,} reviews...")

print(f"\n✅ Vector database created with {len(documents):,} reviews")

# Load model for generation
print("\n[4/5] Loading FLAN-T5 for answer generation...")

generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    max_length=200,
    device=-1
)

print("✅ Generation model loaded")

# Create a TRUE RAG
print("\n[5/5] Creating TRUE RAG function...")

def rag_query_reviews(question: str, n_results: int = 3) -> Dict:
    """
    TRUE RAG using Yelp reviews as knowledge base

    1. RETRIEVAL: Search for relevant restaurant reviews
    2. AUGMENTATION: Combine reviews with question
    3. GENERATION: LLM generates answer based on reviews

    Args:
        question: User's question about restaurants
        n_results: Number of reviews to retrieve

    Returns:
        Dictionary with retrieved reviews and generated answer
    """
    if not question.strip():
        return {
            'question': question,
            'retrieved_reviews': [],
            'generated_answer': "Please ask a question about restaurants."
        }

   # Retrieve relevant reviews
    results = collection.query(
        query_texts=[question],
        n_results=n_results
    )

    if not results['documents'][0]:
        return {
            'question': question,
            'retrieved_reviews': [],
            'generated_answer': "I couldn't find relevant reviews for that question."
        }

    # Extract the reviews
    retrieved_reviews = []
    review_texts = []

    for doc, metadata in zip(results['documents'][0], results['metadatas'][0]):
        sentiment = metadata.get('sentiment', 'unknown')
        retrieved_reviews.append({
            'text': doc,
            'sentiment': sentiment
        })
        review_texts.append(f"[{sentiment.upper()}] {doc}")

    # Augment relevant reviews context
    context = "\n\n".join(review_texts)

    # Prompt Creation
    augmented_prompt = f"""Based on these customer reviews, answer the question.

Customer Reviews:
{context}

Question: {question}

Answer based on the reviews:"""

    # Generate an answer
    generated_answer = generator(augmented_prompt, max_length=200)[0]['generated_text']

    return {
        'question': question,
        'retrieved_reviews': retrieved_reviews,
        'generated_answer': generated_answer,
        'num_reviews_used': len(retrieved_reviews)
    }

print("✅ TRUE RAG function ready")

# Test the RAG FUNC
print("\n" + "="*70)
print("🧪 TESTING RAG WITH REAL YELP REVIEWS")
print("="*70 + "\n")

test_questions = [
    "What do customers say about the food quality?",
    "Are the portions good at restaurants?",
    "What about customer service?",
    "Do people recommend this for families?",
    "What are common complaints about restaurants?",
]

for i, question in enumerate(test_questions, 1):
    print(f"Question {i}: {question}")
    print("-" * 70)

    result = rag_query_reviews(question, n_results=3)

    print(f"\n📚 RETRIEVED REVIEWS ({result['num_reviews_used']} reviews):")
    for j, review in enumerate(result['retrieved_reviews'], 1):
        sentiment_emoji = "🟢" if review['sentiment'] == 'positive' else "🔴"
        print(f"\n  Review {j} {sentiment_emoji}:")
        print(f"  {review['text'][:150]}...")

    print(f"\n🤖 GENERATED ANSWER:")
    print(f"  {result['generated_answer']}")

    print("\n" + "="*70 + "\n")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m84.5 MB/s[0m eta [36m0:00:

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:09<00:00, 9.22MiB/s]


  Added 1,000/4,927 reviews...
  Added 2,000/4,927 reviews...
  Added 3,000/4,927 reviews...
  Added 4,000/4,927 reviews...
  Added 4,927/4,927 reviews...

✅ Vector database created with 4,927 reviews

[4/5] Loading FLAN-T5 for answer generation...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✅ Generation model loaded

[5/5] Creating TRUE RAG function...
✅ TRUE RAG function ready

🧪 TESTING RAG WITH REAL YELP REVIEWS

Question 1: What do customers say about the food quality?
----------------------------------------------------------------------

📚 RETRIEVED REVIEWS (3 reviews):

  Review 1 🟢:
  Very good food ~ always consistent ~ good buffet at lunch ~ not cheap but very freshly prepared with great care. Clean facility with friendly staff. O...

  Review 2 🔴:
  I'd give it zero if I could. Reading the 1 star reviews here completely match up to my experience: lousy food, grunting waitress/owner, vegetables und...

  Review 3 🔴:
  Not terribly impressed with the food (the pretzel is great but I've had the chicken burger and salad and it was eh). It's very loud and the service is...

🤖 GENERATED ANSWER:
  Very good food  always consistent  good buffet at lunch  not cheap but very freshly prepared with great care. Clean facility with friendly staff. Owners usually on site  lot

Token indices sequence length is longer than the specified maximum sequence length for this model (1183 > 512). Running this sequence through the model will result in indexing errors



📚 RETRIEVED REVIEWS (3 reviews):

  Review 1 🟢:
  Skinny Fats is AWESOME! What a great concept. Having a happy side and a Healthy side really shows how hard it is to choose whether to eat good or just...

  Review 2 🟢:
  When a family member suggested this place to meet for lunch, I was almost afraid to come after reading horrible reviews. I believe that those lowly re...

  Review 3 🟢:
  Tucked away in an unassuming strip mall, is this gem of healthy eating. Not too many reviews mention it, but this place has a stellar selection of sal...

🤖 GENERATED ANSWER:
  Yes


Question 3: What about customer service?
----------------------------------------------------------------------

📚 RETRIEVED REVIEWS (3 reviews):

  Review 1 🟢:
  Bumping the store down a notch for lack of customer service skills when answering the phone....

  Review 2 🔴:
  Service here is terrible! I ordered something off of walmart online and for some reason they canceled my order, came into this store to try to get s

##**Hugging Face Dataset:** `fancyzhx/yelp_polarity`

---



##**Database:** Chroma

Upload to Hugging Face

In [6]:
!pip install huggingface_hub -q

from huggingface_hub import HfApi, create_repo, login
import os

print("="*70)
print("UPLOADING MODEL TO HUGGING FACE HUB")
print("="*70)

# Log in to Hugging Face
print("\n[1/3] Logging in to Hugging Face...")

# Insert Token
login()

print("✅ Logged in successfully!")

# Create a repo
print("\n[2/3] Creating model repository...")

username = "Isap31"
model_name = "restaurant-sentiment-distilbert"
repo_id = f"{username}/{model_name}"

try:
    create_repo(
        repo_id=repo_id,
        repo_type="model",
        exist_ok=True,
        private=False
    )
    print(f"✅ Repository created: {repo_id}")
except Exception as e:
    print(f"Repository may already exist: {e}")

# Upload files
print("\n[3/3] Uploading model files...")

model_path = "restaurant_sentiment_model"

if not os.path.exists(model_path):
    print(f"❌ ERROR: Model not found at {model_path}")
    print("Make sure you ran the training code first!")
else:
    api = HfApi()

    print(f"Uploading files from {model_path}...")

    api.upload_folder(
        folder_path=model_path,
        repo_id=repo_id,
        repo_type="model",
    )

    print("✅ Model uploaded successfully!")
    print(f"\n{'='*70}")
    print(f"🎉 YOUR MODEL IS NOW AVAILABLE AT:")
    print(f"https://huggingface.co/{repo_id}")
    print(f"{'='*70}")

UPLOADING MODEL TO HUGGING FACE HUB

[1/3] Logging in to Hugging Face...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Logged in successfully!

[2/3] Creating model repository...
✅ Repository created: Isap31/restaurant-sentiment-distilbert

[3/3] Uploading model files...
Uploading files from restaurant_sentiment_model...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...t_model/model.safetensors:   0%|          |  574kB /  268MB            

✅ Model uploaded successfully!

🎉 YOUR MODEL IS NOW AVAILABLE AT:
https://huggingface.co/Isap31/restaurant-sentiment-distilbert
