# Load and Aggregate (The Setup)

In [1]:
import pandas as pd
import pipeline_v3 # Your custom utility file
from sklearn.model_selection import train_test_split

# 1. Load Data and Select Columns
# We need 'title' to group by restaurant, and 'stars' for later tasks
df = pd.read_csv(r'C:\Users\nhatp\OneDrive - NOVAIMS\Desktop\this semester\projects\text mining project\data_atlanta\atlanta_restaurant_slice_2023.csv')
df = df[['title', 'categoryName', 'text', 'stars']]
df = df.rename(columns={"text": "raw_text"})
df["raw_text"] = df["raw_text"].astype(str)

# 2. The "Ambition" Step: Aggregate reviews by Restaurant
# This converts 53,566 reviews into ~1,000 Restaurants
print(f"Rows before aggregation: {len(df)}")
df_grouped = pipeline_v3.aggregate_reviews(df, group_col='title', text_col='raw_text', label_col='categoryName')
print(f"Rows after aggregation (Unique Restaurants): {len(df_grouped)}")

# 3. Define X and y
X = df_grouped['raw_text']
y = df_grouped['categoryName']

# 4. Split Data (Stratify is crucial due to class imbalance)
# We split ONLY ONCE to ensure both models use exactly the same train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Rows before aggregation: 53566
Rows after aggregation (Unique Restaurants): 296


# Pipeline A - The Baseline (TF-IDF)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download("punkt_tab")

# 1. Build Pipeline
# Note: We add custom stopwords to remove generic words that don't help classify cuisine
pipeline_tfidf = Pipeline([
    ('preprocessor', pipeline_v3.TextPreprocessor(
        lowercase=True, 
        lemmatize=True,
        custom_stopwords=['food', 'place', 'restaurant', 'service', 'time', 'back', 'great', 'good']
    )),
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# 2. Train and Predict
print("Training Baseline (TF-IDF)...")
pipeline_tfidf.fit(X_train, y_train)
y_pred_tfidf = pipeline_tfidf.predict(X_test)

# 3. Evaluation
print("\n--- BASELINE REPORT (TF-IDF) ---")
print(classification_report(y_test, y_pred_tfidf))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nhatp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Training Baseline (TF-IDF)...

--- BASELINE REPORT (TF-IDF) ---
                          precision    recall  f1-score   support

     American restaurant       0.50      0.14      0.22         7
             Bar & grill       0.00      0.00      0.00         1
     Barbecue restaurant       0.50      1.00      0.67         1
    Breakfast restaurant       1.00      1.00      1.00         2
      Chicken restaurant       0.67      1.00      0.80         2
      Chinese restaurant       1.00      1.00      1.00         3
    Fast food restaurant       1.00      0.67      0.80         3
    Hamburger restaurant       0.67      1.00      0.80         2
       Indian restaurant       1.00      1.00      1.00         2
      Italian restaurant       1.00      1.00      1.00         6
     Japanese restaurant       1.00      1.00      1.00         2
Mediterranean restaurant       1.00      1.00      1.00         2
      Mexican restaurant       1.00      1.00      1.00        10
 New Americ

# Step 3: Pipeline B - The Challenger (BERT Embeddings)

In [5]:
from sentence_transformers import SentenceTransformer

# 1. Load Pre-trained Model
# 'all-MiniLM-L6-v2' is designed for speed and performance
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Encode Data
# Note: We pass raw text. BERT handles context/grammar better than cleaned text.
print("Generating BERT Embeddings... (This takes a moment)")
X_train_bert = bert_model.encode(X_train.tolist(), show_progress_bar=True)
X_test_bert = bert_model.encode(X_test.tolist(), show_progress_bar=True)

# 3. Train Classifier on Embeddings
clf_bert = LogisticRegression(max_iter=1000, class_weight='balanced')
clf_bert.fit(X_train_bert, y_train)

# 4. Predict
y_pred_bert = clf_bert.predict(X_test_bert)

# 5. Evaluation
print("\n--- CHALLENGER REPORT (BERT) ---")
print(classification_report(y_test, y_pred_bert))

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Generating BERT Embeddings... (This takes a moment)


Batches: 100%|██████████| 8/8 [00:03<00:00,  2.24it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.24it/s]


--- CHALLENGER REPORT (BERT) ---
                          precision    recall  f1-score   support

     American restaurant       1.00      0.14      0.25         7
             Bar & grill       0.00      0.00      0.00         1
     Barbecue restaurant       0.33      1.00      0.50         1
    Breakfast restaurant       0.33      0.50      0.40         2
      Chicken restaurant       0.40      1.00      0.57         2
      Chinese restaurant       0.75      1.00      0.86         3
    Fast food restaurant       0.25      0.33      0.29         3
    Hamburger restaurant       0.50      1.00      0.67         2
       Indian restaurant       1.00      1.00      1.00         2
      Italian restaurant       1.00      0.83      0.91         6
     Japanese restaurant       0.67      1.00      0.80         2
Mediterranean restaurant       1.00      0.50      0.67         2
      Mexican restaurant       0.90      0.90      0.90        10
 New American restaurant       0.00      


