In [1]:
from google.colab import drive
drive.mount("/content/drive")

import os
BASE_DIR = "/content/drive/MyDrive/irvine_recommender_outputs"  # <-- change if your folder name differs
DATA_DIR = BASE_DIR  # your CSVs are directly inside this folder based on screenshot

for f in ["ui_bundle.csv","oc_user.csv","oc_review.csv","oc_business.csv","irvine_bundle.csv"]:
    print(f, "exists?", os.path.exists(os.path.join(DATA_DIR, f)))


Mounted at /content/drive
ui_bundle.csv exists? True
oc_user.csv exists? True
oc_review.csv exists? True
oc_business.csv exists? True
irvine_bundle.csv exists? True


In [2]:
!pip -q install transformers accelerate torch pandas numpy tqdm


In [1]:
# ============================================
# Week 2 — Review Sentiment (Resume-Safe)
# - Reads oc_review.csv in chunks
# - Runs sentiment model on each chunk (GPU if available)
# - Writes Drive parts: review_sentiment_parts/part_000000.csv ...
# - If interrupted, re-running RESUMES (skips existing parts)
# ============================================

from google.colab import drive
drive.mount("/content/drive")

import os, glob
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ----------------------------
# 1) Paths (EDIT THIS)
# ----------------------------
OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs"  # <-- change if needed
REVIEWS_PATH = os.path.join(OUT_DIR, "oc_review.csv")
PARTS_DIR = os.path.join(OUT_DIR, "review_sentiment_parts")
os.makedirs(PARTS_DIR, exist_ok=True)

# ----------------------------
# 2) Model + runtime settings
# ----------------------------
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

CHUNK_SIZE = 20000     # must stay the same for resume to be correct
BATCH_SIZE = 64        # increase on GPU until OOM; decrease if memory error
MAX_LEN = 256          # truncation length

# ----------------------------
# 3) Column names in oc_review.csv
#    Adjust if your file differs
# ----------------------------
COL_REVIEW_ID = "review_id"
COL_BUSINESS_ID = "business_id"
COL_USER_ID = "user_id"
COL_STARS = "stars"
COL_DATE = "date"
COL_TEXT = "text"

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# ----------------------------
# 4) Helpers
# ----------------------------
def softmax(x: np.ndarray) -> np.ndarray:
    e = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def chunk_part_path(chunk_idx: int) -> str:
    return os.path.join(PARTS_DIR, f"part_{chunk_idx:06d}.csv")

# ----------------------------
# 5) Load tokenizer + model
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Label mapping from the model
id2label = {int(k): v.upper() for k, v in model.config.id2label.items()}
prob_cols = [model.config.id2label[i].upper() for i in range(model.config.num_labels)]

print("Model labels:", prob_cols)

# ----------------------------
# 6) RESUME: detect existing part files
# ----------------------------
existing = sorted(glob.glob(os.path.join(PARTS_DIR, "part_*.csv")))
print(f"Existing parts found: {len(existing)}")
if len(existing) > 0:
    print("Example existing part:", existing[0])

# ----------------------------
# 7) Stream CSV, run inference, write parts
#    Resume-safe: skips if part file already exists
# ----------------------------
reader = pd.read_csv(REVIEWS_PATH, chunksize=CHUNK_SIZE)

total_rows = 0
with torch.no_grad():
    for chunk_idx, chunk in enumerate(tqdm(reader, desc="Chunks")):
        out_path = chunk_part_path(chunk_idx)

        # Skip if already processed
        if os.path.exists(out_path):
            continue

        # Normalize IDs to strings (helps consistency later)
        for col in [COL_REVIEW_ID, COL_BUSINESS_ID, COL_USER_ID]:
            if col in chunk.columns:
                chunk[col] = chunk[col].astype(str)

        # Ensure text column exists + clean
        chunk[COL_TEXT] = chunk[COL_TEXT].fillna("").astype(str)

        out_batches = []
        for start in range(0, len(chunk), BATCH_SIZE):
            batch = chunk.iloc[start:start + BATCH_SIZE]
            texts = batch[COL_TEXT].tolist()

            enc = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt"
            )
            # Move tokenizer outputs to GPU/CPU device
            enc = {k: v.to(DEVICE) for k, v in enc.items()}

            logits = model(**enc).logits.detach().cpu().numpy()
            probs = softmax(logits)

            pred_idx = probs.argmax(axis=1)
            conf = probs.max(axis=1)
            labels = [id2label[int(k)] for k in pred_idx]

            # Build probability dataframe
            prob_df = pd.DataFrame(probs, columns=prob_cols)

            # IMPORTANT:
            # This roberta sentiment model uses: NEGATIVE / NEUTRAL / POSITIVE
            # We standardize to pos/neu/neg columns for your pipeline.
            def get_prob(col_name):
                return prob_df[col_name].values if col_name in prob_df.columns else np.full(len(batch), np.nan)

            out = pd.DataFrame({
                "review_id": batch[COL_REVIEW_ID].values,
                "business_id": batch[COL_BUSINESS_ID].values,
                "user_id": batch[COL_USER_ID].values,
                "stars": batch[COL_STARS].values if COL_STARS in batch.columns else np.nan,
                "date": batch[COL_DATE].values if COL_DATE in batch.columns else None,
                "pos": get_prob("POSITIVE"),
                "neu": get_prob("NEUTRAL"),
                "neg": get_prob("NEGATIVE"),
                "label": labels,
                "conf": conf
            })

            out_batches.append(out)

        scored_chunk = pd.concat(out_batches, ignore_index=True)

        scored_chunk.to_csv(out_path, index=False)
        total_rows += len(scored_chunk)

        print(f"Wrote {out_path} | rows: {len(scored_chunk)}")

print("DONE. Parts are in:", PARTS_DIR)


Mounted at /content/drive
Torch: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model labels: ['NEGATIVE', 'NEUTRAL', 'POSITIVE']


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Existing parts found: 9
Example existing part: /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts/part_000000.csv


Chunks: 0it [00:00, ?it/s]

Wrote /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts/part_000009.csv | rows: 20000
Wrote /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts/part_000010.csv | rows: 20000
Wrote /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts/part_000011.csv | rows: 5242
DONE. Parts are in: /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts


In [13]:
import os, glob
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts"

parts = glob.glob(os.path.join(OUT_DIR, "part_*.csv"))
total_bytes = sum(os.path.getsize(p) for p in parts)
print("num parts:", len(parts))
print("total size (MB):", total_bytes / (1024*1024))


Mounted at /content/drive
num parts: 12
total size (MB): 31.0162410736084


In [14]:
import os, glob
import pandas as pd

PARTS_DIR = "/content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_parts"  # <-- use YOUR working path
OUT_PATH  = "/content/drive/MyDrive/irvine_recommender_outputs/review_sentiment.csv"

parts = sorted(glob.glob(os.path.join(PARTS_DIR, "part_*.csv")))
print("Num parts:", len(parts), "First:", os.path.basename(parts[0]), "Last:", os.path.basename(parts[-1]))

first = True
total_rows = 0

for p in parts:
    df = pd.read_csv(p)
    df.to_csv(OUT_PATH, mode="w" if first else "a", index=False, header=first)
    total_rows += len(df)
    first = False

print("DONE. Saved:", OUT_PATH)
print("Total rows:", total_rows)



Num parts: 12 First: part_000000.csv Last: part_000011.csv
DONE. Saved: /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment.csv
Total rows: 225242


In [15]:
import pandas as pd
df = pd.read_csv(OUT_PATH)
print(df.shape)
print(df.head(3))


(225242, 10)
                review_id             business_id                 user_id  \
0  pUycOfUwM8vqX7KjRRhUEA  gebiRewfieSdtt17PTW6Zg  59MxRhNVhU9MYndMkz0wtw   
1  eCiWBf1CJ0Zdv1uVarEhhw  vC2qm1y3Au5czBtbhc-DNw  OhECKhQEexFypOMY6kypRw   
2  YbMyvlDA2W3Py5lTz8VK-A  bbEXAEFr4RYHLlZ-HFssTA  4hBhtCSgoxkrFgHa4YAD-w   

   stars                 date       pos       neu       neg     label  \
0      3  2016-07-25 07:31:06  0.200358  0.492232  0.307410   NEUTRAL   
1      4  2013-09-04 03:48:20  0.969001  0.025922  0.005077  POSITIVE   
2      5  2017-01-02 03:17:34  0.961576  0.034120  0.004304  POSITIVE   

       conf  
0  0.492232  
1  0.969001  
2  0.961576  


In [16]:
s = df["pos"] + df["neu"] + df["neg"]
print(s.min(), s.max(), s.mean())


0.999999851 1.0000001544 1.0000000000082294


In [20]:
import os
import pandas as pd

OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs"
PATH = os.path.join(OUT_DIR, "review_sentiment.csv")

df = pd.read_csv(PATH)
dt = pd.to_datetime(df["date"], errors="coerce")
hour = dt.dt.hour

def bucket(h):
    if pd.isna(h):
        return "unknown"
    if 5 <= h < 10:
        return "morning"
    if 11 <= h < 16:
        return "lunch"
    return "dinner"  # includes evening/night

df["time_bucket"] = hour.apply(bucket)

out2 = os.path.join(OUT_DIR, "review_sentiment_with_time.csv")
df.to_csv(out2, index=False)
print("Saved:", out2)


Saved: /content/drive/MyDrive/irvine_recommender_outputs/review_sentiment_with_time.csv


In [28]:
import os
import pandas as pd

OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs/sentiment_results"
path = os.path.join(OUT_DIR, "review_sentiment_with_time.csv")

df = pd.read_csv(path, dtype={"business_id": str})

df["time_bucket"] = (
    df["time_bucket"]
    .astype(str)
    .str.strip()
    .str.title()
)

# convert "Nan" back to missing
df.loc[df["time_bucket"] == "Nan", "time_bucket"] = pd.NA


# ----- Sentiment aggregates -----
sent_agg = df.groupby("business_id").agg(
    n_reviews=("review_id", "count"),
    sent_pos_mean=("pos", "mean"),
    sent_neu_mean=("neu", "mean"),
    sent_neg_mean=("neg", "mean"),
    pos_rate=("label", lambda x: (x == "POSITIVE").mean()),
    neu_rate=("label", lambda x: (x == "NEUTRAL").mean()),
    neg_rate=("label", lambda x: (x == "NEGATIVE").mean()),
).reset_index()

# ----- Time bucket counts -----
tb_counts = (
    df.pivot_table(index="business_id", columns="time_bucket", values="review_id",
                   aggfunc="count", fill_value=0)
    .reset_index()
)

# Ensure expected columns exist even if some bucket missing
for col in ["Morning", "Lunch", "Dinner"]:
    if col not in tb_counts.columns:
        tb_counts[col] = 0

tb_counts = tb_counts.rename(columns={
    "Morning": "n_morning_reviews",
    "Lunch": "n_lunch_reviews",
    "Dinner": "n_dinner_reviews"
})

# ----- Merge + rates -----
biz = sent_agg.merge(tb_counts, on="business_id", how="left")

# If some businesses had no time_bucket (NaN), fill counts with 0
for c in ["n_morning_reviews", "n_lunch_reviews", "n_dinner_reviews"]:
    biz[c] = biz[c].fillna(0).astype(int)

# Rates (avoid divide-by-zero)
biz["morning_rate"] = biz["n_morning_reviews"] / biz["n_reviews"]
biz["lunch_rate"]   = biz["n_lunch_reviews"] / biz["n_reviews"]
biz["dinner_rate"]  = biz["n_dinner_reviews"] / biz["n_reviews"]

out_biz = os.path.join(OUT_DIR, "business_features.csv")
biz.to_csv(out_biz, index=False)
print("Saved:", out_biz, "rows:", len(biz))
biz.head()


Saved: /content/drive/MyDrive/irvine_recommender_outputs/sentiment_results/business_features.csv rows: 1349


Unnamed: 0,business_id,n_reviews,sent_pos_mean,sent_neu_mean,sent_neg_mean,pos_rate,neu_rate,neg_rate,n_dinner_reviews,n_lunch_reviews,n_morning_reviews,morning_rate,lunch_rate,dinner_rate
0,--onnLZrsCazmcy2P_7fcw,7,0.673274,0.055072,0.271653,0.714286,0.0,0.285714,5,1,1,0.142857,0.142857,0.714286
1,-3AooxIkg38UyUdlz5oXdw,444,0.566195,0.123231,0.310574,0.596847,0.051802,0.351351,353,22,69,0.155405,0.04955,0.795045
2,-8iATYRnN46Km0_-ldx6cg,166,0.821872,0.08063,0.097498,0.873494,0.018072,0.108434,131,7,28,0.168675,0.042169,0.789157
3,-9r8nAzWyRSLxBWt8uQOdA,343,0.486154,0.132741,0.381105,0.501458,0.055394,0.443149,242,11,90,0.262391,0.03207,0.705539
4,-ALqLSTzkGDMscHdxA1NgA,29,0.830166,0.070254,0.09958,0.862069,0.0,0.137931,23,1,5,0.172414,0.034483,0.793103


In [29]:
import os
import pandas as pd

OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs"

business_path = os.path.join(OUT_DIR, "oc_business.csv")
features_path = os.path.join(OUT_DIR, "sentiment_results/business_features.csv")
out_path      = os.path.join(OUT_DIR, "sentiment_results/oc_business_enriched.csv")

biz = pd.read_csv(business_path, dtype={"business_id": str})
feat = pd.read_csv(features_path, dtype={"business_id": str})

enriched = biz.merge(feat, on="business_id", how="left")
enriched.to_csv(out_path, index=False)

print("Saved:", out_path, "rows:", len(enriched))
enriched.head()


Saved: /content/drive/MyDrive/irvine_recommender_outputs/sentiment_results/oc_business_enriched.csv rows: 1349


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,sent_neg_mean,pos_rate,neu_rate,neg_rate,n_dinner_reviews,n_lunch_reviews,n_morning_reviews,morning_rate,lunch_rate,dinner_rate
0,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,4.0,389,...,0.126287,0.80798,0.047382,0.144638,351,26,24,0.05985,0.064838,0.875312
1,SZU9c8V2GuREDN5KgyHFJw,Santa Barbara Shellfish Company,230 Stearns Wharf,Santa Barbara,CA,93101,34.408715,-119.685019,4.0,2404,...,0.155764,0.756137,0.074877,0.168985,1941,163,340,0.139116,0.066694,0.79419
2,4xhGQGdGqU60BIznBjqnuA,California Tacos and Taproom,956 Embarcadero Del Norte,Isla Vista,CA,93117,34.411555,-119.855077,4.0,49,...,0.146182,0.807692,0.019231,0.173077,41,0,11,0.211538,0.0,0.788462
3,ifjluUv4VASwmFqEp8cWlQ,Marty's Pizza,2733 De La Vina St,Santa Barbara,CA,93105,34.436236,-119.726147,4.0,64,...,0.200939,0.723077,0.061538,0.215385,53,1,11,0.169231,0.015385,0.815385
4,VeFfrEZ4iWaecrQg6Eq4cg,Cal Taco,"7320 Hollister Ave, Ste 1",Goleta,CA,93117,34.430542,-119.882367,4.0,189,...,0.1715,0.723618,0.095477,0.180905,165,15,19,0.095477,0.075377,0.829146


In [30]:
import pandas as pd, os

OUT_DIR = "/content/drive/MyDrive/irvine_recommender_outputs/sentiment_results"
en = pd.read_csv(os.path.join(OUT_DIR, "oc_business_enriched.csv"))

print(en[["business_id","n_reviews","n_morning_reviews","n_lunch_reviews","n_dinner_reviews"]].head(10))

print("Has categories?", "categories" in en.columns)
print("Has time features?", any(c in en.columns for c in ["morning_rate","lunch_rate","dinner_rate"]))
print(en[["business_id","categories","morning_rate","lunch_rate","dinner_rate"]].head())


              business_id  n_reviews  n_morning_reviews  n_lunch_reviews  \
0  IDtLPgUrqorrpqSLdfMhZQ        401                 24               26   
1  SZU9c8V2GuREDN5KgyHFJw       2444                340              163   
2  4xhGQGdGqU60BIznBjqnuA         52                 11                0   
3  ifjluUv4VASwmFqEp8cWlQ         65                 11                1   
4  VeFfrEZ4iWaecrQg6Eq4cg        199                 19               15   
5  bdfZdB2MTXlT6-RBjSIpQg        191                 33                7   
6  xwSWtJcQkzTF6HNm_IMgcg         96                 23                4   
7  -kY_HDP7IMvGl-kBIZVU4A        330                 38               32   
8  18eWJFJbXyR9j_5xfcRLYA        470                 63               34   
9  vLT1KtrA9bWvjFOg-0xVIg        138                 15                3   

   n_dinner_reviews  
0               351  
1              1941  
2                41  
3                53  
4               165  
5               151  
6        