In [None]:
# -------------------------------------------------
# 0. Runtime setup
# -------------------------------------------------
!pip -q install -U sentence-transformers datasets faiss-cpu ipywidgets

import os, warnings, random, time, json, torch, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import faiss
warnings.filterwarnings('ignore')
print('PyTorch:', torch.__version__)
print('GPU available:', torch.cuda.is_available())

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m123.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
import pandas as pd

# 1. Mount Google Drive so the dataset files below are reachable under /content/drive
drive.mount('/content/drive')

# 2. Point directly to the files in your Drive
#    (move your CSVs into MyDrive or a subfolder first)
RECIPES_PATH      = '/content/drive/MyDrive/dataset/RAW_recipes.csv'
INTERACTIONS_PATH = '/content/drive/MyDrive/dataset/RAW_interactions.csv'

# 3. Read the real CSVs
recipes   = pd.read_csv(RECIPES_PATH)
interacts = pd.read_csv(INTERACTIONS_PATH)

# 4. Verify the schemas and preview the interactions table
print("recipes columns:", recipes.columns.tolist())
print("interacts columns:", interacts.columns.tolist())
# `head` must be *called*; the bare attribute only references the bound method
# and never renders the first rows.
interacts.head()

Mounted at /content/drive
recipes columns: ['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']
interacts columns: ['user_id', 'recipe_id', 'date', 'rating', 'review']


In [None]:
# 2. Basic cleaning – build tag strings
def to_list(val):
    """Convert a CSV cell holding a list (or its string repr) into a list of strings.

    Parameters
    ----------
    val : list | str | float | None
        A real list, a stringified list such as ``"['a', 'b']"``, a plain
        comma-separated string, or a missing value (``None`` / float NaN).

    Returns
    -------
    list[str]
        The individual items. Missing values yield ``[]`` instead of the
        literal token ``'nan'`` / ``'None'``.
    """
    import ast  # local import so the block does not depend on the notebook's import cell

    if isinstance(val, list):
        return [str(v) for v in val]
    # NaN is the only float that is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return []
    s = str(val).strip()
    if s.startswith('[') and s.endswith(']'):
        # Parse a proper Python literal first so items that contain commas
        # (e.g. 'salt, to taste') are not split into separate tokens.
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(v).strip() for v in parsed if str(v).strip()]
        # Not a valid literal: fall back to the naive comma split.
        s = s[1:-1]
    return [x.strip(" '\"") for x in s.split(',') if x.strip()]

def build_tag_string(row):
    """Return the row's ingredients followed by its tags as one space-separated string."""
    ingredient_terms = to_list(row.get('ingredients', ''))
    tag_terms = to_list(row.get('tags', ''))
    return ' '.join(ingredient_terms + tag_terms)

recipes['tag_string'] = recipes.apply(build_tag_string, axis=1)

# adjust these keys to your real column names if different
interact_key = 'recipe_id'
rating_key   = 'rating'
if interact_key not in interacts.columns:
    interact_key = 'id'  # or whatever shows in interacts.columns

# Mean rating per recipe (one row per recipe id).
avg_rating = interacts.groupby(interact_key)[rating_key].mean().reset_index()

# Look the average up by recipe id instead of merging. A merge adds new columns
# to `recipes`, so re-running this cell would produce rating_x / rating_y and
# then fail on recipes['rating']; assigning the column makes the cell re-runnable.
rating_by_recipe = avg_rating.set_index(interact_key)[rating_key]
recipes['rating'] = recipes['id'].map(rating_by_recipe)

# Recipes without any interaction get the mean of the per-recipe averages.
recipes['rating'] = recipes['rating'].fillna(recipes['rating'].mean())
display(recipes[['name','tag_string','rating']].head())

Unnamed: 0,name,tag_string,rating
0,arriba baked winter squash mexican style,winter squash mexican seasoning mixed spice ho...,5.0
1,a bit different breakfast pizza,prepared pizza crust sausage patty eggs milk s...,3.5
2,all in the kitchen chili,ground beef yellow onions diced tomatoes tomat...,4.0
3,alouette potatoes,spreadable cheese with garlic and herbs new po...,4.5
4,amish tomato ketchup for canning,tomato juice apple cider vinegar sugar salt pe...,5.0


In [None]:
# 3. Prepare train/dev/test pairs
POS_THRESH = 4.0   # minimum average rating for a recipe to count as a positive pair
SPLIT_SEED = 42    # fixed seed so the train/dev/test split is identical on every run

# Vectorised filter instead of iterrows(): select the well-rated recipes once,
# then build the (query, recipe) dicts from the two needed columns.
positives = recipes.loc[recipes['rating'] >= POS_THRESH, ['tag_string', 'name']]
pairs = [{ 'query': tag_string, 'recipe': name }
         for tag_string, name in zip(positives['tag_string'], positives['name'])]

# A dedicated RNG keeps the shuffle reproducible without touching the global
# `random` state.
random.Random(SPLIT_SEED).shuffle(pairs)

n = len(pairs)
train_end, dev_end = int(0.85*n), int(0.925*n)   # 85% / 7.5% / 7.5%
train, dev, test = pairs[:train_end], pairs[train_end:dev_end], pairs[dev_end:]
print(f"train={len(train)}, dev={len(dev)}, test={len(test)}")

train=165500, dev=14603, test=14604


In [None]:
# 4. DataLoaders
train_ds = [InputExample(texts=[p['query'], p['recipe']]) for p in train]
dev_ds   = [InputExample(texts=[p['query'], p['recipe']]) for p in dev]
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# The dev pairs carry no similarity labels, so a similarity-correlation evaluator
# compares predictions against a constant gold score and its Pearson/Spearman
# values are undefined. The task is retrieval (tag string -> recipe title), so
# evaluate it as retrieval: each dev query has exactly one relevant title.
dev_queries, dev_corpus, dev_relevant = {}, {}, {}
title_to_cid = {}
for qid, p in enumerate(dev):
    title = str(p['recipe'])
    # Identical titles share one corpus entry so duplicates are not scored as misses.
    cid = title_to_cid.setdefault(title, str(len(title_to_cid)))
    dev_corpus[cid] = title
    dev_queries[str(qid)] = str(p['query'])
    dev_relevant[str(qid)] = {cid}

dev_eval = evaluation.InformationRetrievalEvaluator(
    queries=dev_queries,
    corpus=dev_corpus,
    relevant_docs=dev_relevant,
    name='dev',
)

In [None]:
# 5. Load & fine-tune Sentence-BERT
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
loss_fn = losses.MultipleNegativesRankingLoss(model)

# Warm-up length: 10% of the optimisation steps of the single training epoch.
warmup = int(len(train_loader) * 1 * 0.1)

# Collect the training arguments in one place, then launch the run.
fit_kwargs = dict(
    train_objectives=[(train_loader, loss_fn)],
    epochs=1,
    warmup_steps=warmup,
    evaluator=dev_eval,
    evaluation_steps=500,
    output_path='/content/sbert_recipe_model',
)
model.fit(**fit_kwargs)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpaladugulaganesh2001[0m ([33mpaladugulaganesh2001-university-at-buffalo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Dev Pearson Cosine,Dev Spearman Cosine
500,0.5378,No log,,
1000,0.2841,No log,,
1500,0.2366,No log,,
2000,0.211,No log,,
2500,0.1865,No log,,
3000,0.179,No log,,
3500,0.1683,No log,,
4000,0.1529,No log,,
4500,0.1513,No log,,
5000,0.1467,No log,,


In [None]:
# 6. Build FAISS index
# Reload the checkpoint from the path the training cell used as its output_path.
model = SentenceTransformer('/content/sbert_recipe_model')
titles = recipes['name'].tolist()

# Embeddings are unit-normalised, so the inner-product index ranks by cosine similarity.
embs = model.encode(titles, batch_size=128, normalize_embeddings=True)
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embs)
print('FAISS vectors:', index.ntotal)

FAISS vectors: 231637


In [None]:
# 7. Inference helper
def recommend(q, k=5):
    """Return up to ``k`` recipes whose titles are closest to the query ``q``.

    Parameters
    ----------
    q : str
        Free-text query (e.g. ingredients and/or tags).
    k : int, default 5
        Number of nearest neighbours requested from the FAISS index.

    Returns
    -------
    list[dict]
        One dict per hit with keys ``recipe_id``, ``title``, ``rating`` and
        ``score`` (the inner product returned by the index), best match first.
    """
    emb = model.encode([q], normalize_embeddings=True)
    scores, idxs = index.search(emb, k)
    results = []
    for s, i in zip(scores[0], idxs[0]):
        # FAISS pads the result with label -1 when fewer than k vectors are
        # available; iloc[-1] would silently return the last recipe instead.
        if i < 0:
            continue
        row = recipes.iloc[i]  # fetch the row once rather than once per field
        results.append({
            'recipe_id': int(row['id']),
            'title': row['name'],
            'rating': float(row['rating']),
            'score': float(s)
        })
    return results
# Smoke-test the helper with a few example queries.
demo_queries = ['chicken basil', 'vegan dessert', 'spicy mexican']
for q in demo_queries:
    print(f'\n▶ {q}')
    print(recommend(q))


▶ chicken basil
[{'recipe_id': 331639, 'title': 'chicken holy basil', 'rating': 5.0, 'score': 0.975907564163208}, {'recipe_id': 33412, 'title': 'basil chicken', 'rating': 5.0, 'score': 0.9466419219970703}, {'recipe_id': 5396, 'title': 'holy basil chicken', 'rating': 4.0, 'score': 0.9397380948066711}, {'recipe_id': 460386, 'title': 'spicy basil chicken', 'rating': 5.0, 'score': 0.9310060739517212}, {'recipe_id': 52426, 'title': 'garlic basil chicken', 'rating': 5.0, 'score': 0.9124689698219299}]

▶ vegan dessert
[{'recipe_id': 312362, 'title': 'easy vegan ice cream', 'rating': 5.0, 'score': 0.7277508974075317}, {'recipe_id': 501760, 'title': 'vegan ice cream', 'rating': 0.0, 'score': 0.7225935459136963}, {'recipe_id': 353954, 'title': 'vegan chocolate pudding', 'rating': 4.6, 'score': 0.7114630937576294}, {'recipe_id': 70943, 'title': 'vegan pastry cream', 'rating': 3.75, 'score': 0.7055656909942627}, {'recipe_id': 273078, 'title': 'raw vegan ice cream', 'rating': 5.0, 'score': 0.70191

In [None]:
# 8. Save artifacts to Drive
# Both artifacts go under the same Drive folder so they can be reloaded together.
model_dir  = '/content/drive/MyDrive/final/recipe_semantic_model'
index_file = '/content/drive/MyDrive/final/recipe_index.faiss'

model.save(model_dir)
faiss.write_index(index, index_file)
print('Saved model & index!')

Saved model & index!


In [None]:
# Show the column names of the recipes frame.
print(list(recipes.columns))

['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']


In [None]:
# Show the column names of the interactions frame.
print(list(interacts.columns))

['user_id', 'recipe_id', 'date', 'rating', 'review']


In [None]:
# 2. Load the CSVs straight from the mounted Drive (avoids the virus-scan HTML issue)
RECIPES_PATH      = '/content/drive/MyDrive/dataset/RAW_recipes.csv'
INTERACTIONS_PATH = '/content/drive/MyDrive/dataset/RAW_interactions.csv'

# Read both files in the same order as before: recipes first, then interactions.
recipes, interacts = (pd.read_csv(csv_path) for csv_path in (RECIPES_PATH, INTERACTIONS_PATH))

print(f"Recipes columns: {recipes.columns.tolist()}")
print(f"Interactions columns: {interacts.columns.tolist()}")

# 3. Build tag_string column (ingredients + tags)
def to_list(val):
    """Convert a CSV cell holding a list (or its string repr) into a list of strings.

    Parameters
    ----------
    val : list | str | float | None
        A real list, a stringified list such as ``"['a', 'b']"``, a plain
        comma-separated string, or a missing value (``None`` / float NaN).

    Returns
    -------
    list[str]
        The individual items. Missing values yield ``[]`` instead of the
        literal token ``'nan'`` / ``'None'``.
    """
    import ast  # local import so the block does not depend on the notebook's import cell

    if isinstance(val, list):
        return [str(v) for v in val]
    # NaN is the only float that is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return []
    s = str(val).strip()
    if s.startswith('[') and s.endswith(']'):
        # Parse a proper Python literal first so items that contain commas
        # (e.g. 'salt, to taste') are not split into separate tokens.
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(v).strip() for v in parsed if str(v).strip()]
        # Not a valid literal: fall back to the naive comma split.
        s = s[1:-1]
    return [x.strip(" '\"") for x in s.split(',') if x.strip()]

def build_tag_string(row):
    """Return the row's ingredients followed by its tags as one space-separated string."""
    ingredient_terms = to_list(row.get('ingredients', ''))
    tag_terms = to_list(row.get('tags', ''))
    return " ".join(ingredient_terms + tag_terms)

recipes['tag_string'] = recipes.apply(build_tag_string, axis=1)

# 4. Attach the average rating to each recipe
#    Adjust key names if your interactions use a different column for recipe ID
INTERACT_KEY = 'recipe_id' if 'recipe_id' in interacts.columns else 'id'
RATING_COL   = 'rating'

# Compute average rating per recipe (one row per recipe id)
avg_rating = interacts.groupby(INTERACT_KEY)[RATING_COL].mean().reset_index()

# Look the average up by recipes.id instead of merging. This behaves the same
# whether or not `recipes` already has a 'rating' column, so no rating_x /
# rating_y clean-up is needed, and it can never drop the 'id' column when
# INTERACT_KEY falls back to 'id'.
rating_by_recipe = avg_rating.set_index(INTERACT_KEY)[RATING_COL]
recipes['rating'] = recipes['id'].map(rating_by_recipe)

# Recipes without any interaction get the mean of the per-recipe averages, so
# every row has a numeric rating.
recipes['rating'] = recipes['rating'].fillna(recipes['rating'].mean())

# 5. Load the fine-tuned SBERT model and FAISS index
#    Both paths point at the artifacts written by the "Save artifacts to Drive"
#    step (under MyDrive/final/), so this cell also works in a fresh session
#    where the local /content training directory no longer exists.
MODEL_PATH = '/content/drive/MyDrive/final/recipe_semantic_model'
INDEX_PATH = '/content/drive/MyDrive/final/recipe_index.faiss'

model = SentenceTransformer(MODEL_PATH)
index = faiss.read_index(INDEX_PATH)

# FAISS hits are positional: vector i must describe row i of `recipes`.
# Fail loudly if the index and the CSV no longer line up.
if index.ntotal != len(recipes):
    raise ValueError(
        f"FAISS index holds {index.ntotal} vectors but recipes has {len(recipes)} rows; "
        "rebuild the index from the current recipes file."
    )

# 6. Prepare metadata arrays (plain lists, looked up by FAISS position)
titles  = recipes['name'].tolist()
ids     = recipes['id'].tolist()
ratings = recipes['rating'].tolist()

# 7. Inference helper
def recommend(query: str, top_k: int = 5):
    """Return up to ``top_k`` recipes whose titles are closest to ``query``.

    Parameters
    ----------
    query : str
        Free-text query (e.g. ingredients and/or tags).
    top_k : int, default 5
        Number of nearest neighbours requested from the FAISS index.

    Returns
    -------
    list[dict]
        One dict per hit with keys ``recipe_id``, ``title``, ``rating`` and
        ``score`` (the inner product returned by the index), best match first.
    """
    q_emb = model.encode([query], normalize_embeddings=True)
    scores, idxs = index.search(q_emb, top_k)
    # FAISS pads the result with label -1 when fewer than top_k vectors are
    # available; a negative index would silently pick the last recipe.
    return [
        {
            'recipe_id': int(ids[idx]),
            'title':      titles[idx],
            'rating':     float(ratings[idx]),
            'score':      float(score)
        }
        for score, idx in zip(scores[0], idxs[0])
        if idx >= 0
    ]

# 8. Test the model with sample inputs
if __name__ == "__main__":
    # A few multi-term example queries to eyeball the recommendations.
    sample_queries = [
        "chicken tomato basil quick weeknight",
        "vegan gluten-free chocolate dessert",
        "spicy mexican healthy low-carb"
    ]
    for q in sample_queries:
        print(f"\nQuery ▶ {q}")
        recs = recommend(q, top_k=5)
        for r in recs:
            score, title, rating = r['score'], r['title'], r['rating']
            print(f"  Score: {score:.3f}  |  Title: {title}  |  ★ {rating:.2f}")