In [None]:
# Cell 0
!pip install -q sentence-transformers faiss-cpu openpyxl pandas scikit-learn

# If you want GPU and plan to fine-tune, use the GPU runtime (Runtime > Change runtime type)
# and install faiss-gpu instead: !pip install faiss-gpu

import os, re, math
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Edit these paths if needed (or mount Drive)
input_path1 = "/content/cholesterol.xlsx"   # cholesterol meal-plan dataset (loaded as df_meal)
input_path2 = "/content/yoga.xlsx"          # yoga dataset (all diseases; filtered later)
# Excel file that receives the final top-1 meal -> yoga mapping
output_reco_path = "meal_to_yoga_top1_contrastive_Cholesterol.xlsx"

print("libraries loaded")


libraries loaded


In [None]:
# Cell 1

# Read both workbooks with the same engine; the order matches (meal, yoga).
df_meal, df_yoga = (
    pd.read_excel(workbook_path, engine="openpyxl")
    for workbook_path in (input_path1, input_path2)
)

# Quick schema / size overview
print("Meal columns:", df_meal.columns.tolist())
print("Yoga columns:", df_yoga.columns.tolist())
print("Meal rows:", len(df_meal), "Yoga rows:", len(df_yoga))

# Preview the first two rows of each frame
for preview_frame in (df_meal, df_yoga):
    display(preview_frame.head(2))

Meal columns: ['Plan', 'Meal_Day', 'Meal_Type', 'Meal_Name', 'Meal_Description', 'Calories (kcal)', 'Zinc (mg)', 'Copper (mg)', 'Iron (mg)', 'Chromium (µg)', 'Potassium (mg)', 'Phosphorus (mg)', 'Vitamin_B1 (Thiamin, mg)', 'Vitamin_B2 (Riboflavin, mg)', 'Vitamin_B3 (Niacin, mg)', 'Vitamin_B6 (mg)', 'Vitamin_B9 (Folate, µg)', 'Vitamin_B12 (µg)', 'Selenium (µg)', 'Magnesium (mg)']
Yoga columns: ['Plan', 'Disease', 'Day', 'Time', 'Yoga (Detailed)', 'Exercise (Detailed)', 'Precautions (Detailed)']
Meal rows: 42 Yoga rows: 126


Unnamed: 0,Plan,Meal_Day,Meal_Type,Meal_Name,Meal_Description,Calories (kcal),Zinc (mg),Copper (mg),Iron (mg),Chromium (µg),Potassium (mg),Phosphorus (mg),"Vitamin_B1 (Thiamin, mg)","Vitamin_B2 (Riboflavin, mg)","Vitamin_B3 (Niacin, mg)",Vitamin_B6 (mg),"Vitamin_B9 (Folate, µg)",Vitamin_B12 (µg),Selenium (µg),Magnesium (mg)
0,Plan A,Monday,Breakfast,Oatmeal with Walnuts,"Steel-cut oats with walnuts, flaxseed, and ban...",500.0,2.5,0.4,2.5,8.0,900,250,0.3,0.2,2.0,0.4,80.0,0.1,15,110
1,Plan A,Monday,Lunch,Spinach & Chicken Salad,"Large spinach salad with grilled chicken, chic...",700.0,3.0,0.3,5.5,10.0,1300,450,0.4,0.4,18.0,0.8,150.0,1.2,20,120


Unnamed: 0,Plan,Disease,Day,Time,Yoga (Detailed),Exercise (Detailed),Precautions (Detailed)
0,Plan A,Asthma,Monday,Morning,Sukhasana (Easy Pose) with diaphragmatic breat...,25 minutes of indoor brisk walking at 3-4 km/h...,Conduct session in a well-ventilated indoor ar...
1,Plan A,Asthma,Monday,Afternoon,Seated Parvatasana (Mountain Pose) with interc...,15 minutes of light upper body flexibility exe...,Monitor for any signs of bronchoconstriction; ...


In [None]:
# Cell 2
# robust day extractor
# Matches a full or abbreviated weekday name, or a "Day N" token, anywhere in the text.
days_pattern = r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Tues|Wed|Thu|Thur|Fri|Sat|Sun|Day\s*\d{1,2})\b'

# Every weekday alternative in days_pattern has a unique three-letter prefix,
# so the prefix is enough to recover the canonical full name.
_WEEKDAY_BY_PREFIX = {
    'mon': 'Monday',
    'tue': 'Tuesday',
    'wed': 'Wednesday',
    'thu': 'Thursday',
    'fri': 'Friday',
    'sat': 'Saturday',
    'sun': 'Sunday',
}

def extract_day_from_meal_day(x):
    """Extract a normalized day label from a free-text ``Meal_Day`` value.

    Parameters
    ----------
    x : any
        Raw cell value; it is converted with ``str()`` before matching.

    Returns
    -------
    str or pd.NA
        * The full weekday name ("Monday" ... "Sunday") when a full or
          abbreviated weekday is found. Abbreviations such as "Mon" or "Tues"
          are expanded so they compare equal to the full names used for the
          yoga ``Day`` column.
        * The matched "Day N" token, title-cased with whitespace runs
          collapsed to a single space.
        * ``pd.NA`` when the value is missing or no day token is found.
    """
    if pd.isna(x):
        return pd.NA
    m = re.search(days_pattern, str(x), flags=re.IGNORECASE)
    if m is None:
        return pd.NA
    tok = m.group(1)
    if re.match(r'(?i)^day\s*\d', tok):
        # "day   3" -> "Day 3"
        return re.sub(r'\s+', ' ', tok.title())
    # Expand abbreviations (and fix casing) to the canonical weekday name.
    return _WEEKDAY_BY_PREFIX[tok[:3].lower()]

def mealtype_to_time(mt):
    """Translate a meal-type label into a coarse time-of-day bucket.

    The value is lower-cased and stripped, then checked for keywords in a
    fixed priority order: Morning, Afternoon, Evening, Anytime. The first
    bucket with a keyword contained in the text wins. Missing values give
    ``pd.NA``; text with no known keyword is returned as ``str(mt).title()``.
    """
    if pd.isna(mt):
        return pd.NA

    lowered = str(mt).strip().lower()
    # Order matters: earlier buckets take precedence over later ones.
    keyword_buckets = (
        ('Morning', ('breakfast', 'morning')),
        ('Afternoon', ('lunch', 'afternoon')),
        ('Evening', ('dinner', 'supper', 'evening')),
        ('Anytime', ('snack',)),
    )
    for bucket_label, keywords in keyword_buckets:
        for keyword in keywords:
            if keyword in lowered:
                return bucket_label

    # Unknown label: keep it, just title-cased.
    return str(mt).title()

# Meal dataset: derive normalized Day and Time columns from the raw meal fields
df_meal['Day'] = df_meal['Meal_Day'].apply(extract_day_from_meal_day)
df_meal['Time'] = df_meal['Meal_Type'].apply(mealtype_to_time)

# Yoga dataset normalization: trim whitespace and turn stringified missing values
# back into pd.NA. '<NA>' is what astype(str) produces for pd.NA, so including it
# keeps this cell idempotent when it is re-run on already-normalized columns.
_missing_tokens = {'nan': pd.NA, '<NA>': pd.NA}
df_yoga['Day'] = df_yoga['Day'].astype(str).str.strip().replace(_missing_tokens)
df_yoga['Time'] = df_yoga['Time'].astype(str).str.strip().replace(_missing_tokens)

# Standardize weekday tokens: case-insensitive, and covering all seven days so
# abbreviations such as "Wed" or "THU" also map to the canonical full name.
_weekday_patterns = {
    r'(?i)^(mon|monday)$': 'Monday',
    r'(?i)^(tue|tues|tuesday)$': 'Tuesday',
    r'(?i)^(wed|wednesday)$': 'Wednesday',
    r'(?i)^(thu|thur|thurs|thursday)$': 'Thursday',
    r'(?i)^(fri|friday)$': 'Friday',
    r'(?i)^(sat|saturday)$': 'Saturday',
    r'(?i)^(sun|sunday)$': 'Sunday',
}
df_yoga['Day'] = df_yoga['Day'].replace(_weekday_patterns, regex=True)

# Keep only yoga rows whose Disease mentions "cholesterol" (case-insensitive
# substring match, so e.g. "Hypercholesterolemia" is included).
df_yoga_chol = df_yoga[df_yoga['Disease'].astype(str).str.lower().str.contains('cholesterol', na=False)].copy()

print("After filter: Yoga (cholesterol) rows:", len(df_yoga_chol))
display(df_meal[['Meal_Day','Day','Meal_Type','Time']].head(4))
display(df_yoga_chol[['Disease','Day','Time']].head(4))
# Later cells refer to the filtered yoga frame under this generic name.
df_yoga_filtered = df_yoga_chol

After filter: Yoga (diabetes) rows: 41


Unnamed: 0,Meal_Day,Day,Meal_Type,Time
0,Monday,Monday,Breakfast,Morning
1,Monday,Monday,Lunch,Afternoon
2,Monday,Monday,Dinner,Evening
3,Tuesday,Tuesday,Breakfast,Morning


Unnamed: 0,Disease,Day,Time
42,Hypercholesterolemia,Monday,Morning
43,Hypercholesterolemia,Monday,Afternoon
44,Hypercholesterolemia,Monday,Evening
45,Hypercholesterolemia,Tuesday,Morning


In [None]:
# Cell 3
# Cell 3 (edit)
# Nutrient columns to summarise in the meal text; columns absent from the
# meal sheet are simply skipped when the text is built.
nutrient_cols = [
    'Calories (kcal)','Total_Fat (g)','Protein (g)','Carbohydrates (g)',
    'Dietary_Fiber (g)','Added_Sugar (g)',
    'Vitamin_C (mg)','Vitamin_E (mg)','Vitamin_D (µg)',
    'Magnesium (mg)','Selenium (µg)',
    'Omega-3 (ALA, g)','EPA + DHA (mg)','Zinc (mg)'
]


def meal_context_text(row):
    """Build the single-line text used to embed one meal row.

    Non-null Plan / Day / Time / Meal_Name / Meal_Description fields are
    emitted as "Label: value" segments, followed by an optional
    "Nutrition: name:value, ..." segment listing every column of
    ``nutrient_cols`` that is present and non-null (the unit in parentheses
    is dropped from the name). Segments are joined with " | ".
    """
    labelled_columns = (
        ('Plan', 'Plan'),
        ('Day', 'Day'),
        ('Time', 'Time'),
        ('Meal', 'Meal_Name'),
        ('Desc', 'Meal_Description'),
    )
    segments = [
        f"{label}: {row[column]}"
        for label, column in labelled_columns
        if pd.notna(row.get(column))
    ]

    # Short nutrient summary, e.g. "Calories:500.0, Zinc:2.5"
    nutrient_bits = [
        f"{column.split('(')[0].strip()}:{row[column]}"
        for column in nutrient_cols
        if column in row and pd.notna(row[column])
    ]
    if nutrient_bits:
        segments.append("Nutrition: " + ", ".join(nutrient_bits))

    return " | ".join(segments)

def yoga_context_text(row):
    """Build the single-line text used to embed one yoga row.

    Each non-null field among Plan, Day, Time, Yoga (Detailed),
    Exercise (Detailed) and Precautions (Detailed) becomes a
    "Label: value" segment; segments are joined with " | ".
    """
    labelled_columns = (
        ('Plan', 'Plan'),
        ('Day', 'Day'),
        ('Time', 'Time'),
        ('Yoga', 'Yoga (Detailed)'),
        ('Exercise', 'Exercise (Detailed)'),
        ('Precautions', 'Precautions (Detailed)'),
    )
    segments = []
    for label, column in labelled_columns:
        value = row.get(column)
        if pd.notna(value):
            segments.append(f"{label}: {value}")
    return " | ".join(segments)

# Build the text that will be embedded for every meal / yoga row
df_meal['context_text'] = df_meal.apply(meal_context_text, axis=1)
df_yoga_filtered['context_text'] = df_yoga_filtered.apply(yoga_context_text, axis=1)

# Keep original indices for traceability and switch to a 0..n-1 index.
# Guarded so that re-running this cell does not add a second
# '*_orig_index' column (duplicate column names would make
# row['meal_orig_index'] return a Series instead of a scalar later on).
if 'meal_orig_index' not in df_meal.columns:
    df_meal = df_meal.reset_index(drop=False).rename(columns={'index':'meal_orig_index'})
if 'yoga_orig_index' not in df_yoga_filtered.columns:
    df_yoga_filtered = df_yoga_filtered.reset_index(drop=False).rename(columns={'index':'yoga_orig_index'})

print("Sample meal context:")
display(df_meal[['meal_orig_index','Meal_Name','context_text']].head(3))
print("Sample yoga context:")
display(df_yoga_filtered[['yoga_orig_index','Yoga (Detailed)','context_text']].head(3))


Sample meal context:


Unnamed: 0,meal_orig_index,Meal_Name,context_text
0,0,Oatmeal with Walnuts,Plan: Plan A | Day: Monday | Time: Morning | M...
1,1,Spinach & Chicken Salad,Plan: Plan A | Day: Monday | Time: Afternoon |...
2,2,Baked Salmon & Sweet Potato,Plan: Plan A | Day: Monday | Time: Evening | M...


Sample yoga context:


Unnamed: 0,yoga_orig_index,Yoga (Detailed),context_text
0,42,"Surya Namaskar (Sun Salutation), 5 rounds at a...",Plan: Plan A | Day: Monday | Time: Morning | Y...
1,43,"Trikonasana (Triangle Pose), held for 30 secon...",Plan: Plan A | Day: Monday | Time: Afternoon |...
2,44,"Setu Bandhasana (Bridge Pose), 3 repetitions h...",Plan: Plan A | Day: Monday | Time: Evening | Y...


In [None]:
# Cell 4
def same_day_time(mday, yday, mtime, ytime):
    """Return True only when both the day pair and the time pair match.

    Values are compared as stripped, lower-cased strings. A missing value
    (NA/NaN/None) on either side of a pair never matches, i.e. NA is NOT
    treated as a wildcard here.
    """
    for meal_value, yoga_value in ((mday, yday), (mtime, ytime)):
        if pd.isna(meal_value) or pd.isna(yoga_value):
            return False
        if str(meal_value).strip().lower() != str(yoga_value).strip().lower():
            return False
    return True

# Positive pairs = every (meal, yoga) combination sharing the same Day and Time.
# Each entry is (meal text, yoga text, meal_orig_index, yoga_orig_index);
# ordering is meal-major, with yoga rows in frame order.
pos_pairs = [
    (
        meal_rec['context_text'],
        yoga_rec['context_text'],
        meal_rec['meal_orig_index'],
        yoga_rec['yoga_orig_index'],
    )
    for _, meal_rec in df_meal.iterrows()
    for _, yoga_rec in df_yoga_filtered.iterrows()
    if same_day_time(meal_rec['Day'], yoga_rec['Day'], meal_rec['Time'], yoga_rec['Time'])
]

print("Positive pairs found:", len(pos_pairs))
# show examples
pos_pairs[:3]


Positive pairs found: 82


[('Plan: Plan A | Day: Monday | Time: Morning | Meal: Oatmeal with Walnuts | Desc: Steel-cut oats with walnuts, flaxseed, and banana slices. | Nutrition: Calories:500.0, Magnesium:110, Selenium:15, Zinc:2.5',
  'Plan: Plan A | Day: Monday | Time: Morning | Yoga: Surya Namaskar (Sun Salutation), 5 rounds at a moderate, flowing pace to elevate heart rate and warm up the body. Followed by 5 minutes of Tadasana (Mountain Pose) to center | Exercise: 30 minutes of stationary cycling (moderate intensity, Level 5-6) to improve cardiovascular endurance and stimulate lipid metabolism. 5-minute warm-up and cool-down | Precautions: Ensure a plant-forward breakfast (e.g., oatmeal) is consumed 60-90 minutes prior. Avoid high saturated fat intake (e.g., bacon, butter).',
  0,
  42),
 ('Plan: Plan A | Day: Monday | Time: Morning | Meal: Oatmeal with Walnuts | Desc: Steel-cut oats with walnuts, flaxseed, and banana slices. | Nutrition: Calories:500.0, Magnesium:110, Selenium:15, Zinc:2.5',
  'Plan: Pla

In [None]:
# Cell 5
from sentence_transformers import InputExample

# Tunable training parameters
model_name = "all-mpnet-base-v2"   # base model; can swap
max_pairs_for_training = 20000     # cap to avoid huge training; adjust
batch_size = 16
num_epochs = 2
warmup_steps = 100

# One InputExample per positive pair, texts=[anchor (meal), positive (yoga)].
# Only the first `max_pairs_for_training` pairs are considered, and pairs
# with an empty text on either side are dropped.
train_examples = [
    InputExample(texts=[anchor_text, positive_text])
    for anchor_text, positive_text, _meal_id, _yoga_id in pos_pairs[:max_pairs_for_training]
    if anchor_text and positive_text
]

print("Train examples:", len(train_examples))
if len(train_examples) < 20:
    print("WARNING: fewer than 20 positive pairs — contrastive fine-tuning will be limited. Consider relaxing positive rules or augmenting data.")


Train examples: 82


In [None]:
# Cell 6
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, evaluation

# Load the base encoder; the same model embeds both meal and yoga texts.
model = SentenceTransformer(model_name)

if len(train_examples) >= 2:
    # The recorded run of this cell stopped at an interactive Weights & Biases
    # API-key prompt. Disable W&B reporting (unless the user configured it
    # explicitly) so training can run unattended.
    os.environ.setdefault("WANDB_DISABLED", "true")
    os.environ.setdefault("WANDB_MODE", "disabled")

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    # In-batch negatives: every other positive in the batch is treated as a negative.
    # NOTE(review): the same meal/yoga text can occur in several pairs, so a batch
    # may contain true positives as "negatives" — consider a no-duplicates sampler.
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Clamp warm-up to roughly 10% of the total optimisation steps. With a small
    # pair set the configured warmup_steps can exceed the whole run, in which
    # case the learning rate would never leave its warm-up ramp.
    total_steps = len(train_dataloader) * num_epochs
    effective_warmup_steps = min(warmup_steps, max(1, total_steps // 10))
    print("Total training steps:", total_steps, "| warmup steps used:", effective_warmup_steps)

    # optional: small evaluator can be added if you have dev set
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epochs,
              warmup_steps=effective_warmup_steps,
              show_progress_bar=True)
    # save fine-tuned model
    ft_model_path = "sbert_meal_yoga_contrastive"
    model.save(ft_model_path)
    print("Saved fine-tuned model to", ft_model_path)
else:
    print("Skipping fine-tuning due to insufficient pairs; using base model.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkshitizsikriwal16[0m ([33mkshitizsikriwal16-central-university-of-haryana[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


In [None]:
# Cell 7
# Prefer the fine-tuned checkpoint when its directory exists on disk;
# otherwise fall back to the base model.
_ft_dir = "sbert_meal_yoga_contrastive"
_model_source = _ft_dir if os.path.exists(_ft_dir) else model_name
model = SentenceTransformer(_model_source)

# Texts to embed (missing context becomes an empty string)
meal_texts, yoga_texts = (
    frame['context_text'].fillna('').tolist()
    for frame in (df_meal, df_yoga_filtered)
)

# Shared encode options; normalize_embeddings=True yields unit-length vectors
# suitable for cosine similarity.
_encode_opts = dict(
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
meal_embeddings = model.encode(meal_texts, **_encode_opts)
yoga_embeddings = model.encode(yoga_texts, **_encode_opts)

print("Emb shapes:", meal_embeddings.shape, yoga_embeddings.shape)


In [None]:
# Cell 8
# Compute top-K internally (we will take top-1); K>1 helps in case top-1 violates
# the business filter. K is clamped to the number of yoga rows because
# kneighbors raises when asked for more neighbours than fitted samples.
K = min(5, len(yoga_texts))
if K == 0:
    raise ValueError("No yoga rows available to recommend from; check the Disease filter.")

nn = NearestNeighbors(n_neighbors=K, metric='cosine').fit(yoga_embeddings)
distances, indices = nn.kneighbors(meal_embeddings)  # distances are cosine distances in [0,2]
# similarity = 1 - distance (since sklearn cosine distance = 1 - cosine_similarity)
sims = 1 - distances


def eq_or_na(a, b):
    """Case-insensitive equality of two labels where a missing value on either side counts as a match."""
    if pd.isna(a) or pd.isna(b):
        return True
    return str(a).strip().lower() == str(b).strip().lower()


results = []
for meal_idx in range(len(meal_texts)):
    # Embedding row i corresponds to the i-th row of the frame, and `indices`
    # holds row positions in yoga_embeddings, so use positional (.iloc) access
    # rather than label-based .loc.
    meal_row = df_meal.iloc[meal_idx]
    mday, mtime = meal_row['Day'], meal_row['Time']

    # Walk the candidates from most to least similar and take the first one that
    # passes the business filter:
    # (Day matches OR Day is NA in either) AND (Time matches OR Time is NA in either)
    selected = None
    for rank in range(K):
        y_idx = int(indices[meal_idx, rank])
        candidate = df_yoga_filtered.iloc[y_idx]
        if eq_or_na(mday, candidate['Day']) and eq_or_na(mtime, candidate['Time']):
            selected = (y_idx, float(sims[meal_idx, rank]))
            break
    # fallback: if still None, choose top candidate regardless
    if selected is None:
        selected = (int(indices[meal_idx, 0]), float(sims[meal_idx, 0]))
    y_idx, sim_score = selected

    # collect merged info
    y_row = df_yoga_filtered.iloc[y_idx]
    merged = {
        'meal_index': int(meal_row['meal_orig_index']),
        'meal_plan': meal_row.get('Plan'),
        'meal_day': meal_row.get('Day'),
        'meal_time': meal_row.get('Time'),
        'meal_name': meal_row.get('Meal_Name'),
        'meal_description': meal_row.get('Meal_Description'),
    }
    # attach nutrients (NaN for columns the meal sheet does not have)
    for c in nutrient_cols:
        merged[c] = meal_row[c] if c in meal_row else np.nan
    # attach yoga fields
    merged.update({
        'yoga_index': int(y_row['yoga_orig_index']),
        'yoga_plan': y_row.get('Plan'),
        'yoga_day': y_row.get('Day'),
        'yoga_time': y_row.get('Time'),
        'yoga_detailed': y_row.get('Yoga (Detailed)'),
        'yoga_exercise': y_row.get('Exercise (Detailed)'),
        'yoga_precautions': y_row.get('Precautions (Detailed)'),
        'similarity': sim_score
    })
    results.append(merged)

df_final = pd.DataFrame(results)
df_final.to_excel(output_reco_path, index=False, engine="openpyxl")
print("Saved final mapping to", output_reco_path)
display(df_final.head(8))


In [None]:
# Cell 9
# Minimum cosine similarity for a recommendation to be kept
threshold = 0.55

_meets_threshold = df_final['similarity'] >= threshold
df_final_filtered = df_final.loc[_meets_threshold].copy()

_kept, _total = len(df_final_filtered), len(df_final)
print("Total recommended (>=%.2f): %d / %d" % (threshold, _kept, _total))

df_final_filtered.to_excel("meal_to_yoga_top1_thresholded.xlsx", index=False, engine="openpyxl")
display(df_final_filtered.head(8))


In [None]:
# Cell 10
# Columns a human annotator needs to judge each meal -> yoga recommendation
_labeling_columns = [
    'meal_index', 'meal_name', 'meal_description',
    'yoga_index', 'yoga_detailed', 'yoga_exercise', 'yoga_precautions',
    'similarity',
]
df_for_labeling = df_final.loc[:, _labeling_columns]
df_for_labeling.to_csv("meal_yoga_for_human_labeling.csv", index=False)
print("Saved meal_yoga_for_human_labeling.csv — annotate with 1=good / 0=bad in a 'label' column, then re-upload for automatic evaluation.")
