In [1]:
# IPython shell capture: runs the gcloud CLI and stores its output lines as a list.
project = !gcloud config get-value project
# The first output line is taken as the project id used by the BigQuery clients below.
PROJECT_ID = project[0]
# Bare expression so the notebook echoes the resolved value.
PROJECT_ID

'recipe-recommendation-2024'

In [10]:
# !pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=1442972 sha256=c1ba5fe84a0db14dd163534155f85cedb8f8fcbe2131cdecf4df985217818805
  Stored in directory: /home/jupyter/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import kfp

from kfp.v2 import dsl
from kfp.v2.dsl import pipeline
from kfp.v2.dsl import component
from kfp.v2.dsl import OutputPath
from kfp.v2.dsl import InputPath


from kfp.v2.dsl import Output
from kfp.v2.dsl import Metrics

from kfp.v2 import compiler

from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component,
                        Markdown)

from google.cloud import aiplatform
from google.cloud import bigquery
from google.cloud.aiplatform import pipeline_jobs

  from kfp.v2 import dsl


### Get Data Component (getData.py)

In [3]:
# Complete the coding logic first, then convert it to pipeline components

def get_interactions_data():
    """Load the (user_id, recipe_id, rating) rows from BigQuery.

    Runs a fixed SELECT against the ``RecipeQuery.interactions`` table with a
    client bound to the notebook-level ``PROJECT_ID`` and returns the result
    of ``to_dataframe()``.

    Requires: pandas, google-cloud-bigquery.
    """
    import pandas as pd
    from google.cloud import bigquery

    sql = """
    SELECT user_id, recipe_id, rating 
    FROM `recipe-recommendation-2024.RecipeQuery.interactions`
    """

    bq_client = bigquery.Client(project=PROJECT_ID)
    return bq_client.query(sql).to_dataframe()
    
def get_recipes_data():
    """Load the recipe attribute columns from BigQuery.

    Runs a fixed SELECT against the ``RecipeQuery.recipes`` table with a
    client bound to the notebook-level ``PROJECT_ID`` and returns the result
    of ``to_dataframe()``.

    Requires: pandas, google-cloud-bigquery.
    """
    import pandas as pd
    from google.cloud import bigquery

    recipes_sql = """
    SELECT name,minutes,tags,nutrition,n_steps,ingredients,n_ingredients,description
    FROM `recipe-recommendation-2024.RecipeQuery.recipes`
    """

    bq_client = bigquery.Client(project=PROJECT_ID)
    return bq_client.query(recipes_sql).to_dataframe()
    

### Data Preprocessing + Feature Engineering (DataPreprocessing.py)

In [9]:
# import getData

# raw_interactions_data = getData.get_interactions_data()
# raw_recipes_data = getData.get_recipes_data()

# Fetch both raw tables through the query helpers defined in the previous cell.
raw_interactions = get_interactions_data()
raw_recipes = get_recipes_data()

def process_interactions_data(raw_interactions,
                              minimum_percent_nr=0.005,
                              minimum_percent_nu=0.005):
    """Drop interactions that involve rarely-active users or rarely-rated recipes.

    Both filters are computed from the unfiltered input (they are not applied
    iteratively), then combined with a logical AND.

    Args:
        raw_interactions: DataFrame with at least ``user_id`` and ``recipe_id``
            columns.
        minimum_percent_nr: A user is kept when the number of distinct recipes
            they rated is at least this fraction of all distinct recipes.
            Defaults to 0.005 (0.5%).
        minimum_percent_nu: A recipe is kept when the number of distinct users
            who rated it is at least this fraction of all distinct users.
            Defaults to 0.005 (0.5%).

    Returns:
        The rows of ``raw_interactions`` whose user and recipe both pass their
        threshold; the original index labels are preserved.
    """
    # Totals used to turn the percentage thresholds into absolute counts.
    # dropna=False matches len(series.unique()), which counts NaN as a value.
    n_recipes = raw_interactions['recipe_id'].nunique(dropna=False)
    n_users = raw_interactions['user_id'].nunique(dropna=False)

    # Distinct recipes per user / distinct users per recipe. No sorting is
    # needed: the ids are only used for membership tests below.
    recipes_per_user = raw_interactions.groupby('user_id')['recipe_id'].nunique()
    users_per_recipe = raw_interactions.groupby('recipe_id')['user_id'].nunique()

    min_recipes_per_user = minimum_percent_nr * n_recipes
    min_users_per_recipe = minimum_percent_nu * n_users

    selected_user_ids = recipes_per_user[recipes_per_user >= min_recipes_per_user].index
    selected_item_ids = users_per_recipe[users_per_recipe >= min_users_per_recipe].index

    # Keep a row only when BOTH its user and its recipe were selected.
    keep = (raw_interactions['user_id'].isin(selected_user_ids)
            & raw_interactions['recipe_id'].isin(selected_item_ids))
    return raw_interactions[keep]

def process_recipes_data(raw_recipes, sample_frac=0.1, random_state=1):
    """Sample the recipes table and expand its list-valued columns into features.

    Steps:
      1. Drop rows containing any missing value and take a random sample.
      2. Parse the ``tags``, ``nutrition`` and ``ingredients`` columns from
         their string form into Python lists with ``ast.literal_eval``.
      3. Expand ``nutrition`` into one column per nutrient name.
      4. One-hot encode ``tags`` and ``ingredients`` (one 0/1 column per
         distinct value) and drop the original list columns.

    Args:
        raw_recipes: DataFrame containing at least ``tags``, ``nutrition`` and
            ``ingredients`` columns holding list literals as strings.
        sample_frac: Fraction of the non-null rows to keep. Defaults to 0.1
            because the full table is too large for testing.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs
            select the same rows. Pass ``None`` for an unseeded sample.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``raw_recipes`` is not
        modified.

    Requires: pandas (``ast`` is part of the standard library).
    """
    import ast
    import pandas as pd

    # drop=True discards the old index instead of materialising it as an
    # 'index' column that would have to be removed again later.
    X = (
        raw_recipes.dropna()
        .sample(frac=sample_frac, random_state=random_state)
        .reset_index(drop=True)
    )

    # Turn the string-encoded lists into real lists, one column at a time.
    for col in ['tags', 'nutrition', 'ingredients']:
        X[col] = X[col].apply(ast.literal_eval)

    # Pair each nutrition value with its name, by position.
    keys = ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
    X['nutrition'] = X['nutrition'].apply(lambda values: dict(zip(keys, values)))

    # Feature engineering: one column per nutrient.
    X = pd.concat([X, X['nutrition'].apply(pd.Series)], axis=1)
    X.pop('nutrition')

    # One-hot encode tags and ingredients.
    # NOTE(review): dummy columns are named after the raw tag/ingredient text,
    # so an ingredient such as 'sugar' would presumably duplicate the nutrition
    # column of the same name - confirm whether a prefix is wanted.
    X = pd.concat([X, X['tags'].str.join('|').str.get_dummies()], axis=1)
    X.pop('tags')

    X = pd.concat([X, X['ingredients'].str.join('|').str.get_dummies()], axis=1)
    X.pop('ingredients')

    return X
        

Unnamed: 0,name,minutes,n_steps,n_ingredients,description,calories,total fat,sugar,sodium,protein,...,yukon gold potato,yukon gold potatoes,za'atar spice mix,zatarain cajun seasoning,zinfandel,ziploc bag,ziploc bags,ziti pasta,zucchini,zucchini with italian-style tomato sauce
0,a smoked beef brisket with rub and a texas bb...,10,30,27,the flavors in the rub are oil soluble and the...,541.5,40.0,109.0,69.0,79.0,...,0,0,0,0,0,0,0,0,0,0
1,new ginger wine christmas holiday ginger wine,1445,6,7,"i haven't had traditional ginger wine, but i d...",231.3,0.0,97.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,beau monde seasoning replacement,5,1,5,the beau monde seasoning that i've used has a ...,3.3,0.0,0.0,24.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,mushroom white bean and leek ragot,25,11,9,vegetarian times. april 2007. i would top mine...,355.4,2.0,15.0,2.0,50.0,...,0,0,0,0,0,0,0,0,0,0
4,mint chocolate cake with mint cream cheese ici...,50,8,11,i got this recipe from group recipes and it wa...,862.4,70.0,347.0,16.0,19.0,...,0,0,0,0,0,0,0,0,0,0


### Training model and save model to GCP Bucket path (ModelTraining.py)

In [None]:

# Run preprocessing on the frames loaded by the data-loading cell. Those frames
# are named raw_interactions / raw_recipes; the *_data names appear only in
# commented-out code and are never defined.
processed_interactions = process_interactions_data(raw_interactions)
processed_recipes = process_recipes_data(raw_recipes)

def CF_training(data, bucket_name="model_rcs", pickle_file_name="CF_knn_model.pkl"):
    """Train a user-based KNN collaborative-filtering model and upload it to GCS.

    A small grid over similarity measure and neighbourhood size ``k`` is
    evaluated on a fixed 80/20 split; the combination with the lowest MAE is
    refit on the training split, pickled, and uploaded.

    Args:
        data: DataFrame whose columns are user, item, rating, in that order
            (the layout ``surprise.Dataset.load_from_df`` expects).
        bucket_name: Destination GCS bucket. Defaults to ``"model_rcs"``.
        pickle_file_name: Object name of the uploaded pickle. Defaults to
            ``"CF_knn_model.pkl"``.

    Raises:
        ValueError: If no parameter combination produced a comparable MAE
            (for example, every score was NaN).

    Requires: scikit-surprise, google-cloud-storage.
    """
    from surprise.model_selection import train_test_split
    from surprise import KNNBasic
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy

    import pickle

    from google.cloud import storage

    # Convert to surprise format.
    reader = Reader(rating_scale=(0, 5))
    dataset = Dataset.load_from_df(data, reader)

    # The split is seeded, so repeating it yields the same train/test sets;
    # do it once and run the grid search once.
    trainset, testset = train_test_split(dataset, test_size=0.2, random_state=1)

    # Find the parameters that give the lowest MAE on the held-out split.
    best_mae = float('inf')
    best_params = None
    for sim in ['MSD', 'pearson']:
        # Note: setting k too high might cause float-division errors on a sparse dataset.
        for k in [1, 5, 10, 20]:
            # User-based CF, as there are a lot more unique items than unique users.
            algo = KNNBasic(k=k, sim_options={'name': sim, 'user_based': True}, verbose=False)
            algo.fit(trainset)
            mae = accuracy.mae(algo.test(testset), verbose=False)
            if mae < best_mae:
                best_mae = mae
                best_params = (sim, k)

    if best_params is None:
        raise ValueError("CF grid search did not yield a valid MAE for any parameter combination")

    # Train the model with the best parameters.
    best_sim, best_k = best_params
    algo = KNNBasic(k=best_k, sim_options={'name': best_sim, 'user_based': True}, verbose=False)
    algo.fit(trainset)
    print("Finish Training CF model")
    model_pickle = pickle.dumps(algo)

    # Upload the pickled model to GCS.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(pickle_file_name)
    blob.upload_from_string(model_pickle)
    print(f"Successfully upload CF model into GCS Bucket. Path:{bucket_name}/{pickle_file_name}")
    

# KNN Clustering
def CB_training_1(X, bucket_name="model_rcs", pickle_file_name="CB_knn_model.pkl"):
    """Fit a nearest-neighbours model on standardised recipe features and upload it to GCS.

    Args:
        X: Feature matrix accepted by ``StandardScaler.fit_transform``.
        bucket_name: Destination GCS bucket. Defaults to ``"model_rcs"``.
        pickle_file_name: Object name of the uploaded pickle. Defaults to
            ``"CB_knn_model.pkl"``.

    Requires: scikit-learn, google-cloud-storage.
    """
    from sklearn.neighbors import NearestNeighbors
    import sklearn.preprocessing as pp

    import pickle

    # Imported locally: this function uses the storage client itself and
    # cannot rely on another function's local import.
    from google.cloud import storage

    scaler = pp.StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # NOTE(review): only the KNN model is persisted; the fitted scaler is
    # discarded, so queries at inference time presumably need the same scaling
    # reproduced elsewhere - confirm.
    knn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X_scaled)
    print("Finish Training CB KNN model")

    model_pickle = pickle.dumps(knn)

    # Upload the pickled model to GCS.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(pickle_file_name)
    blob.upload_from_string(model_pickle)
    print(f"Successfully upload CB model into GCS Bucket. Path:{bucket_name}/{pickle_file_name}")
    
    
    
# CF_training(processed_interactions)
# Split the processed recipes: X_1 holds every column except the two text
# columns and is what the content-based model is trained on; X_2 keeps the
# text columns on their own.
X_1 = processed_recipes.drop(columns=['name', 'description'])
X_2 = processed_recipes[['name', 'description']]
CB_training_1(X_1)
    
    