In [1]:
import pandas as pd

In [2]:
# Load the two raw CSV exports: recipe metadata and the per-user interactions
# (the ratings are read from the latter further down).
df_recipes, df_interactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Datasets/RAW_recipes.csv',
        '../Datasets/RAW_interactions.csv',
    )
)

In [3]:
# --- Mean rating per recipe ---------------------------------------------------
df_grouped = df_interactions.groupby('recipe_id', group_keys=True)
# One row per recipe_id holding the mean of its ratings. Columns are renamed to
# "id"/"score" so the frame can be merged with df_recipes on "id".
df_ratings = (
    df_grouped['rating']
    .mean()
    .reset_index()
    .rename(columns={"recipe_id": "id", "rating": "score"})
)

# Inner join: only recipes that have at least one interaction are kept.
df_merged = pd.merge(df_recipes, df_ratings, on='id')

# --- Difficulty label ---------------------------------------------------------
# Raw difficulty is (number of steps) x (minutes), bucketed at fixed thresholds.
difficulty_raw = df_merged.n_steps * df_merged.minutes
# pd.cut builds right-closed intervals, so using the column minimum as the first
# edge would leave the minimum itself outside every bin (NaN label). Open-ended
# outer edges put every finite value in a bin and also keep the edges monotonic
# when the data never crosses 150 or 750.
bins = [float('-inf'), 150, 350, 750, float('inf')]
labels = ['easy', 'medium', 'hard', 'very hard']
df_merged['difficulty'] = pd.cut(difficulty_raw, bins=bins, labels=labels)

# Drops the row at *position* 3381 of the merged frame.
# NOTE(review): this magic position depends on the exact CSV contents and merge
# order; consider filtering on the malformed recipe's id instead.
df_merged = df_merged.drop(df_merged.index[[3381]]) #Error in line formatting

In [4]:
import re
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent

# Connect to the Cassandra node on localhost; `session` is reused for every
# prepared statement created in this cell.
cluster = Cluster(
    ['127.0.0.1']
)
session = cluster.connect()

# Character translation tables consumed by the text helpers / insert loop:
#   set_mapping_table         - square brackets -> curly braces
#   description_mapping_table - removes apostrophes
#   apostrophe_mapping_table  - doubles apostrophes
set_mapping_table = str.maketrans(
    {'[': '{', ']': '}'}
)
description_mapping_table = str.maketrans(
    {"'": ""}
)
apostrophe_mapping_table = str.maketrans(
    {"'": "''"}
)

# Prepared INSERT statements, one per target table.
query_insert_recipes_by_date_submitted = session.prepare(
    "INSERT INTO recipe.recipes_by_date_submitted (date_submitted,score,id,name) VALUES (?,?,?,?)"
)
# No insert is prepared for recipes_by_name: it is a materialized view, so it
# already receives its data from recipes_by_date_submitted.
query_insert_recipes_by_difficulty = session.prepare(
    "INSERT INTO recipe.recipes_by_difficulty (difficulty,score,id,name,date_submitted) VALUES (?,?,?,?,?)"
)
query_insert_recipes = session.prepare(
    "INSERT INTO recipe.recipes (name,id,minutes,contributor_id,score,date_submitted,tags,nutrition,steps,number_of_steps,description,ingredients,number_of_ingredients,difficulty) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
)
query_insert_recipes_by_tag = session.prepare(
    "INSERT INTO recipe.recipes_by_tag (tags,date_submitted,score,id,name,minutes) VALUES (?,?,?,?,?,?)"
)

# Accumulates (prepared_statement, parameters) pairs for execute_concurrent.
statements_and_params = list()

# Per-table buffers (not filled anywhere in this cell).
data_insert_recipes_by_date_submitted = list()
data_insert_recipes_by_difficulty = list()
data_insert_recipes = list()
data_insert_recipes_by_tag = list()

def convertToListOfFloat(text):
    """Parse the textual form of a numeric list (e.g. "[51.5, 0.0, 13.0]").

    Surrounding square brackets are removed, the remainder is split on commas,
    asterisks and newlines, and every non-empty piece is converted with float().

    Args:
        text: String representation of the list.

    Returns:
        list[float]: Parsed values in their original order; empty when the
        input holds no values.

    Raises:
        ValueError: If a non-empty piece is not a valid float literal.
    """
    text = text.strip('[]')
    # Raw string: "\*" inside a normal string literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+).
    pieces = (piece.strip() for piece in re.split(r',|\*|\n', text))
    return [float(piece) for piece in pieces if piece]

def convertToSetOrListOfText(text, typeSelector):
    """Parse the textual form of a list of strings into a list of clean strings.

    The input looks like "['a', "b's", 'c']". Outer delimiters are removed, the
    remainder is split on the quote-comma boundaries between elements (and on
    asterisks / newlines), and the quote characters wrapping each element are
    stripped.

    Element text is returned verbatim: the values are bound through prepared
    statements, so CQL-literal escaping (doubling apostrophes) must not be
    applied here or the doubled characters would be stored as data. Brackets
    inside an element are likewise left untouched.

    Args:
        text: String representation of the collection.
        typeSelector: 'set' strips both square brackets and curly braces from
            the ends of ``text``; any other value strips square brackets only.

    Returns:
        list[str]: The parsed elements in order (a list for both selectors).
    """
    outer_delimiters = '[]{}' if typeSelector == 'set' else '[]'
    text = text.strip(outer_delimiters)

    result = []
    # Raw string: "\*" inside a normal string literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+).
    for kv in re.split(r"""', |", |\*|\n""", text):
        kv = kv.strip()
        if not kv:
            continue
        # Remove the quote style this element was written with.
        if kv[0] == '"':
            kv = kv.strip('"')
        elif kv[0] == "'":
            kv = kv.strip("'")
        result.append(kv)
    return result
 
# Queue one INSERT per target table for every recipe row. Nothing is sent to
# Cassandra here; the pairs are only collected in statements_and_params.
for index, row in df_merged.iterrows():
    # Values shared by several of the four statements below.
    submitted_text = str(row['submitted'])
    name_text = str(row['name'])
    difficulty_text = str(row['difficulty'])
    recipe_id = row['id']
    recipe_score = row['score']
    tag_values = convertToSetOrListOfText(str(row['tags']), 'set')

    statements_and_params.append((
        query_insert_recipes_by_date_submitted,
        (submitted_text, recipe_score, recipe_id, name_text),
    ))

    statements_and_params.append((
        query_insert_recipes_by_difficulty,
        (difficulty_text, recipe_score, recipe_id, name_text, submitted_text),
    ))

    # Parameter order follows the column list of the recipes INSERT statement.
    statements_and_params.append((
        query_insert_recipes,
        (
            name_text,
            recipe_id,
            row['minutes'],
            row['contributor_id'],
            recipe_score,
            submitted_text,
            tag_values,
            convertToListOfFloat(str(row['nutrition'])),
            convertToSetOrListOfText(str(row['steps']), 'list'),
            row['n_steps'],
            # description_mapping_table removes every apostrophe.
            # NOTE(review): values are bound via a prepared statement, so
            # confirm whether this stripping is still wanted.
            str(row['description']).translate(description_mapping_table),
            convertToSetOrListOfText(str(row['ingredients']), 'set'),
            row['n_ingredients'],
            difficulty_text,
        ),
    ))

    statements_and_params.append((
        query_insert_recipes_by_tag,
        (tag_values, submitted_text, recipe_score, recipe_id, name_text, row['minutes']),
    ))

In [5]:
import time

# Send every queued (statement, parameters) pair to Cassandra and report the
# wall-clock duration. perf_counter is monotonic, so the measurement is not
# affected by system clock adjustments.
starting_time = time.perf_counter()

try:
    # raise_on_first_error=True: the first failed insert raises instead of
    # being reported in the returned results.
    results = execute_concurrent(
        session, statements_and_params, raise_on_first_error=True)

    elapsed_time = time.perf_counter() - starting_time
    # %H:%M:%S of gmtime(elapsed) wraps around for runs of 24 hours or more.
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
finally:
    # Always release the driver's connections, including when an insert fails.
    cluster.shutdown()

Execution time: 00:03:45
