In [7]:
import pandas as pd

In [8]:
# Read the two raw CSV exports: recipe metadata and the per-recipe user interactions.
df_recipes, df_interactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Datasets/RAW_recipes.csv',
        '../../Datasets/RAW_interactions.csv',
    )
)

In [23]:
# Mean rating per recipe, exposed as columns `id` / `score` so that it can be
# joined to df_recipes on `id`.
df_grouped = df_interactions.groupby('recipe_id', group_keys=True)
df_ratings = (
    df_grouped['rating']
    .mean()
    .rename('score')
    .rename_axis('id')
    .reset_index()
)

# Inner join: only recipes that have at least one interaction are kept.
df_merged = pd.merge(df_recipes, df_ratings, on='id')

# Difficulty heuristic: n_steps * minutes, bucketed into four labels.
# The outer edges are open-ended so that the smallest value is not dropped
# (pd.cut intervals are right-closed, so a lower edge equal to the column
# minimum would exclude that minimum and yield NaN) and so that the bins stay
# strictly increasing whatever the data range is.
difficulty_score = df_merged.n_steps * df_merged.minutes
bins = [float('-inf'), 150, 350, 750, float('inf')]
labels = ['easy', 'medium', 'hard', 'very hard']
df_merged['difficulty'] = pd.cut(difficulty_score, bins=bins, labels=labels)

# Drops the row at position 3381 of the merged frame, which the original author
# flagged as malformed.
# NOTE(review): this is positional, so it depends on the row order produced by
# the merge above - confirm it still targets the intended recipe if inputs change.
df_merged = df_merged.drop(df_merged.index[[3381]]) #Error in line formatting

In [24]:
# Restrict the working set to the first 30000 merged rows.
MAX_RECIPES = 30000
df_merged = df_merged.head(MAX_RECIPES)

In [25]:
import re
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent
from cassandra import ConsistencyLevel

# Contact points handed to the driver when building the Cluster object.
cassandra_nodes = [
    '192.168.1.201',
    '192.168.1.202',
    '192.168.1.203',
    '192.168.1.204',
    '192.168.1.205',
    '192.168.1.206',
]
cluster = Cluster(cassandra_nodes)
# Session shared by every prepare/execute call in the cells below.
session = cluster.connect()

In [26]:
# Translation tables used by the text-conversion helpers and the row loop below.
set_mapping_table = str.maketrans({'[': '{', ']': '}'})        # square brackets -> curly braces
description_mapping_table = str.maketrans({"'": ""})           # delete apostrophes
apostrophe_mapping_table = str.maketrans({"'": "''"})          # double every apostrophe


def _prepare_quorum(cql):
    """Prepare `cql` on the shared session and set its consistency level to QUORUM."""
    statement = session.prepare(cql)
    statement.consistency_level = ConsistencyLevel.QUORUM
    return statement


# One prepared INSERT per target table.
query_insert_recipes_by_month_submitted = _prepare_quorum("INSERT INTO recipe.recipes_by_month_submitted (month_submitted,score,id,name) VALUES (?,?,?,?)")
query_insert_recipes_by_difficulty = _prepare_quorum("INSERT INTO recipe.recipes_by_difficulty (difficulty,score,id,name) VALUES (?,?,?,?)")
query_insert_recipes = _prepare_quorum("INSERT INTO recipe.recipes (name,tags,date_submitted,score,id,minutes,contributor_id,nutrition,steps,number_of_steps,description,ingredients,number_of_ingredients,difficulty) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
query_insert_recipes_by_tag = _prepare_quorum("INSERT INTO recipe.recipes_by_tag (tag,date_submitted,id,score,name) VALUES (?,?,?,?,?)")
# No insert is prepared for recipes_by_tag_by_score: per the original author it is a
# materialized view and is therefore populated from its base table.

# (statement, parameters) pairs collected per table; consumed by execute_concurrent later.
data_insert_recipes_by_month_submitted = []
data_insert_recipes_by_difficulty = []
data_insert_recipes = []
data_insert_recipes_by_tag = []

def convertToListOfFloat(text):
    """Parse a bracketed list literal such as ``"[51.5, 0.0, 13.0]"`` into floats.

    Leading/trailing square brackets are removed, the remainder is split on
    commas, asterisks and newlines, and every non-blank piece is converted
    with ``float``.

    Args:
        text: String representation of the list.

    Returns:
        A list of floats in the order they appear in ``text`` (empty when no
        non-blank piece is found).

    Raises:
        ValueError: If a non-blank piece cannot be converted by ``float``.
    """
    result = []
    text = text.strip('[]')
    # Raw string: "\*" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    for kv in re.split(r',|\*|\n', text):
        kv = kv.strip()
        if not kv:
            continue
        result.append(float(kv))
    return result

def convertToSetOrListOfText(text, typeSelector):
    """Parse the string form of a list of quoted strings into a list of texts.

    Args:
        text: String such as ``"['a', 'b']"``.
        typeSelector: ``'set'`` first maps square brackets to curly braces
            (via ``set_mapping_table``) and strips curly braces; any other
            value strips square brackets.

    Returns:
        A list (for both selectors) of the non-empty elements, with the
        surrounding quotes removed and every apostrophe doubled through
        ``apostrophe_mapping_table``.
    """
    result = []
    if typeSelector == 'set':
        beginning = '{'
        end = '}'
        text = text.translate(set_mapping_table)
    else:
        beginning = '['
        end = ']'
    text = text.strip(beginning + end)
    # Elements are separated by a closing quote followed by ", "; '*' and
    # newlines are treated as separators too. Raw string avoids the invalid
    # "\*" escape of a normal string literal.
    for kv in re.split(r"', |\", |\*|\n", text):
        kv = kv.strip()
        if len(kv) == 0:
            continue
        elif kv[0] == '"':
            kv = kv.strip('"')
        elif kv[0] == "'":
            kv = kv.strip("'")
        # NOTE(review): doubling apostrophes is string-literal escaping; the
        # callers in this notebook bind the values to prepared statements, so
        # presumably the doubled quotes end up stored as-is - confirm before
        # changing.
        kv = kv.translate(apostrophe_mapping_table)
        # Check the element, not the accumulator: an element that was only
        # quotes (e.g. '') is empty after stripping and must be skipped.
        if kv != '':
            result.append(kv)
    return result
 
# Build the (prepared statement, bound parameters) pairs for every target table,
# one merged recipe row at a time. Nothing is sent to the cluster here.
for index, row in df_merged.iterrows():
    tags = convertToSetOrListOfText(str(row['tags']), 'set')
    ingredients = convertToSetOrListOfText(str(row['ingredients']), 'set')
    steps = convertToSetOrListOfText(str(row['steps']), 'list')
    nutrition = convertToListOfFloat(str(row['nutrition']))

    # String forms shared by several of the tables below.
    name = str(row['name'])
    submitted = str(row['submitted'])
    difficulty = str(row['difficulty'])
    # Drop the last three characters of the submission date to get the month key.
    month_submitted = str(row['submitted'][:-3])
    # Apostrophes are removed from the description (see description_mapping_table).
    description = str(row['description']).translate(description_mapping_table)

    data_insert_recipes_by_month_submitted.append(
        (query_insert_recipes_by_month_submitted, (month_submitted, row['score'], row['id'], name))
    )
    data_insert_recipes_by_difficulty.append(
        (query_insert_recipes_by_difficulty, (difficulty, row['score'], row['id'], name))
    )
    data_insert_recipes.append(
        (query_insert_recipes, (
            name, tags, submitted, row['score'], row['id'], row['minutes'],
            row['contributor_id'], nutrition, steps, row['n_steps'], description,
            ingredients, row['n_ingredients'], difficulty,
        ))
    )
    # One recipes_by_tag row per non-empty tag of the recipe.
    for tag in tags:
        if tag != '':
            data_insert_recipes_by_tag.append(
                (query_insert_recipes_by_tag, (tag, submitted, row['id'], row['score'], name))
            )

In [27]:
import time

def split(a, n):
    """Lazily cut sequence ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices hold one element more than the others.
    Returns a generator of slices; raises ZeroDivisionError when ``n`` is 0.
    """
    size, remainder = divmod(len(a), n)
    # Slice boundaries: entry idx is where chunk idx starts, entry idx + 1 where it ends.
    edges = [idx * size + min(idx, remainder) for idx in range(n + 1)]
    return (a[lo:hi] for lo, hi in zip(edges, edges[1:]))

# Bulk-insert the recipes_by_month_submitted rows and report the elapsed time.
print("Inserting into recipe.recipes_by_month_submitted")
starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals
execute_concurrent(session, data_insert_recipes_by_month_submitted, raise_on_first_error=True, concurrency=10)
elapsed_time = time.perf_counter() - starting_time
# Split into whole seconds and zero-padded microseconds with integer arithmetic;
# slicing repr(elapsed_time) breaks on exponent notation and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}" + '\n')

Inserting into recipe.recipes_by_month_submitted
Execution time: 00:00:15.494791



In [28]:
# Bulk-insert the recipes_by_difficulty rows and report the elapsed time.
print("Inserting into recipe.recipes_by_difficulty")
starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals
execute_concurrent(session, data_insert_recipes_by_difficulty, raise_on_first_error=True, concurrency=10)
elapsed_time = time.perf_counter() - starting_time
# Split into whole seconds and zero-padded microseconds with integer arithmetic;
# slicing repr(elapsed_time) breaks on exponent notation and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}" + '\n')

Inserting into recipe.recipes_by_difficulty
Execution time: 00:00:17.894894



In [29]:
#Splitting in order to reduce RAM consumption since execute_concurrent return values are garbage collected
# The chunks are consumed straight from the generator returned by split(), so
# only one chunk is materialised at a time instead of all of them up front.
n_parts = 10
starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals
for i, data in enumerate(split(data_insert_recipes, n_parts), start=1):
    print(f"Inserting into recipe.recipes part {i}/{n_parts}")
    execute_concurrent(session, data, raise_on_first_error=True, concurrency=5)
elapsed_time = time.perf_counter() - starting_time
# Split into whole seconds and zero-padded microseconds with integer arithmetic;
# slicing repr(elapsed_time) breaks on exponent notation and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}" + '\n')

Inserting into recipe.recipes part 1/10
Inserting into recipe.recipes part 2/10
Inserting into recipe.recipes part 3/10
Inserting into recipe.recipes part 4/10
Inserting into recipe.recipes part 5/10
Inserting into recipe.recipes part 6/10
Inserting into recipe.recipes part 7/10
Inserting into recipe.recipes part 8/10
Inserting into recipe.recipes part 9/10
Inserting into recipe.recipes part 10/10
Execution time: 00:00:43.073961



In [31]:
# Insert the recipes_by_tag rows chunk by chunk. The chunks are consumed straight
# from the generator returned by split(), so only one chunk is materialised at a
# time instead of all of them up front.
n_parts = 20
starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals
for i, data in enumerate(split(data_insert_recipes_by_tag, n_parts), start=1):
    print(f"Inserting into recipe.recipes_by_tag part {i}/{n_parts}")
    execute_concurrent(session, data, raise_on_first_error=True, concurrency=10)
elapsed_time = time.perf_counter() - starting_time
# Split into whole seconds and zero-padded microseconds with integer arithmetic;
# slicing repr(elapsed_time) breaks on exponent notation and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}" + '\n')

Inserting into recipe.recipes_by_tag part 1/20
Inserting into recipe.recipes_by_tag part 2/20
Inserting into recipe.recipes_by_tag part 3/20
Inserting into recipe.recipes_by_tag part 4/20
Inserting into recipe.recipes_by_tag part 5/20
Inserting into recipe.recipes_by_tag part 6/20
Inserting into recipe.recipes_by_tag part 7/20
Inserting into recipe.recipes_by_tag part 8/20
Inserting into recipe.recipes_by_tag part 9/20
Inserting into recipe.recipes_by_tag part 10/20
Inserting into recipe.recipes_by_tag part 11/20
Inserting into recipe.recipes_by_tag part 12/20
Inserting into recipe.recipes_by_tag part 13/20
Inserting into recipe.recipes_by_tag part 14/20
Inserting into recipe.recipes_by_tag part 15/20
Inserting into recipe.recipes_by_tag part 16/20
Inserting into recipe.recipes_by_tag part 17/20
Inserting into recipe.recipes_by_tag part 18/20
Inserting into recipe.recipes_by_tag part 19/20
Inserting into recipe.recipes_by_tag part 20/20
Execution time: 00:05:12.765662



In [18]:
from cassandra.query import SimpleStatement
import time

#Query 1
# Recipes of a given submission month, highest score first. The statement is run
# once per month below; only the first month's rows are printed (first 20), the
# other result sets are just iterated so that they are fully fetched and timed.
months = [
    '2012-05', '2012-04', '2012-03', '2012-02', '2012-01',
    '2011-12', '2011-11', '2011-10', '2011-09', '2011-08',
]

starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

query = session.prepare("SELECT name, month_submitted, score FROM recipe.recipes_by_month_submitted  WHERE month_submitted = ? ORDER BY score DESC LIMIT 30;")
query.consistency_level = ConsistencyLevel.QUORUM

for run, month in enumerate(months):
    rows = session.execute(query, [month])
    for position, row in enumerate(rows):
        if run == 0 and position < 20:
            print(f"Name: {row.name}, Month Submitted: {row.month_submitted}, Score: {row.score}")

# Mean over the runs actually executed (statement preparation and printing included).
elapsed_time = (time.perf_counter() - starting_time) / len(months)
# Whole seconds plus zero-padded microseconds; slicing repr(elapsed_time) breaks
# on exponent notation (e.g. 5e-05) and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Mean Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}")

Name: asparagus sauteed in butter and mustard, Month Submitted: 2012-05, Score: 5.0
Name: black bean   rice enchiladas  inexpensive vegetarian cuisine, Month Submitted: 2012-05, Score: 5.0
Name: ancho lentils, Month Submitted: 2012-05, Score: 5.0
Name: blueberry strata, Month Submitted: 2012-05, Score: 5.0
Name: bacon double cheeseburger dip, Month Submitted: 2012-05, Score: 5.0
Name: ange s awesome cheeseymite scrolls, Month Submitted: 2012-05, Score: 5.0
Name: blueberry rhubarb pie, Month Submitted: 2012-05, Score: 5.0
Name: almost famous chimichangas, Month Submitted: 2012-05, Score: 5.0
Name: asparagus with butter lemon and mint drizzle, Month Submitted: 2012-05, Score: 5.0
Name: bacon tomato gruyere omelet, Month Submitted: 2012-05, Score: 5.0
Name: blue heron s glacier bay smoked omelet, Month Submitted: 2012-05, Score: 5.0
Name: berry banana protein shake, Month Submitted: 2012-05, Score: 5.0
Name: baked celery and onions with herbs, Month Submitted: 2012-05, Score: 5.0
Name: br

In [19]:
#Query 2
# Full detail of a single recipe looked up by name. The statement is run once per
# name below; only the first recipe is printed, the other result sets are just
# iterated so that they are fully fetched and timed.
recipe_names = [
    'chick greek salad',
    'banana pineapple freeze',
    # Previously this lookup sat after a `continue;` inside a loop body and was
    # never executed, while the mean was still divided by 10.
    'swedish beet salad',
    'authentic bolillos  mexican bread rolls for tortas',
    'pretzels   hard and soft',
    'tasty tomato ramen',
    'chocolate caramel cheesecake',
    'banana berry smoothie  no added sugar  sugarless',
    'vegetable couscous',
    'vegan pastry cream',
]

starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

query = session.prepare("SELECT name, difficulty, nutrition, steps, description, score FROM recipe.recipes WHERE name = ? LIMIT 1;")
query.consistency_level = ConsistencyLevel.QUORUM

for run, recipe_name in enumerate(recipe_names):
    rows = session.execute(query, [recipe_name])
    for row in rows:
        if run == 0:
            print(f"Name: {row.name},\nDifficulty: {row.difficulty},\nNutrition: {row.nutrition},\nSteps: {row.steps},\nDescription: {row.description},\nScore: {row.score}")

# Mean over the runs actually executed (statement preparation and printing included).
elapsed_time = (time.perf_counter() - starting_time) / len(recipe_names)
# Whole seconds plus zero-padded microseconds; slicing repr(elapsed_time) breaks
# on exponent notation (e.g. 5e-05) and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
print('Mean Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}")

Mean Execution time: 00:00:00.011079


In [20]:
#Query 3
# All recipes of a given difficulty. The statement is run once per entry below;
# only the first run prints (first 20 rows), the other result sets are just
# iterated so that they are fully fetched and timed.
# The values must match the labels written at load time ('very hard' with a
# space); 'very_hard' matches no stored label.
difficulties = [
    'easy',
    'easy', 'medium', 'hard', 'very hard',
    'easy', 'medium', 'hard', 'very hard',
    'easy',
]

starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

query = session.prepare("SELECT name, difficulty, score FROM recipe.recipes_by_difficulty WHERE difficulty=?")
query.consistency_level = ConsistencyLevel.QUORUM

for run, difficulty in enumerate(difficulties):
    rows = session.execute(query, [difficulty])
    for position, row in enumerate(rows):
        if run == 0 and position < 20:
            print(f"Name: {row.name}, Difficulty: {row.difficulty}, Score: {row.score}")

# Mean over the runs actually executed (statement preparation and printing included).
elapsed_time = (time.perf_counter() - starting_time) / len(difficulties)
# Whole seconds plus zero-padded microseconds; slicing repr(elapsed_time) breaks
# on exponent notation (e.g. 5e-05) and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
# The value is a per-query mean, so it is labelled as such.
print('Mean Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}")

Name: a jad   cucumber pickle, Difficulty: easy, Score: 5.0
Name: alaskan blueberry pie, Difficulty: easy, Score: 5.0
Name: 3 berry shakes, Difficulty: easy, Score: 5.0
Name: bo peeps, Difficulty: easy, Score: 5.0
Name: bailey s irish cream liqueur, Difficulty: easy, Score: 5.0
Name: blue cheese and pear salad, Difficulty: easy, Score: 5.0
Name: andrea cassoni s buckeyes, Difficulty: easy, Score: 5.0
Name: bayerische erdbeercreme  strawberry barvarian, Difficulty: easy, Score: 5.0
Name: beef liver in mustard sauce, Difficulty: easy, Score: 5.0
Name: apple cheddar cookies, Difficulty: easy, Score: 5.0
Name: banana split sundaes, Difficulty: easy, Score: 5.0
Name: asian snow peas and carrots, Difficulty: easy, Score: 5.0
Name: andouille a la jeannine, Difficulty: easy, Score: 5.0
Name: barbecued beef in crusty rolls, Difficulty: easy, Score: 5.0
Name: beef  apple   arugula appetizers, Difficulty: easy, Score: 5.0
Name: big apple, Difficulty: easy, Score: 5.0
Name: bourbon fog, Difficulty

In [21]:
#Query 4
# Recipes carrying a given tag, ordered by submission date. The statement is run
# once per tag below; only the first tag's rows are printed (first 20), the other
# result sets are just iterated so that they are fully fetched and timed.
tags_to_query = [
    'crock-pot-slow-cooker', 'malaysian', 'spanish', 'shakes', 'mixer',
    '60-minutes-or-less', 'dietary', 'chocolate', 'for-large-groups', 'taste-mood',
]

starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

query = session.prepare("SELECT name, tag, date_submitted FROM recipe.recipes_by_tag WHERE tag = ? ORDER BY date_submitted;")
query.consistency_level = ConsistencyLevel.QUORUM

for run, tag in enumerate(tags_to_query):
    rows = session.execute(query, [tag])
    for position, row in enumerate(rows):
        if run == 0 and position < 20:
            print(f"Name: {row.name}, Tag: {row.tag}, Date Submitted: {row.date_submitted}")

# Mean over the runs actually executed (statement preparation and printing included).
elapsed_time = (time.perf_counter() - starting_time) / len(tags_to_query)
# Whole seconds plus zero-padded microseconds; slicing repr(elapsed_time) breaks
# on exponent notation (e.g. 5e-05) and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
# The value is a per-query mean, so it is labelled as such.
print('Mean Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}")

Name: all purpose crock pot chicken, Tag: crock-pot-slow-cooker, Date Submitted: 1999-08-10
Name: barbecued pork strips, Tag: crock-pot-slow-cooker, Date Submitted: 1999-08-11
Name: braised lamb shanks with barley, Tag: crock-pot-slow-cooker, Date Submitted: 1999-10-28
Name: artichoke and lamb shanks crock pot dinner, Tag: crock-pot-slow-cooker, Date Submitted: 2001-02-15
Name: arroz con queso  crock pot, Tag: crock-pot-slow-cooker, Date Submitted: 2001-05-13
Name: bergie s crock pot pheasant, Tag: crock-pot-slow-cooker, Date Submitted: 2001-05-29
Name: bbq beef for sandwiches, Tag: crock-pot-slow-cooker, Date Submitted: 2001-07-31
Name: denny s  vegetable beef barley soup, Tag: crock-pot-slow-cooker, Date Submitted: 2001-09-03
Name: all day beef stew  no wine, Tag: crock-pot-slow-cooker, Date Submitted: 2001-09-23
Name: bar b q meatballs, Tag: crock-pot-slow-cooker, Date Submitted: 2001-10-01
Name: beef and potatoes with rosemary and thyme   crock pot, Tag: crock-pot-slow-cooker, Date

In [22]:
#Query 5
# Up to 20 recipes for a given tag, read from recipes_by_tag_by_score. The
# statement is run once per tag below; only the first tag's rows are printed, the
# other result sets are just iterated so that they are fully fetched and timed.
tags_to_query = [
    'cocktail', 'to-go', 'kid-friendly', 'dietary', 'desserts',
    'occasion', 'seasonal', 'grilling', 'brunch', 'beginner-cook',
]

starting_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

query = session.prepare("SELECT name, tag, score FROM recipe.recipes_by_tag_by_score WHERE tag = ? LIMIT 20;")
query.consistency_level = ConsistencyLevel.QUORUM

for run, tag in enumerate(tags_to_query):
    rows = session.execute(query, [tag])
    for position, row in enumerate(rows):
        if run == 0 and position < 20:
            print(f"Name: {row.name}, Tag: {row.tag}, Score: {row.score}")

# Mean over the runs actually executed (statement preparation and printing included).
elapsed_time = (time.perf_counter() - starting_time) / len(tags_to_query)
# Whole seconds plus zero-padded microseconds; slicing repr(elapsed_time) breaks
# on exponent notation (e.g. 5e-05) and does not pad.
whole_seconds, microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
# The value is a per-query mean, so it is labelled as such.
print('Mean Execution time:', time.strftime("%H:%M:%S", time.gmtime(whole_seconds)) + f".{microseconds:06d}")

Execution time: 00:00:00.015098


In [None]:
# Shut down the Cluster object created for this notebook once all inserts and
# queries above are done.
cluster.shutdown()