## Part 1 - Generate Output 1

In [103]:
import ast
import platform

import pandas

def rescaling(num) -> float:
    """Linearly re-scale a rating with the mapping ``1 + num * (4 / 5)``.

    Under this mapping a rating of 0 becomes 1 and a rating of 5 stays 5.

    Args:
        num: Rating value to convert.

    Returns:
        The re-scaled rating, always as a float (``1.0`` for a rating of 0).
    """
    # The formula already yields 1 when num == 0, so no special case is needed;
    # starting from the float 1.0 guarantees a float result for every input,
    # which matches the declared return type.
    return 1.0 + (num * (4 / 5))

# Configuration: data directory and input/output file names.
# Every backslash of the Windows path is escaped explicitly ("\\data"); the
# previous "\d" was an invalid escape sequence that only produced the right
# path by accident and triggers a warning on recent Python versions.
data_dir_path: str = "C:\\deere\\data\\by_project\\leo foods\\" if "Windows" in platform.platform() else "/mnt/c/deere/data/by_project/leo foods/"
file_name_interations: str = "RAW_interactions.csv"
file_name_recipes: str = "RAW_recipes.csv"
file_name_output1: str = "output1.parquet"
file_name_output2: str = "output2.parquet"

# Create dataframes and drop the columns that are not needed downstream.
# Dropping right after the read (instead of `del` on an existing frame) keeps
# this step idempotent when the cell is re-run.
df_recipes: pandas.DataFrame = pandas.read_csv(data_dir_path + file_name_recipes).drop(
    columns=[
        "name",
        "minutes",
        "contributor_id",
        "submitted",
        "nutrition",
        "n_steps",
        "steps",
        "ingredients",
        "n_ingredients",
    ]
)
df_interations: pandas.DataFrame = pandas.read_csv(data_dir_path + file_name_interations).drop(
    columns=["date"]
)

# Merge dataframes: each interaction is joined to its recipe
# (interactions.recipe_id == recipes.id).
df_output1: pandas.DataFrame = pandas.merge(df_interations, df_recipes, left_on="recipe_id", right_on="id")

# Enhance data
# Treat null values in the description column. The result is assigned back
# explicitly: calling fillna(inplace=True) on a column selection is a chained
# assignment that pandas warns about and that may leave the frame unchanged.
df_output1["description"] = df_output1["description"].fillna("")
# Re-scale the rating with n -> 1 + n * (4/5), so 0 -> 1 and 5 -> 5.
df_output1["rating"] = df_output1["rating"].apply(rescaling)
# Convert the tags values into Python lists. ast.literal_eval only accepts
# literals, so (unlike eval) text coming from the CSV cannot execute code.
# NOTE(review): assumes every tags value is a string holding a list literal.
df_output1["tags"] = df_output1["tags"].apply(ast.literal_eval)

# Output to a parquet file
df_output1.to_parquet(data_dir_path + file_name_output1, engine='pyarrow', compression='gzip')

## Part 2 - Generate Output 2

In [112]:
# Row selectors for output 2.
# Keep only rows whose re-scaled rating is exactly 1 or exactly 5.
rating_filter = df_output1["rating"].eq(1) | df_output1["rating"].eq(5)
# The 1000 most frequent user_id / recipe_id values across all interactions;
# the ids end up in the index of each series.
srs_TOP1000_reviewer = df_interations["user_id"].value_counts().head(1000)
srs_TOP1000_reviewed = df_interations["recipe_id"].value_counts().head(1000)

# Apply the selectors: merging on each top-1000 index (merge's default inner
# join) keeps only rows whose user and recipe both appear in the top lists,
# then the output columns are selected.
output2_columns = ["user_id", "recipe_id", "rating", "review", "tags", "description"]
df_output2 = (
    df_output1[rating_filter]
    .merge(srs_TOP1000_reviewer, left_on="user_id", right_index=True)
    .merge(srs_TOP1000_reviewed, left_on="recipe_id", right_index=True)
    [output2_columns]
)

# Output to a parquet file
df_output2.to_parquet(data_dir_path + file_name_output2, engine='pyarrow', compression='gzip')

### Test and research

In [114]:
# Display the filtered output-2 frame to inspect the result
df_output2

Unnamed: 0,user_id,recipe_id,rating,review,tags,description
5666,126440,63828,5.0,Turned out great so moist and nice.,"[15-minutes-or-less, time-to-make, course, mai...",my #1 way to cook a pork tenderloin or a bonel...
5495,140132,63828,5.0,Came out perfectly cooked ..... tender adn juicy.,"[15-minutes-or-less, time-to-make, course, mai...",my #1 way to cook a pork tenderloin or a bonel...
5549,560491,63828,5.0,My oven only goes to 500 degrees so set it for...,"[15-minutes-or-less, time-to-make, course, mai...",my #1 way to cook a pork tenderloin or a bonel...
5419,195175,63828,5.0,"Fantastic!!! DH kept reading the recipe, as I ...","[15-minutes-or-less, time-to-make, course, mai...",my #1 way to cook a pork tenderloin or a bonel...
5399,95743,63828,1.0,Hate to buck the trend here but this didn't wo...,"[15-minutes-or-less, time-to-make, course, mai...",my #1 way to cook a pork tenderloin or a bonel...
...,...,...,...,...,...,...
1052095,294000,92647,5.0,We were out of milk and my kids wanted pancake...,"[lactose, 30-minutes-or-less, time-to-make, co...",my oldest daughter was severely allergic to mi...
1016192,468146,165254,5.0,Kimc12 my steaks came out perfect! I used the ...,"[30-minutes-or-less, time-to-make, course, mai...",i found this cooking guide on lindauer family ...
1016223,1800054678,165254,5.0,This is such an educational and fantastic reci...,"[30-minutes-or-less, time-to-make, course, mai...",i found this cooking guide on lindauer family ...
1016211,673444,165254,5.0,Great easy technique for steaks. Thanks for po...,"[30-minutes-or-less, time-to-make, course, mai...",i found this cooking guide on lindauer family ...
