In [1]:
%load_ext autoreload
%autoreload 2

# Food.com

In [2]:
from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.sparse

import src.preprocessing.preprocessing as preprocessing
import src.util as util
import src.io as io

In [3]:
# Configure parameters for processing dataset

USER_ID = "user_id"
ITEM_ID = "recipe_id"
RATING_ID = "rating"

MIN_RATING = 3
MIN_ITEMS_PER_USER = 3
MIN_USERS_PER_ITEM = 10

BASE_PATH = Path('../../data/CARS/Food_com/')
INPUT_FILENAME = BASE_PATH / 'raw/interactions.csv'

In [4]:
# These shouldn't be changed
USER_ID_OUT = "user"
ITEM_ID_OUT = "item"

In [5]:
df = pd.read_csv(INPUT_FILENAME, sep=",")
df

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [6]:
df = df[df[RATING_ID] >= MIN_RATING]
df

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132360,2002357020,82303,2018-12-05,5,Delicious quick thick chocolate sauce with ing...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [7]:
df["season"]  = df["date"].apply( preprocessing.date2season)
df["weekday"] = df["date"].apply( preprocessing.date2weekday)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["season"]  = df["date"].apply( preprocessing.date2season)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["weekday"] = df["date"].apply( preprocessing.date2weekday)


In [8]:
df.nunique()

user_id       186069
recipe_id     223590
date            6390
rating             3
review       1038247
season             4
weekday            7
dtype: int64

In [9]:
df['season'].value_counts()

winter    276624
spring    262600
summer    256158
autumn    249197
Name: season, dtype: int64

In [10]:
df['weekday'].value_counts(normalize=True)

Sunday       0.162454
Monday       0.161750
Tuesday      0.148487
Wednesday    0.139786
Thursday     0.134122
Saturday     0.130518
Friday       0.122883
Name: weekday, dtype: float64

In [11]:
# only keep meaningful context
dropColumns = ["rating", "review", "date"]
df.drop(columns=dropColumns, inplace = True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=dropColumns, inplace = True)


Unnamed: 0,user_id,recipe_id,season,weekday
0,38094,40893,winter,Monday
1,1293707,40893,winter,Wednesday
2,8937,44394,autumn,Sunday
3,126440,85009,winter,Saturday
4,57222,85009,autumn,Saturday
...,...,...,...,...
1132360,2002357020,82303,autumn,Wednesday
1132363,583662,386618,autumn,Tuesday
1132364,157126,78003,summer,Monday
1132365,53932,78003,winter,Sunday


Remove interactions with users or items that are not present enough in the dataset.

In [12]:
df = preprocessing.k_core(df, USER_ID, ITEM_ID, MIN_ITEMS_PER_USER, MIN_USERS_PER_ITEM)
df[USER_ID].nunique(), df[ITEM_ID].nunique()

(22178, 15086)

In [13]:
df, = util.remap_ids(df, col=ITEM_ID)
df, = util.remap_ids(df, col=USER_ID)
df

Unnamed: 0,user_id,recipe_id,season,weekday
31,19211,6280,autumn,Saturday
43,17684,6280,winter,Monday
44,12110,6280,spring,Tuesday
45,7184,6280,spring,Saturday
46,17837,6280,winter,Friday
...,...,...,...,...
1132309,895,7799,spring,Sunday
1132354,4100,7799,summer,Monday
1132355,2940,7799,autumn,Friday
1132356,757,7799,winter,Monday


## Inspect duplicates

### duplicates with same context

In [14]:
df.duplicated().value_counts()

False    388362
dtype: int64

### duplicates of user-item interactions

In [15]:
df.duplicated([USER_ID, ITEM_ID]).value_counts()

False    388362
dtype: int64

No duplicates -> don't consider retargetting.

In [16]:
# df.drop_duplicates([USER_ID, ITEM_ID], inplace=True)
# df.duplicated([USER_ID, ITEM_ID]).value_counts()

## Map to int
In each column, each unique value gets a unique number. Unknown values get the value 0. We make per context a dictionary with as values each unique value in < colName > and keys the associated integer.

In [17]:
for col in df.columns:
    if col in {USER_ID, ITEM_ID}:
        continue
    df[col], uniques = pd.factorize(df[col], na_sentinel = -1 )
    df[col] = df[col] + 1
     
    int2category_d = json.dumps({v+1: k for v, k in enumerate(uniques)})
    # open file for writing, "w" 
    f = open(BASE_PATH / f"int2category_{col}.json","w")

    # write json object to file
    f.write(int2category_d)

    # close file
    f.close()
    
    print(col)
    print( int2category_d)
    print( '--------------------------')
df

season
{"1": "autumn", "2": "winter", "3": "spring", "4": "summer"}
--------------------------
weekday
{"1": "Saturday", "2": "Monday", "3": "Tuesday", "4": "Friday", "5": "Wednesday", "6": "Thursday", "7": "Sunday"}
--------------------------


Unnamed: 0,user_id,recipe_id,season,weekday
31,19211,6280,1,1
43,17684,6280,2,2
44,12110,6280,3,3
45,7184,6280,3,1
46,17837,6280,2,4
...,...,...,...,...
1132309,895,7799,3,7
1132354,4100,7799,4,2
1132355,2940,7799,1,4
1132356,757,7799,2,2


In [18]:
dfContext = df.loc[:, ~df.columns.isin([USER_ID, ITEM_ID])]
nbrContextFeatures = dfContext.nunique()

In [19]:
print("\n".join([
    f"Entries: {len(df)}",
    f"Users: {df[USER_ID].nunique()}",
    f"Items: {df[ITEM_ID].nunique()}",
    f"Context variables: {len(df.columns)-2}",
    f"Context features:\n{nbrContextFeatures}"
]))

Entries: 388362
Users: 22178
Items: 15086
Context variables: 2
Context features:
season     4
weekday    7
dtype: int64


## Sparsity pattern per context

Check percentage of interactions with unknown context attribute per context

In [20]:
dfContext.apply( lambda x: sum(x == 0)/len(x))

season     0.0
weekday    0.0
dtype: float64

## save dataframe

In [21]:
df.rename(columns = {USER_ID: USER_ID_OUT, ITEM_ID: ITEM_ID_OUT}, inplace = True)
df.to_csv( BASE_PATH / f"interactions.csv", index = False)