In [1]:
%load_ext autoreload
%autoreload 2

# Frappe

In [2]:
from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.sparse

import src.preprocessing.preprocessing as preprocessing
import src.util as util
import src.io as io

In [3]:
# Configure parameters for processing dataset

# Names of the user and item identifier columns in the raw file.
USER_ID = "user"
ITEM_ID = "item"

# Thresholds handed to preprocessing.k_core further down.
# NOTE(review): presumably the minimum number of items a user / users an item must have
# to be kept -- confirm against the k_core implementation.
MIN_ITEMS_PER_USER = 3
MIN_USERS_PER_ITEM = 1

# Dataset directory, given relative to the working directory. The raw input is read from it
# and the generated JSON mappings and the interactions CSV are written into it.
BASE_PATH = Path('../../data/CARS/Mobile_Frappe/')
INPUT_FILENAME = BASE_PATH / 'raw/frappe.csv'

In [4]:
# These shouldn't be changed
# Column names the user/item identifiers get in the saved interactions file
# (the frame's columns are renamed to these right before it is written).
USER_ID_OUT = "user"
ITEM_ID_OUT = "item"

In [5]:
# Load the raw, tab-separated interaction log into a DataFrame and display it.
df = pd.read_csv(INPUT_FILENAME, delimiter="\t")
df

Unnamed: 0,user,item,cnt,daytime,weekday,isweekend,homework,cost,weather,country,city
0,0,0,1,morning,sunday,weekend,unknown,free,sunny,United States,0
1,1,1,7,afternoon,saturday,weekend,unknown,free,cloudy,Spain,0
2,2,2,6,evening,monday,workday,unknown,free,cloudy,Spain,369
3,3,3,1,sunset,thursday,workday,unknown,free,unknown,United States,1028
4,4,4,428,night,thursday,workday,home,free,sunny,Switzerland,147
...,...,...,...,...,...,...,...,...,...,...,...
96198,110,0,5,evening,sunday,weekend,unknown,free,sunny,United States,0
96199,37,16,101,sunset,sunday,weekend,unknown,free,cloudy,Canada,128
96200,181,33,243,afternoon,sunday,weekend,unknown,free,cloudy,Israel,454
96201,451,752,1,evening,sunday,weekend,unknown,free,sunny,United States,0


In [6]:
# Number of distinct values in every column of the raw frame.
df.nunique()

user          957
item         4082
cnt          1981
daytime         7
weekday         7
isweekend       2
homework        3
cost            2
weather         9
country        80
city          233
dtype: int64

In [7]:
# Relative frequency of each value in the 'homework' column.
df['homework'].value_counts(normalize=True)

unknown    0.786566
home       0.163935
work       0.049499
Name: homework, dtype: float64

In [8]:
# only keep meaningful context
# `cnt` and the context columns listed here are discarded; daytime, weekday and weather stay.
dropColumns = ["cnt", "isweekend", "homework", "cost", "country", "city"]
df = df.drop(columns=dropColumns)
df

Unnamed: 0,user,item,daytime,weekday,weather
0,0,0,morning,sunday,sunny
1,1,1,afternoon,saturday,cloudy
2,2,2,evening,monday,cloudy
3,3,3,sunset,thursday,unknown
4,4,4,night,thursday,sunny
...,...,...,...,...,...
96198,110,0,evening,sunday,sunny
96199,37,16,sunset,sunday,cloudy
96200,181,33,afternoon,sunday,cloudy
96201,451,752,evening,sunday,sunny


Remove interactions involving users or items that occur too rarely in the dataset (k-core filtering with the thresholds `MIN_ITEMS_PER_USER` and `MIN_USERS_PER_ITEM`).

In [9]:
# Filter the interactions with the project's k_core helper, using the thresholds from the config cell.
# NOTE(review): presumably drops users with fewer than MIN_ITEMS_PER_USER items and items with
# fewer than MIN_USERS_PER_ITEM users -- confirm against preprocessing.k_core.
df = preprocessing.k_core(df, USER_ID, ITEM_ID, MIN_ITEMS_PER_USER, MIN_USERS_PER_ITEM)
# Number of distinct users and items that remain after filtering.
df[USER_ID].nunique(), df[ITEM_ID].nunique()

(816, 4058)

In [10]:
# Re-index the item ids first, then the user ids, with the project's remap_ids helper.
# NOTE(review): presumably maps the ids onto a contiguous range -- confirm in src.util.
for id_col in (ITEM_ID, USER_ID):
    # remap_ids yields exactly one element here; unpack it back into df
    (df,) = util.remap_ids(df, col=id_col)
df

Unnamed: 0,user,item,daytime,weekday,weather
0,0,0,morning,sunday,sunny
1,1,1,afternoon,saturday,cloudy
2,2,2,evening,monday,cloudy
3,3,3,sunset,thursday,unknown
4,4,4,night,thursday,sunny
...,...,...,...,...,...
96198,109,0,evening,sunday,sunny
96199,37,16,sunset,sunday,cloudy
96200,180,33,afternoon,sunday,cloudy
96201,445,751,evening,sunday,sunny


## Non-frequent context values

Some context values occur very rarely in the dataset, so we set them to `unknown` so that they are not used in the model.

In [11]:
# Show how often each weather value occurs before the rare value is merged away.
print(df["weather"].value_counts())
# 'sleet' occurs only once, so fold it into the 'unknown' category.
# Plain (non-regex) replacement: only cells that are exactly "sleet" are rewritten, whereas a
# regex replacement would also rewrite the substring inside any longer string.
df["weather"] = df["weather"].replace("sleet", "unknown")

cloudy     40514
sunny      25069
unknown    12497
foggy       7653
rainy       6500
stormy      2811
drizzle      588
snowy        369
sleet          1
Name: weather, dtype: int64


Here we handle interactions for which a context variable is missing.
We first build a dictionary that maps each context variable to the value that marks an unknown entry in that column; those values are then replaced by NaN.

In [12]:
# Context columns = all columns except the two identifier columns.
cols = list(df.columns)
for id_col in (ITEM_ID, USER_ID):
    cols.remove(id_col)

# Every context column uses the same marker for a missing value.
symbolUnknownPerCol = dict.fromkeys(cols, 'unknown')


def _unknown_to_nan(column):
    """Return `column` with its unknown marker replaced by NaN.

    Columns without an entry in `symbolUnknownPerCol` (the id columns) fall back to the marker -1.
    """
    marker = symbolUnknownPerCol.get(column.name, -1)
    return column.replace(marker, np.nan)


# Unknown values are all set to np.nan
df = df.apply(_unknown_to_nan)

In [13]:
# Display the frame after the unknown markers were replaced by NaN.
df

Unnamed: 0,user,item,daytime,weekday,weather
0,0,0,morning,sunday,sunny
1,1,1,afternoon,saturday,cloudy
2,2,2,evening,monday,cloudy
3,3,3,sunset,thursday,
4,4,4,night,thursday,sunny
...,...,...,...,...,...
96198,109,0,evening,sunday,sunny
96199,37,16,sunset,sunday,cloudy
96200,180,33,afternoon,sunday,cloudy
96201,445,751,evening,sunday,sunny


## Inspect duplicates

### duplicates with same context

In [14]:
# Count rows that repeat an earlier row in every column (same user, item and context).
df.duplicated().value_counts()

False    89139
True      6863
dtype: int64

### duplicates of user-item interactions

In [15]:
# Count rows whose (user, item) pair already appeared in an earlier row, ignoring the context.
df.duplicated([USER_ID, ITEM_ID]).value_counts()

True     77353
False    18649
dtype: int64

There are more duplicated user-item pairs than unique ones. Hence we will consider the retargeting scenario for this dataset.

## Map to int
In each column, each unique value gets a unique number. Unknown values get the value 0. For each context variable we build a dictionary whose keys are the assigned integers and whose values are the corresponding unique values of that column.

In [16]:
# Encode every context column as integers: 0 = unknown (NaN), 1..n = the observed categories in
# order of first appearance. The integer -> category mapping of each column is saved as JSON.
for col in df.columns:
    if col in {USER_ID, ITEM_ID}:
        continue  # the identifier columns are not context variables

    # factorize marks missing values with -1 by default (the removed `na_sentinel` argument is
    # not needed); shifting all codes by one moves the missing marker to 0.
    codes, uniques = pd.factorize(df[col])
    df[col] = codes + 1

    int2category_d = json.dumps({code: category for code, category in enumerate(uniques, start=1)})

    # the context manager closes the file even if writing fails
    with open(BASE_PATH / f"int2category_{col}.json", "w") as f:
        f.write(int2category_d)

    print(col)
    print( int2category_d)
    print( '--------------------------')
df

daytime
{"1": "morning", "2": "afternoon", "3": "evening", "4": "sunset", "5": "night", "6": "noon", "7": "sunrise"}
--------------------------
weekday
{"1": "sunday", "2": "saturday", "3": "monday", "4": "thursday", "5": "wednesday", "6": "tuesday", "7": "friday"}
--------------------------
weather
{"1": "sunny", "2": "cloudy", "3": "foggy", "4": "rainy", "5": "stormy", "6": "drizzle", "7": "snowy"}
--------------------------


Unnamed: 0,user,item,daytime,weekday,weather
0,0,0,1,1,1
1,1,1,2,2,2
2,2,2,3,3,2
3,3,3,4,4,0
4,4,4,5,4,1
...,...,...,...,...,...
96198,109,0,3,1,1
96199,37,16,4,1,2
96200,180,33,2,1,2
96201,445,751,3,1,1


In [17]:
# Context-only frame: every column except the user and item identifiers.
contextColumns = [name for name in df.columns if name not in (USER_ID, ITEM_ID)]
dfContext = df[contextColumns]
# Distinct codes per context column (the 0 = unknown code is counted too where it occurs).
nbrContextFeatures = dfContext.nunique()

In [18]:
# Summary statistics of the processed dataset, printed one per line.
summaryLines = [
    f"Entries: {len(df)}",
    f"Users: {df[USER_ID].nunique()}",
    f"Items: {df[ITEM_ID].nunique()}",
    f"Context variables: {len(df.columns)-2}",
    # weather also counts the code 0 (nan) -> only 7 real features, not 8
    f"Context features:\n{nbrContextFeatures}",
]
print(*summaryLines, sep="\n")

Entries: 96002
Users: 816
Items: 4058
Context variables: 3
Context features:
daytime    7
weekday    7
weather    8
dtype: int64


## Sparsity pattern per context

Check the fraction of interactions with an unknown value, per context variable.

In [19]:
# Fraction of interactions whose context value is unknown (code 0), per context column.
# Vectorised comparison + mean instead of Python's built-in sum iterating over every Series.
(dfContext == 0).mean()

daytime    0.000000
weekday    0.000000
weather    0.130185
dtype: float64

## save dataframe

In [20]:
# Give the id columns their output names, then write the processed interactions without the index.
outputColumnNames = {USER_ID: USER_ID_OUT, ITEM_ID: ITEM_ID_OUT}
df = df.rename(columns=outputColumnNames)
df.to_csv(BASE_PATH / "interactions.csv", index=False)