In [1]:
%load_ext autoreload
%autoreload 2

# TripAdvisor

In [2]:
from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.sparse

import src.preprocessing.preprocessing as preprocessing
import src.util as util
import src.io as io

In [3]:
# Configure parameters for processing dataset

USER_ID = "UserID"
ITEM_ID = "ItemID"
RATING_ID = "Rating"

MIN_RATING = 3
MIN_ITEMS_PER_USER = 3
MIN_USERS_PER_ITEM = 1

BASE_PATH = Path('../../data/CARS/TripAdvisor/')
INPUT_FILENAME = BASE_PATH / 'raw/TripAdvisor.csv'

In [4]:
# These shouldn't be changed
USER_ID_OUT = "user"
ITEM_ID_OUT = "item"

In [5]:
df = pd.read_csv(INPUT_FILENAME, sep=",")
df

Unnamed: 0,UserID,ItemID,Rating,UserState,UserTimeZone,ItemCity,ItemState,ItemTimeZone,TripType
0,5C28F393B23BB894523AE7126A7AE445,219668,5,AK,AK,GREENSBORO,NC,EASTERN,SOLO
1,3FA27F6E8AC712A82C69C4EDD8B912CC,223860,5,AK,AK,PHOENIX,AZ,MOUNTAIN,SOLO
2,B99CFBB5411EDC8881D13B7A4B313ADA,75680,5,AK,AK,ANAHEIM,CA,PACIFIC,FAMILY
3,3FA27F6E8AC712A82C69C4EDD8B912CC,224783,5,AK,AK,SEATTLE,WA,PACIFIC,SOLO
4,7CEFF5C32BA1F3B186E7838C7D3FE25E,222984,5,AK,AK,MIAMI,MI,EASTERN,COUPLES
...,...,...,...,...,...,...,...,...,...
14170,C91F0C89C6728994C901BD27C8AB41C5,85377,1,WA,PACIFIC,AURORA,CO,MOUNTAIN,SOLO
14171,23BF577E80F1020C8D2F62EF1C688D97,1200603,1,WI,CENTRAL,SEATTLE,WA,PACIFIC,SOLO
14172,7DA3B06F0778389371CB179A388D4CFE,72993,1,WI,CENTRAL,SALT LAKE CITY,UT,MOUNTAIN,FRIENDS
14173,01ED60735C3F50974C5F774A648FD8B7,1236714,1,WI,CENTRAL,CINCINNATI,OH,EASTERN,FAMILY


In [6]:
df = df[df[RATING_ID] >= MIN_RATING]
df

Unnamed: 0,UserID,ItemID,Rating,UserState,UserTimeZone,ItemCity,ItemState,ItemTimeZone,TripType
0,5C28F393B23BB894523AE7126A7AE445,219668,5,AK,AK,GREENSBORO,NC,EASTERN,SOLO
1,3FA27F6E8AC712A82C69C4EDD8B912CC,223860,5,AK,AK,PHOENIX,AZ,MOUNTAIN,SOLO
2,B99CFBB5411EDC8881D13B7A4B313ADA,75680,5,AK,AK,ANAHEIM,CA,PACIFIC,FAMILY
3,3FA27F6E8AC712A82C69C4EDD8B912CC,224783,5,AK,AK,SEATTLE,WA,PACIFIC,SOLO
4,7CEFF5C32BA1F3B186E7838C7D3FE25E,222984,5,AK,AK,MIAMI,MI,EASTERN,COUPLES
...,...,...,...,...,...,...,...,...,...
13271,84CBA058D71F9F9741BE0E1A390D9C04,75097,3,WV,EASTERN,TEMPE,AZ,MOUNTAIN,COUPLES
13272,1D9F982DCCCFC080D22F0CC445BD21E1,260963,3,WV,EASTERN,VIRGINIABEACH,VA,EASTERN,COUPLES
13273,21442E7C01B7AE57B130B670565806EF,120100,3,WY,MOUNTAIN,CENTENNIAL,CO,MOUNTAIN,FAMILY
13274,399D9B03E2255EBEF6C1E457F1E3E881,1888977,3,WY,MOUNTAIN,NEWYORK,NY,EASTERN,FRIENDS


In [7]:
df.nunique()

UserID          2371
ItemID          2223
Rating             3
UserState         79
UserTimeZone      35
ItemCity         107
ItemState         31
ItemTimeZone       4
TripType           5
dtype: int64

In [8]:
df['UserState'].value_counts(normalize=True)

CA            0.132043
TX            0.072462
NY            0.057096
FL            0.048132
IL            0.040600
                ...   
SPAIN         0.000377
VENEZUELA     0.000377
VT            0.000377
SWEDEN        0.000301
HINTERLAND    0.000301
Name: UserState, Length: 79, dtype: float64

In [9]:
df['TripType'].value_counts(normalize=True)

COUPLES     0.332103
SOLO        0.211509
FAMILY      0.206463
BUSINESS    0.196897
FRIENDS     0.053028
Name: TripType, dtype: float64

In [10]:
# only keep meaningful context
dropColumns = ["Rating", "ItemCity", "ItemState", "ItemTimeZone", "UserTimeZone"]
df.drop(columns=dropColumns, inplace = True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=dropColumns, inplace = True)


Unnamed: 0,UserID,ItemID,UserState,TripType
0,5C28F393B23BB894523AE7126A7AE445,219668,AK,SOLO
1,3FA27F6E8AC712A82C69C4EDD8B912CC,223860,AK,SOLO
2,B99CFBB5411EDC8881D13B7A4B313ADA,75680,AK,FAMILY
3,3FA27F6E8AC712A82C69C4EDD8B912CC,224783,AK,SOLO
4,7CEFF5C32BA1F3B186E7838C7D3FE25E,222984,AK,COUPLES
...,...,...,...,...
13271,84CBA058D71F9F9741BE0E1A390D9C04,75097,WV,COUPLES
13272,1D9F982DCCCFC080D22F0CC445BD21E1,260963,WV,COUPLES
13273,21442E7C01B7AE57B130B670565806EF,120100,WY,FAMILY
13274,399D9B03E2255EBEF6C1E457F1E3E881,1888977,WY,FRIENDS


Remove interactions with users or items that are not present enough in the dataset.

In [11]:
df = preprocessing.k_core(df, USER_ID, ITEM_ID, MIN_ITEMS_PER_USER, MIN_USERS_PER_ITEM)
df[USER_ID].nunique(), df[ITEM_ID].nunique()

(2362, 2221)

In [12]:
df, = util.remap_ids(df, col=ITEM_ID)
df, = util.remap_ids(df, col=USER_ID)
df

Unnamed: 0,UserID,ItemID,UserState,TripType
0,1899,1865,AK,SOLO
1,466,721,AK,SOLO
2,1992,451,AK,FAMILY
3,466,1016,AK,SOLO
4,1441,414,AK,COUPLES
...,...,...,...,...
13271,936,341,WV,COUPLES
13272,1545,1936,WV,COUPLES
13273,399,1556,WY,FAMILY
13274,918,1380,WY,FRIENDS


## Non-frequent context values

Some context values are not much present in the dataset, so we set them on unknown such that we do not use them in the model

In [13]:
state_counts = df["UserState"].value_counts()
state_counts

CA            1753
TX             962
NY             758
FL             637
IL             539
              ... 
SPAIN            5
VENEZUELA        5
VT               5
SWEDEN           4
HINTERLAND       4
Name: UserState, Length: 79, dtype: int64

In [14]:
# Dropping <= 50 leads to sparsity of:
# UserState    0.053251

# infrequent_states = state_counts[state_counts <= 50]
# infrequent_states

In [15]:
# df.loc[df['UserState'].isin(infrequent_states.index), 'UserState'] = np.nan
# df["UserState"].value_counts()

## Inspect duplicates

### duplicates with same context

In [16]:
df.duplicated().value_counts()

False    12975
True       283
dtype: int64

### duplicates of user-item interactions

In [17]:
df.duplicated([USER_ID, ITEM_ID]).value_counts()

False    12836
True       422
dtype: int64

Very few duplicates -> drop them and don't consider retargetting.

In [18]:
df.drop_duplicates([USER_ID, ITEM_ID], inplace=True)
df.duplicated([USER_ID, ITEM_ID]).value_counts()

False    12836
dtype: int64

## Map to int
In each column, each unique value gets a unique number. Unknown values get the value 0. We make per context a dictionary with as values each unique value in < colName > and keys the associated integer.

In [19]:
for col in df.columns:
    if col in {USER_ID, ITEM_ID}:
        continue
    df[col], uniques = pd.factorize(df[col], na_sentinel = -1 )
    df[col] = df[col] + 1
     
    int2category_d = json.dumps({v+1: k for v, k in enumerate(uniques)})
    # open file for writing, "w" 
    f = open(BASE_PATH / f"int2category_{col}.json","w")

    # write json object to file
    f.write(int2category_d)

    # close file
    f.close()
    
    print(col)
    print( int2category_d)
    print( '--------------------------')
df

UserState
{"1": "AK", "2": "AL", "3": "AR", "4": "ARGENTINA", "5": "AUSTRALIA", "6": "AUSTRIA", "7": "AZ", "8": "BRAZIL", "9": "BULGARIA", "10": "CA", "11": "CANADA", "12": "CHILE", "13": "CHINA", "14": "CO", "15": "CT", "16": "DC", "17": "DE", "18": "DENMARK", "19": "FL", "20": "FRANCE", "21": "GA", "22": "GERMANY", "23": "HI", "24": "HINTERLAND", "25": "IA", "26": "ID", "27": "IL", "28": "IN", "29": "INDIA", "30": "IRELAND", "31": "ISRAEL", "32": "ITALY", "33": "JAPAN", "34": "KS", "35": "KY", "36": "LA", "37": "MA", "38": "MD", "39": "ME", "40": "MEXICO", "41": "MI", "42": "MN", "43": "MO", "44": "MS", "45": "MT", "46": "NB", "47": "NC", "48": "NETHERLANDS", "49": "NEWZEALAND", "50": "NH", "51": "NJ", "52": "NM", "53": "NORWAY", "54": "NV", "55": "NW", "56": "NY", "57": "OH", "58": "OK", "59": "OR", "60": "PA", "61": "PR", "62": "RI", "63": "SC", "64": "SD", "65": "SINGAPORE", "66": "SPAIN", "67": "SWEDEN", "68": "SWITZERLAND", "69": "TN", "70": "TX", "71": "UK", "72": "UT", "73": "

Unnamed: 0,UserID,ItemID,UserState,TripType
0,1899,1865,1,1
1,466,721,1,1
2,1992,451,1,2
3,466,1016,1,1
4,1441,414,1,3
...,...,...,...,...
13271,936,341,78,3
13272,1545,1936,78,3
13273,399,1556,79,2
13274,918,1380,79,5


In [20]:
dfContext = df.loc[:, ~df.columns.isin([USER_ID, ITEM_ID])]
nbrContextFeatures = dfContext.nunique()

In [21]:
print("\n".join([
    f"Entries: {len(df)}",
    f"Users: {df[USER_ID].nunique()}",
    f"Items: {df[ITEM_ID].nunique()}",
    f"Context variables: {len(df.columns)-2}",
    f"Context features:\n{nbrContextFeatures}"
]))

Entries: 12836
Users: 2362
Items: 2221
Context variables: 2
Context features:
UserState    79
TripType      5
dtype: int64


## Sparsity pattern per context

Check percentage of interactions with unknown context attribute per context

In [22]:
dfContext.apply( lambda x: sum(x == 0)/len(x))

UserState    0.0
TripType     0.0
dtype: float64

## save dataframe

In [23]:
df.rename(columns = {USER_ID: USER_ID_OUT, ITEM_ID: ITEM_ID_OUT}, inplace = True)
df.to_csv( BASE_PATH / f"interactions.csv", index = False)