# Content-based Filtering

Content-based filtering uses item features to recommend other items similar to what the user likes, based on their previous actions or explicit feedback.

In [1]:
import pandas as pd
import sklearn
# Record the library versions this notebook was executed with.
print(sklearn.__version__)
print(pd.__version__)
#from pandasgui import show
import pickle
# CBR and CBRDO are the recommender classes instantiated later in this notebook.
from CBR import CBR
from CBR import CBRDO
import numpy as np

1.0.2
1.3.5


In [2]:
# Reload imported modules before each cell runs, so edits to the imported
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load the data

In [3]:
# Spreadsheet holding one row per wine with its encoded features
# (relative path: the notebook must be run from its own directory).
ITEMS_PATH = r"../EDA/CBR_OneHot.xlsx"
represented_items = pd.read_excel(ITEMS_PATH)

In [4]:
# Show every column name of the loaded table.
list(represented_items.columns)

['id',
 'name_no_year',
 'price',
 'points',
 'tinto',
 'blanco',
 'tinto crianza',
 'rosado',
 'blanco espumoso brut nature reserva',
 'blanco espumoso brut nature gran reserva',
 'blanco espumoso brut nature',
 'blanco espumoso brut reserva',
 'blanco espumoso brut',
 'tinto reserva',
 'blanco fermentado en barrica',
 'tinto barrica',
 'rosado espumoso brut',
 'rosado espumoso brut reserva',
 'blanco espumoso brut gran reserva',
 'blanco dulce',
 'tinto gran reserva',
 'tinto roble',
 'rosado espumoso brut nature reserva',
 'rosado espumoso brut nature',
 'blanco espumoso semiseco',
 'blanco crianza',
 'blanco barrica',
 'blanco espumoso extra brut reserva',
 'blanco fortificado dulce solera',
 'espumoso',
 'garnacha',
 'xarel.lo',
 'macabeo',
 'parellada',
 'cariñena',
 'chardonnay',
 'syrah',
 'cabernet sauvignon',
 'garnacha blanca',
 'merlot',
 'pinot noir',
 'tempranillo',
 'trepat',
 'sauvignon blanc',
 'monastrell',
 'cabernet franc',
 'moscatel',
 'sumoll',
 'malvasía de sitg

## Feature transformation

In [5]:
# Number of wines that have no price recorded.
represented_items["price"].isna().sum()

145

In [6]:
from sklearn.preprocessing import normalize

# Price: missing prices are replaced by 0 before scaling. This is only an
# approximation, accepted because fewer than 150 of the roughly 2700 wines
# have no price. axis=0 scales the column as a whole.
represented_items.price = normalize(
    represented_items.price.fillna(0).to_numpy().reshape(-1, 1), axis=0
)

# Points: scaled the same way as price.
represented_items.points = normalize(
    represented_items.points.to_numpy().reshape(-1, 1), axis=0
)

## Simple CBR
### Cosine Similarity matrix

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Everything after the two identifier columns (id, name_no_year) is used as a feature.
features = represented_items.columns[2:].tolist()
features

['price',
 'points',
 'tinto',
 'blanco',
 'tinto crianza',
 'rosado',
 'blanco espumoso brut nature reserva',
 'blanco espumoso brut nature gran reserva',
 'blanco espumoso brut nature',
 'blanco espumoso brut reserva',
 'blanco espumoso brut',
 'tinto reserva',
 'blanco fermentado en barrica',
 'tinto barrica',
 'rosado espumoso brut',
 'rosado espumoso brut reserva',
 'blanco espumoso brut gran reserva',
 'blanco dulce',
 'tinto gran reserva',
 'tinto roble',
 'rosado espumoso brut nature reserva',
 'rosado espumoso brut nature',
 'blanco espumoso semiseco',
 'blanco crianza',
 'blanco barrica',
 'blanco espumoso extra brut reserva',
 'blanco fortificado dulce solera',
 'espumoso',
 'garnacha',
 'xarel.lo',
 'macabeo',
 'parellada',
 'cariñena',
 'chardonnay',
 'syrah',
 'cabernet sauvignon',
 'garnacha blanca',
 'merlot',
 'pinot noir',
 'tempranillo',
 'trepat',
 'sauvignon blanc',
 'monastrell',
 'cabernet franc',
 'moscatel',
 'sumoll',
 'malvasía de sitges',
 'corpulento',
 'sa

In [9]:
# Pairwise cosine similarity between all wines; the extra wine_id column
# maps each row of the matrix back to the wine it belongs to.
similarity = pd.DataFrame(
    cosine_similarity(represented_items[features])
).assign(wine_id=represented_items.id)

## Testing a single wine

In [10]:
def get_most_similar_wines(wine_id, items=None, similarity_matrix=None):
    """Rank every wine by its similarity to ``wine_id``.

    Parameters
    ----------
    wine_id : str
        Value of the ``id`` column identifying the query wine.
    items : pandas.DataFrame, optional
        Table with an ``id`` column. Defaults to the notebook-level
        ``represented_items``.
    similarity_matrix : pandas.DataFrame, optional
        Similarity table whose columns are labelled with the index labels of
        ``items`` and which also has a ``wine_id`` column. Defaults to the
        notebook-level ``similarity``. Passing it explicitly keeps the
        function usable after that global has been rebound by a later cell.

    Returns
    -------
    pandas.DataFrame
        Two columns: the similarity scores (named after the query wine's
        index label) and ``wine_id``, sorted from most to least similar.
        The query wine itself is part of the ranking.

    Raises
    ------
    IndexError
        If no row of ``items`` has the requested ``wine_id``.
    """
    if items is None:
        items = represented_items
    if similarity_matrix is None:
        similarity_matrix = similarity

    matches = items.index[items["id"] == wine_id]
    if len(matches) == 0:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that names the offending id.
        raise IndexError(f"wine_id {wine_id!r} not found in the item table")

    # With duplicated ids the first occurrence is used.
    wine_index = matches[0]
    return similarity_matrix[[wine_index, "wine_id"]].sort_values(
        by=wine_index, ascending=False
    )

In [11]:
# Ten best matches; the first row is the query wine itself (similarity 1.0).
get_most_similar_wines("martinet_bru_2018_t").head(10)

Unnamed: 0,1021,wine_id
1021,1.0,martinet_bru_2018_t
1149,0.857144,nassos_2018_t_1
947,0.776909,mas_de_la_rosa_2018_t_c
701,0.759077,priorat_idus_de_vallllach_2019_t
1334,0.759077,gotes_del_montsant_2018_t
997,0.759077,terrs_2018_t
414,0.759076,les_crestes_2019_t
781,0.759056,arbossar_2019_t_c_1
1083,0.75591,perpetual_2018_t_c
1069,0.741257,porrera_vi_de_vila_de_alvarez_duran_2018_t


In [12]:
# Same top-10 lookup for a second wine.
get_most_similar_wines("mestres_visol_2013_be_gr_bn").head(10)

Unnamed: 0,2263,wine_id
2263,1.0,mestres_visol_2013_be_gr_bn
2277,0.866027,pares_balt_blanca_cusine_2013_be_gr_bn
2282,0.816477,agusti_torello_mata_kripta_2013_be_gr_bn
1789,0.79386,giro_ribot_mare_2017_be_gr_bn
2161,0.771519,segura_viudas_reserva_heredad_2015_be_gr_br
2304,0.771517,vives_ambrs_tradicio_magnum_2012_besp_gr
2275,0.771494,juve__camps_blanc_de_noirs_magnum_2013_be_gr_br
2387,0.770134,montesquius_1918_magnum_2004_be_gr_bn
2356,0.770118,bassegues_2009_be_gr
2217,0.748459,mastinell_carpe_diem_2014_be_gr_bn


In [13]:
# Same top-10 lookup for a third wine.
get_most_similar_wines("jaspi_negre_2017_t").head(10)

Unnamed: 0,1675,wine_id
1675,1.0,jaspi_negre_2017_t
809,0.880709,can_blau_2019_t
1315,0.846159,vespres_2018_t
1458,0.815379,scala_dei_prior_2017_t_c
1422,0.800647,el_brindis_2018_t
1516,0.800647,vinya_gaso_2017_t_c
1511,0.800647,crossos_priorat_2017_t
787,0.800647,blau_2019_t
1538,0.800644,figuerals_garnatxa_2017_t
937,0.769238,clos_berenguer_min_2018_t


## Put it all together in a Python Class

The main objective is to have a class that can be saved into a pickle file to be loaded in the App. 

In *utils* we've created a class called `CBR` that fulfils this purpose.

In [14]:
# Build the recommender from the item table, the feature column names and "id".
# NOTE(review): argument meanings are inferred from this call -- confirm against the CBR class.
predictor = CBR(represented_items,features,"id")

In [15]:
# Rank the catalogue against one wine; the recorded output is indexed by id
# with a single "CS" score column (presumably cosine similarity -- confirm in CBR).
predictor.predict("purgatori_2017_t_ba")

Unnamed: 0_level_0,CS
id,Unnamed: 1_level_1
finca_sios_2017_t_c,0.716127
jaspi_negre_2017_t,0.716122
scala_dei_prior_2017_t_c,0.690078
can_blau_2019_t,0.670831
microvinificaciones_syrah_del_moret_2017_t,0.670824
...,...
ars_collecta_459_2010_be_gr_br,0.000032
planas_albareda_2019_be__br,0.000032
maset_vintage_2015_bn_r,0.000031
portium_br,0.000030


Save it into a pickle file

In [16]:
# Persist the recommender. The context manager closes the file even if
# pickle.dump raises, so no handle is leaked and no explicit close() is needed.
with open("CBR.pickle", "wb") as file_to_store:
    pickle.dump(predictor, file_to_store)

Load it again and check that everything works fine:

In [17]:
# Read the recommender back. The context manager closes the file even if
# unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by a trusted source such as this notebook.
with open("CBR.pickle", "rb") as file_to_read:
    loaded_predictor = pickle.load(file_to_read)

In [32]:
# Sanity check: the five best matches from the reloaded predictor agree with
# the similarity-table lookup recorded earlier for this wine.
loaded_predictor.predict("mestres_visol_2013_be_gr_bn").head(5).to_dict()

{'CS': {'pares_balt_blanca_cusine_2013_be_gr_bn': 0.8660265766385883,
  'agusti_torello_mata_kripta_2013_be_gr_bn': 0.8164773527845114,
  'giro_ribot_mare_2017_be_gr_bn': 0.7938602459771844,
  'segura_viudas_reserva_heredad_2015_be_gr_br': 0.7715188975541277,
  'vives_ambrs_tradicio_magnum_2012_besp_gr': 0.7715170860692356}}

In [19]:
# Second check on the reloaded predictor; the top scores match the earlier
# manual lookup for this wine.
loaded_predictor.predict("jaspi_negre_2017_t")

Unnamed: 0_level_0,CS
id,Unnamed: 1_level_1
can_blau_2019_t,0.880709
vespres_2018_t,0.846159
scala_dei_prior_2017_t_c,0.815379
el_brindis_2018_t,0.800647
vinya_gaso_2017_t_c,0.800647
...,...
blanc_giro_del_gorner_2020_b,0.000028
plana_den_fonoll_sauvignon_blanc_2019_b,0.000028
laqvarta_blanc_2019_b,0.000028
maset_vintage_2015_bn_r,0.000027


## Given a style, wine type and DO, calculate the most similar (if not identical) wine

### Load the data

In [20]:
# NOTE: this rebinds ITEMS_PATH and represented_items, replacing the table
# that was loaded for the first recommender.
ITEMS_PATH = r"../EDA/CBRDO_OneHot.xlsx"
represented_items = pd.read_excel(ITEMS_PATH)

## Simple CBR
### Cosine Similarity matrix

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Use every column after the first two as a feature.
features = represented_items.columns[2:].tolist()

In [23]:
# Pairwise cosine similarity for the newly loaded table, with a wine_id
# column that maps each row back to its wine.
similarity = pd.DataFrame(
    cosine_similarity(represented_items[features])
).assign(wine_id=represented_items.id)

In [24]:
all_wine_data = represented_items[features]

# Descriptors of the wine being looked for; each entry has to match a
# feature column name exactly to be taken into account.
example = ["espumoso","Cava  D.O.  / D.O.P."]

# Binary query vector: 1.0 where the feature was requested, 0.0 elsewhere.
wine_vector = np.array([1.0 if feature in example else 0.0 for feature in features])

# Cosine similarity between every wine (row) and the query vector.
# NOTE: this rebinds `similarity` from the DataFrame built above to a 1-D array.
num_similarity_vector = all_wine_data.values @ wine_vector
den_similarity_vector = np.linalg.norm(all_wine_data.values, axis=1) * np.linalg.norm(wine_vector)
# A zero denominator (an all-zero wine row, or a query that matched no
# feature) would produce NaN. argsort places NaN last, so the flip below
# would rank those wines first; they get a score of 0 instead.
similarity = np.divide(
    num_similarity_vector,
    den_similarity_vector,
    out=np.zeros_like(den_similarity_vector),
    where=den_similarity_vector > 0,
)

# Row positions ordered from most to least similar.
np.flip(np.argsort(similarity))

array([2428, 2707, 2043, ..., 1389, 1390,    0], dtype=int64)

In [25]:
# Build the DO-aware recommender and query it with a list of feature names
# (appellation, wine type, style) instead of a wine id.
cbr_do = CBRDO(represented_items,features)
example = ['Priorat  D.O.  Ca.  / D.O.P.',"tinto","corpulento"]
# NOTE(review): the recorded scores peak around 2e-4, far below the 0.7-0.9
# obtained with CBR above; possibly an unscaled numeric column dominates the
# vector norms -- confirm whether this table needs the same normalisation.
cbr_do.predict(example)

Unnamed: 0_level_0,CS
id,Unnamed: 1_level_1
gratallops_5_partides_2015_t,0.000223
scala_dei_sant_antoni_2016_t,0.000223
scala_dei_masdeu_2016_t,0.000223
de_muller_carinyena_2018_t_c,0.000223
de_sol_a_sol_2018_t,0.000223
...,...
sepo_2018_b,0.000000
or_peneds_2018_b,0.000000
parato_2018_be_r_bn,0.000000
mistinguett_2018_bn,0.000000


In [26]:
# Persist the DO-aware recommender. The context manager closes the file even
# if pickle.dump raises, so no handle is leaked and no explicit close() is needed.
with open("CBRDO.pickle", "wb") as file_to_store:
    pickle.dump(cbr_do, file_to_store)