# TF_IDF model

## Data Loading

In [4]:
from config import *
import pandas as pd
import numpy as np
import faiss
df = pd.read_csv(EMBEDED_FILEPATH)
embeddings = pd.read_csv("embeddings.csv")
display(df.head())

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,0.029798,0.017682,-0.022647,0.004759,0.012863,-0.043871,-0.042991,-0.003018,0.010959,0.011615
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.006305,-0.006726,0.027798,-0.004193,-0.048093,-0.010835,0.045763,0.016072,-0.026057,0.03303
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,-0.02076,0.017468,0.018908,-0.02481,0.016703,-0.013988,0.06241,0.003649,-0.027548,0.035012
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.01186,-0.006513,0.016246,0.003879,-0.013565,-0.035242,-0.014264,0.005369,-0.000167,0.021132


## Preprocess

In [22]:
if embeddings.isna().values.any():
    print("Uwaga: Wykryto wartości NaN w danych! Zamieniam je na 0.")
    embeddings = embeddings.fillna(0)

# walidacja czy nie ma NaNów w embeddingach

NameError: name 'embeddings' is not defined

In [1]:
# szybki, skrótowy preprocessing usuwanie NaNów z price oraz zostawienie tylko wartosci liczbowych
df_nums = df.select_dtypes(include=['number'])
df_nums = df_nums.dropna(subset=['price']).reset_index(drop=True)

df_nums = df_nums.astype('float32')
display(df_nums.head())

NameError: name 'df' is not defined

In [9]:
# to do function for normal preprocess
def fast_preprocess_df(df):
    df_nums = df.select_dtypes(include=['number'])
    df_nums = df_nums.dropna(subset=['price']).reset_index(drop=True)
    df_nums = df_nums.astype('float32')
    return df_nums

## Using FAISS to find nearest wine rewiews (building Search Model)

In [10]:
import faiss
import numpy as np
from config import *

def find_similar_items(query_vector: np.ndarray, k: int):
    D, I = index.search(query_vector.reshape(1, -1), k) # type: ignore
    return D, I

def get_wines_by_indices(indices: np.ndarray, df: pd.DataFrame):
    wines_reviews_df = df.iloc[indices.flatten()]
    descriptions = wines_reviews_df['description'].tolist()
    display(descriptions)
    return wines_reviews_df

def build_faiss_index_from_df_nums(df_nums: pd.DataFrame): 
    df_nums_array = df_nums.to_numpy()
    normy = np.linalg.norm(df_nums_array, axis=1)
    indeksy_zerowe = np.where(normy == 0)[0]

    if len(indeksy_zerowe) > 0:
        print(f"Uwaga: Znaleziono {len(indeksy_zerowe)} win z pustymi wektorami.")
        # Rozwiązanie: Dodajemy minimalną wartość (epsilon), żeby uniknąć dzielenia przez 0
        # Dzięki temu wektor będzie "prawie zerowy", ale normalizacja zadziała.
        df_nums_array[indeksy_zerowe] += 1e-10

    # Upewniamy się, że tablica jest C-contiguous dla FAISS
    df_nums_array = np.ascontiguousarray(df_nums_array)
    faiss.normalize_L2(df_nums_array)
    dimensions = df_nums_array.shape[1]
    
    index = faiss.IndexFlatL2(dimensions)  
    index.add(df_nums_array)   # type: ignore
    return index, df_nums_array

In [None]:
index, df_nums_array = build_faiss_index_from_df_nums(df_nums)
D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I, df))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

# Interpretacja odległości kosinusowej (ang. cosine similarity), u nas D to właśnie macierz podobieństw kosinusowych:
# 1.0 = identyczne (kąt 0 stopni).
# 0.0 = wektory prostopadłe (kąt 90 stopni).
# -1.0 = wektory przeciwne (kąt 180 stopni).

["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'Crisp with acidity and yeasty on the palate with a sourdough tartness, this is a polished bubbly with interesting flavors of limes and strawberries. The mousse is a bit rough. Drink now.',
 "This gorgeous, full-bodied and compelling wine is made from 84% Chardonnay and 16% Pinot Noir. It has a rich golden color, slightly earthy aromas, plus bread dough and white-peach scents that lead to very opulent, sophisticated and nicely aged flavors that range from toasted walnut to honeyed peach to white pepper. While it's already nine years old, this will easily age through 2022.",
 'It reveals aromas of rose, tilled earth, and smoky mineral. The bright palate doles out crushed red cherry and black raspberry accented with herbs, white pepper and clove. Juicy flavor

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
49094,US,Crisp with acidity and yeasty on the palate wi...,Cuvée 20,87,28.0,California,Sonoma County,Sonoma,,,...,-0.018066,-0.003351,-0.019404,-0.022084,0.003461,-0.022179,-0.010955,0.021783,0.019451,0.006446
15188,US,"This gorgeous, full-bodied and compelling wine...",J. Schram,95,120.0,California,North Coast,North Coast,Jim Gordon,@gordone_cellars,...,-0.043883,0.028384,-0.024704,-0.014055,-0.022899,0.002964,0.004055,0.020119,-0.02932,0.003382
2519,Italy,"It reveals aromas of rose, tilled earth, and s...",Cannubi,94,70.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,...,0.007958,-0.005746,-0.046806,0.01841,0.003055,0.077244,-0.012154,-0.010603,-0.03404,0.01191
91725,US,"Salmon-pink in color, this 100% Chambourcin ro...",,86,18.0,Virginia,Virginia,,Alexander Peartree,,...,-0.021176,0.005219,-0.00789,0.020017,-0.003114,-0.00539,-0.038563,-0.002174,0.008372,0.00443


Odległości: [[0.99999994 0.9999931  0.9999919  0.9999915  0.9999907 ]]
Indeksy: [[    1 49094 15188  2519 91725]]


## TF-IDF Bigrams

In [12]:
import numpy as np
import pandas as pd

embeddings_tf_idf_bigrams = np.load("embeddings_tf_idf_bigrams.npy")
embeddings_tf_idf_bigrams_df = pd.DataFrame(embeddings_tf_idf_bigrams)
display(embeddings_tf_idf_bigrams_df.head(), embeddings_tf_idf_bigrams_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.088711,-0.068887,-0.079682,-0.018129,-0.009653,-0.061104,0.007078,-0.007166,-0.035704,0.049989,...,0.00668,0.004052,0.009933,0.009689,-0.018292,0.003419,-0.022119,-0.020756,-0.007694,0.01215
1,0.154284,0.011388,0.158704,-0.078038,-0.090071,0.03702,0.055612,0.041271,-0.082306,-0.029474,...,-0.001607,0.007284,-0.00302,-0.009733,-0.022264,-0.001541,0.029729,0.006127,-0.003681,0.01889
2,0.065466,-0.094751,-0.029274,0.011576,-0.006152,0.008905,0.013507,-0.057787,0.006096,0.060042,...,0.014752,0.023071,0.005033,0.010739,-0.000433,-0.017879,0.004378,-0.004585,-0.01833,0.028156
3,0.051657,-0.056965,-0.044632,0.016064,0.039736,-0.021166,0.016663,-0.00441,-0.004401,-0.002667,...,-0.001929,0.034152,-0.014469,0.001856,-0.001094,-0.018169,-0.007891,-0.019679,-0.028118,-0.008796
4,0.045352,0.010833,-0.007543,0.011982,0.035781,0.040079,0.04908,-0.043692,0.01617,-0.02593,...,-0.008363,-0.020048,0.027488,0.009093,-0.023184,0.012048,0.001023,0.011309,-0.011669,-5e-06


(129971, 128)

In [13]:
from config import *
df_unchanged = pd.read_csv(CSV_FILEPATH, index_col=0)
embeddings_tf_idf_bigrams_df = pd.concat([df_unchanged, embeddings_tf_idf_bigrams_df], axis=1)

display(embeddings_tf_idf_bigrams_df.head(), embeddings_tf_idf_bigrams_df.shape)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,118,119,120,121,122,123,124,125,126,127
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,0.00668,0.004052,0.009933,0.009689,-0.018292,0.003419,-0.022119,-0.020756,-0.007694,0.01215
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,-0.001607,0.007284,-0.00302,-0.009733,-0.022264,-0.001541,0.029729,0.006127,-0.003681,0.01889
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,0.014752,0.023071,0.005033,0.010739,-0.000433,-0.017879,0.004378,-0.004585,-0.01833,0.028156
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,-0.001929,0.034152,-0.014469,0.001856,-0.001094,-0.018169,-0.007891,-0.019679,-0.028118,-0.008796
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.008363,-0.020048,0.027488,0.009093,-0.023184,0.012048,0.001023,0.011309,-0.011669,-5e-06


(129971, 141)

In [14]:
df_nums = fast_preprocess_df(embeddings_tf_idf_bigrams_df)
index, df_nums_array = build_faiss_index_from_df_nums(df_nums)
D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I, embeddings_tf_idf_bigrams_df))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'This sour, beery wine is for those who like a high acid flush with their meal. The very tart flavors focus on lemon and green apple.',
 'It reveals aromas of rose, tilled earth, and smoky mineral. The bright palate doles out crushed red cherry and black raspberry accented with herbs, white pepper and clove. Juicy flavors are balanced by vibrant energy and polished tannins. Drink 2018–2038.',
 'Admirable for its consistency from year to year, the B is always a safe choice. It adds a layer of creamy texture to bold passion fruit aromas and ripe citrusy flavors. Best consumed before the end of 2014.',
 'Underbrush, grilled porcini, red berry and grilled herb aromas lead the way. The bracing palate offers dried wild cherry, clove, cinnamon and powdered sage al

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,118,119,120,121,122,123,124,125,126,127
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,-0.001607,0.007284,-0.00302,-0.009733,-0.022264,-0.001541,0.029729,0.006127,-0.003681,0.01889
61407,US,"This sour, beery wine is for those who like a ...",Estate Grown,83,15.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.005935,0.010692,0.018393,0.021985,-0.004342,-0.016941,-0.014396,0.015956,-0.025171,0.025822
2519,Italy,"It reveals aromas of rose, tilled earth, and s...",Cannubi,94,70.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,...,-0.02787,0.013347,0.002587,0.000333,-0.006814,0.035137,-0.029173,-0.051599,0.022247,-0.012982
118185,New Zealand,Admirable for its consistency from year to yea...,Letter Series B,89,25.0,Marlborough,,,Joe Czerwinski,@JoeCz,...,-0.005749,0.019357,-0.023475,0.00463,0.003936,-0.000318,0.010141,0.005402,-0.005626,0.006455
99745,Italy,"Underbrush, grilled porcini, red berry and gri...",,89,50.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,...,-0.024756,0.035717,0.013911,0.045812,0.018889,0.019567,-0.056029,-0.060592,-9.6e-05,-0.039339


Odległości: [[0.0000000e+00 1.2087260e-05 1.2416787e-05 1.2642055e-05 1.3069872e-05]]
Indeksy: [[     1  61407   2519 118185  99745]]


# NLP Models

## all-MiniLM-L6-v2

In [3]:
import numpy as np
import pandas as pd

embeddings_MiniLM_L6_v2 = np.load("embeddings_all-MiniLM-L6-v2.npy")
embeddings_MiniLM_L6_v2_df = pd.DataFrame(embeddings_MiniLM_L6_v2)
display(embeddings_MiniLM_L6_v2_df.head(), embeddings_MiniLM_L6_v2_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.038659,-0.014399,0.068146,0.046163,0.00175,0.036808,0.007584,-0.111908,0.055691,-0.013243,...,-0.061012,-0.006681,-0.012244,-0.056224,0.107565,-0.024622,0.063502,0.003625,0.01293,-0.022718
1,0.011211,-0.084605,-0.087883,0.060633,0.009146,0.088504,0.074859,-0.055802,-0.050464,-0.051533,...,-0.052223,0.019734,-8.7e-05,-0.031741,-0.030476,0.003709,0.050478,-0.025587,0.098917,-0.0697
2,-0.005502,-0.069589,0.034681,0.028227,0.022181,0.06213,0.034878,-0.024281,0.042304,-0.118203,...,0.049469,0.015914,-0.007621,0.000781,0.036931,0.057761,0.029773,-0.0291,0.079111,-0.00787
3,0.020835,-0.019097,0.042748,0.034504,-0.002234,0.096514,0.025273,-0.102333,0.070755,-0.071472,...,0.037056,-0.038256,-0.035956,-0.041817,0.081227,0.050092,0.046464,0.018701,0.022758,-0.01697
4,-0.031719,0.061518,-0.030642,0.01542,-0.034111,0.030671,-0.032406,-0.051424,-0.018782,-0.085505,...,-0.017992,0.058915,-0.058342,-0.020586,0.073731,-0.035804,0.004446,0.005601,0.018019,-0.039419


(129971, 384)

In [6]:
from config import *
df_unchanged = pd.read_csv(CSV_FILEPATH, index_col=0)
df_with_MiniLM_L6_v2 = pd.concat([df_unchanged, embeddings_MiniLM_L6_v2_df], axis=1)

display(df_with_MiniLM_L6_v2.head(), df_with_MiniLM_L6_v2.shape)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,374,375,376,377,378,379,380,381,382,383
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,-0.061012,-0.006681,-0.012244,-0.056224,0.107565,-0.024622,0.063502,0.003625,0.01293,-0.022718
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,-0.052223,0.019734,-8.7e-05,-0.031741,-0.030476,0.003709,0.050478,-0.025587,0.098917,-0.0697
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,0.049469,0.015914,-0.007621,0.000781,0.036931,0.057761,0.029773,-0.0291,0.079111,-0.00787
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,0.037056,-0.038256,-0.035956,-0.041817,0.081227,0.050092,0.046464,0.018701,0.022758,-0.01697
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.017992,0.058915,-0.058342,-0.020586,0.073731,-0.035804,0.004446,0.005601,0.018019,-0.039419


(129971, 397)

Finding neareast descriptions

In [None]:
df_nums = fast_preprocess_df(df_with_MiniLM_L6_v2)
index, df_nums_array = build_faiss_index_from_df_nums(df_nums)
D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I, df_with_MiniLM_L6_v2))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

["Zesty orange peels and apple notes abound in this sprightly, mineral-toned Riesling. Off dry on the palate, yet racy and lean, it's a refreshing, easy quaffer with wide appeal.",
 "Warm whiffs of cinnamon and brioche accent aromas of black cherry and plum on this plush, fruit-forward Saperavi. Plummy and soft with black-fruit flavor, it's a straightforward, yet tasty wine. Finishes with a fringe of fine, delicate tannins.",
 "Ripe, pristine pineapple, mango and honeydew burst from this luscious semi-dry Riesling. It's juicy and lush, boasting a tropical fruit flavor, but also structured and sprightly, with a surprising mineral finish.",
 "Toasty and dark-fruited, Huber's 2007 Bombacher Sommerhalde R falls somewhere between the Wildenstein and Malterdinger Bienenberg in style. It's less herbal and more powerful than the former, but not as muscular as the latter. The tannins are still silky, yet there's great intensity, ending on a long, slightly charred note. Drink 2012–2020.",
 'If y

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,374,375,376,377,378,379,380,381,382,383
15,Germany,Zesty orange peels and apple notes abound in t...,Devon,87,24.0,Mosel,,,Anna Lee C. Iijima,,...,0.002431,0.006473,0.023692,-0.005938,0.071909,0.006903,0.05341,0.045247,0.092051,0.016326
95639,US,Warm whiffs of cinnamon and brioche accent aro...,,86,30.0,New York,Finger Lakes,Finger Lakes,Anna Lee C. Iijima,,...,-0.038991,-0.010259,0.020578,0.010205,0.008803,-0.058263,0.105315,-0.020112,0.003724,-0.050101
118462,US,"Ripe, pristine pineapple, mango and honeydew b...",Semi-Dry,89,15.0,New York,Finger Lakes,Finger Lakes,Anna Lee C. Iijima,,...,0.004632,0.00521,0.03401,-0.000977,0.016436,0.07721,0.081363,-0.044072,0.043205,-0.014012
100047,Germany,"Toasty and dark-fruited, Huber's 2007 Bombache...",Bombacher Sommerhalde R Trocken,89,59.0,Baden,,,Joe Czerwinski,@JoeCz,...,-0.03943,0.023724,-0.054245,-0.014184,-0.036665,-0.074978,0.094981,-0.083312,0.017913,-0.027466
5561,Italy,"If you love Italian whites, Confini is definit...",Confini,90,37.0,Northeastern Italy,Venezia Giulia,,,,...,0.04794,-0.086918,-0.02743,0.066182,0.063675,0.065997,0.045509,0.026031,-0.002397,-0.033054


Odległości: [[0.0000000e+00 0.0000000e+00 4.9411017e-05 5.7736434e-05 6.5522923e-05]]
Indeksy: [[    15  95639 118462 100047   5561]]


## all-mpnet-base_v2

In [16]:
embeddings_mpnet_base_v2 = np.load("embeddings_all-mpnet-base-v2.npy")
embeddings_mpnet_base_v2_df = pd.DataFrame(embeddings_mpnet_base_v2)
display(embeddings_mpnet_base_v2_df.head(), embeddings_mpnet_base_v2_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.036825,0.040504,-0.049127,0.01206,-0.017809,0.04519,-0.047158,0.027785,0.037616,0.005221,...,-0.018225,-0.022451,-0.030674,0.030395,-0.036411,0.07518,-0.02167,0.017032,-0.024503,-0.021728
1,0.0154,0.07619,-0.017931,0.004965,-0.076234,0.021419,-0.074819,0.003389,0.057341,-0.003475,...,-0.071278,0.033845,-0.002066,0.018129,-0.004691,0.028105,-0.04277,0.010317,0.063025,-0.026926
2,0.032447,0.012291,-0.034564,-0.020621,-0.04829,0.031377,-0.082451,0.022635,0.002648,0.024966,...,-0.126703,0.035472,0.029064,0.006745,-0.012356,0.030802,-0.012589,0.024285,0.023172,-0.0172
3,0.076041,0.044183,-0.017158,-0.041186,-0.030582,0.051051,-0.076459,0.011513,0.011916,0.014672,...,-0.063876,-0.010313,0.004829,0.009389,-0.011263,0.035524,-0.014419,0.030267,-0.036529,-0.043187
4,0.011352,0.057223,-0.023293,0.022438,-0.078027,0.046609,-0.085686,-0.006774,0.038266,-0.01428,...,-0.058865,0.031108,0.006709,0.020604,-0.017219,0.023554,-0.039825,0.021364,0.025453,-0.0223


(129971, 768)

In [17]:
df_unchanged = pd.read_csv(CSV_FILEPATH, index_col=0)
df_with_mpnet_base_v2 = pd.concat([df_unchanged, embeddings_mpnet_base_v2_df], axis=1)

display(df_with_mpnet_base_v2.head(), df_with_mpnet_base_v2.shape)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,758,759,760,761,762,763,764,765,766,767
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,-0.018225,-0.022451,-0.030674,0.030395,-0.036411,0.07518,-0.02167,0.017032,-0.024503,-0.021728
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,-0.071278,0.033845,-0.002066,0.018129,-0.004691,0.028105,-0.04277,0.010317,0.063025,-0.026926
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.126703,0.035472,0.029064,0.006745,-0.012356,0.030802,-0.012589,0.024285,0.023172,-0.0172
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,-0.063876,-0.010313,0.004829,0.009389,-0.011263,0.035524,-0.014419,0.030267,-0.036529,-0.043187
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.058865,0.031108,0.006709,0.020604,-0.017219,0.023554,-0.039825,0.021364,0.025453,-0.0223


(129971, 781)

In [21]:
df_nums = fast_preprocess_df(df_with_mpnet_base_v2)
index, df_nums_array = build_faiss_index_from_df_nums(df_nums)
D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I, df_with_mpnet_base_v2))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'This well-priced blend of Sangiovese (70%) and Merlot offers friendly and approachable aromas of ripe berry fruit, blackberry jam, spice and light touches of leather and tobacco. The fresh finish makes it a perfect Tuscan red to pair with ravioli stuffed with minced meat or cheese.',
 "Brisk in acidity, this everyday Sauvignon Blanc has ripe, bright flavors of tangerines, peaches and pineapples. It's a good by-the-glass wine for restaurants.",
 'Flowers, melon and stone fruit inform the aromas and flavors of this unctuous, weighty and viscous wine, smooth with a lingering taste of ripe peach on the finish. Tiny percentages of Riesling and Gewürztraminer figure into the mix, too.',
 "Black plum, blackberries, lilac and vanilla show on the easy if simple nos

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,758,759,760,761,762,763,764,765,766,767
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,-0.071278,0.033845,-0.002066,0.018129,-0.004691,0.028105,-0.04277,0.010317,0.063025,-0.026926
34375,Italy,This well-priced blend of Sangiovese (70%) and...,Badiola,88,15.0,Tuscany,Toscana,,,,...,-0.083484,4e-06,0.017295,0.000791,-0.027961,0.034929,-0.026091,-0.005245,0.062641,-0.009963
36051,US,"Brisk in acidity, this everyday Sauvignon Blan...",,86,19.0,California,Santa Ynez Valley,Central Coast,,,...,-0.052993,-0.011641,0.053492,0.028766,-0.026909,0.012359,-0.057818,0.014286,0.038461,-0.058916
50218,US,"Flowers, melon and stone fruit inform the arom...",The Party Starter,87,32.0,California,Sonoma Valley,Sonoma,Virginie Boone,@vboone,...,-0.082196,0.009864,0.02264,0.008464,-0.017869,0.062644,-0.047406,-0.007593,0.044636,-0.054881
43229,US,"Black plum, blackberries, lilac and vanilla sh...",,87,32.0,California,Santa Barbara County,Central Coast,Matt Kettmann,@mattkettmann,...,-0.078129,0.002851,0.014643,0.000296,-0.013496,0.049669,-0.015979,0.024735,-0.040597,-0.03514


Odległości: [[0.0000000e+00 5.2505053e-05 5.2735049e-05 5.2821702e-05 5.4611606e-05]]
Indeksy: [[    1 34375 36051 50218 43229]]


## OpenAI Embeddings API

In [2]:
import numpy as np
import pandas as pd

embeddings_Open_AI_api = np.load("embeddings_open_ai_api.npy")
embeddings_Open_AI_api_df = pd.DataFrame(embeddings_Open_AI_api)
display(embeddings_Open_AI_api_df.head(), embeddings_Open_AI_api_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,-0.015988,0.034856,0.009049,0.008968,-0.024491,0.006393,0.001542,-0.011916,0.042552,0.031181,...,-0.001461,0.019401,0.028897,-0.006622,0.001174,-0.000673,0.039126,-0.018992,0.00342,-0.013108
1,-0.033432,0.022581,-0.037627,0.000395,0.022412,-0.025581,0.001617,0.039071,0.00324,-0.032033,...,0.001768,-0.019107,0.01966,-0.013772,0.004644,-0.00472,0.022807,0.020652,0.022175,0.00421
2,-0.012219,-0.017575,-0.022989,0.001681,0.038211,-0.024461,0.023117,0.023105,-0.017772,-0.024461,...,-0.001894,0.00917,0.025412,-0.011315,0.012045,-0.004472,0.030258,-0.012474,0.04985,-0.005594
3,0.007898,0.022917,0.013069,0.018005,-0.013612,-0.008882,0.014614,0.032269,0.022784,0.013685,...,0.019815,-0.011772,0.013178,-0.033355,0.007023,0.006148,0.018777,0.005071,0.021155,0.002417
4,-0.037397,-0.012672,-0.015235,-0.011752,-0.026789,-0.043941,0.043915,0.020232,0.00447,-0.002678,...,0.042356,-0.010007,0.014762,-0.000711,-0.02399,-0.028092,0.010845,0.013056,0.004186,0.013497


(129971, 1536)

In [4]:
df_unchanged = pd.read_csv(CSV_FILEPATH, index_col=0)
embeddings_Open_AI_api_df = pd.concat([df_unchanged, embeddings_Open_AI_api_df], axis=1)

display(embeddings_Open_AI_api_df.head(), embeddings_Open_AI_api_df.shape)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,-0.001461,0.019401,0.028897,-0.006622,0.001174,-0.000673,0.039126,-0.018992,0.00342,-0.013108
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.001768,-0.019107,0.01966,-0.013772,0.004644,-0.00472,0.022807,0.020652,0.022175,0.00421
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.001894,0.00917,0.025412,-0.011315,0.012045,-0.004472,0.030258,-0.012474,0.04985,-0.005594
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,0.019815,-0.011772,0.013178,-0.033355,0.007023,0.006148,0.018777,0.005071,0.021155,0.002417
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,0.042356,-0.010007,0.014762,-0.000711,-0.02399,-0.028092,0.010845,0.013056,0.004186,0.013497


(129971, 1549)

In [9]:
df_nums = fast_preprocess_df(embeddings_Open_AI_api_df)
index, df_nums_array = build_faiss_index_from_df_nums(df_nums)
D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I, embeddings_Open_AI_api_df))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 "A strongly spicy, rich wine, this has great concentration and an intense, dense texture.There is weight with plenty of ripe tropical fruits to contrast pronounced black pepper and bitter almond character. It's still young, so don't drink before 2015. Screwcap.",
 'This wine was fermented and aged in French oak, 25% of it new. It shows a touch of nutmeg dotted along a light-bodied swath of pear, apple, peach and vanilla. The wine is deliciously simple and focused, with a short finish.',
 'Passion fruit, lime and feline aromas give the bouquet crispness and varietal snap. The palate is tangy, lean and somewhat dilute, with pithy grapefruit, lime and tarragon flavors. The finish shows modest cut but regular, weakly defined flavors.',
 'This is a serious, tann

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.001768,-0.019107,0.01966,-0.013772,0.004644,-0.00472,0.022807,0.020652,0.022175,0.00421
280,Austria,"A strongly spicy, rich wine, this has great co...",Schiefer Reserve,92,24.0,Kremstal,,,Roger Voss,@vossroger,...,0.016131,-0.000321,0.005764,0.004916,0.003226,-0.016202,-0.019832,0.004128,0.048677,-0.029913
91151,US,This wine was fermented and aged in French oak...,Golden Heart,88,45.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,...,0.008836,-0.0082,-0.007888,-0.014133,-0.000328,0.015705,-0.001959,-0.043766,0.061426,-0.013941
63051,Chile,"Passion fruit, lime and feline aromas give the...",Reserva,85,12.0,Casablanca Valley,,,Michael Schachner,@wineschach,...,0.010583,0.022575,0.022952,-0.002591,0.008267,-0.002383,0.013336,0.026623,0.005091,-0.004395
33610,France,"This is a serious, tannic wine that will need ...",,91,,Bordeaux,Saint-Émilion,,Roger Voss,@vossroger,...,0.009623,-0.005685,0.012036,-0.004364,-0.004854,-0.01356,0.001653,0.033837,0.002424,-0.008969


Odległości: [[0.0000000e+00 8.0647325e-05 8.5327854e-05 8.5630549e-05 8.7048000e-05]]
Indeksy: [[    1   280 91151 63051 33610]]
