# Data Loading

In [None]:
from config import *
import pandas as pd
import numpy as np
import faiss
df = pd.read_csv(EMBEDED_FILEPATH)
embeddings = pd.read_csv("embeddings.csv")
display(df.head())

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,0.029798,0.017682,-0.022647,0.004759,0.012863,-0.043871,-0.042991,-0.003018,0.010959,0.011615
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.006305,-0.006726,0.027798,-0.004193,-0.048093,-0.010835,0.045763,0.016072,-0.026057,0.03303
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,-0.02076,0.017468,0.018908,-0.02481,0.016703,-0.013988,0.06241,0.003649,-0.027548,0.035012
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,-0.01186,-0.006513,0.016246,0.003879,-0.013565,-0.035242,-0.014264,0.005369,-0.000167,0.021132


In [None]:
if embeddings.isna().values.any():
    print("Uwaga: Wykryto wartości NaN w danych! Zamieniam je na 0.")
    embeddings = embeddings.fillna(0)

# walidacja czy nie ma NaNów w embeddingach

In [15]:
# szybki, skrótowy preprocessing usuwanie NaNów z price oraz zostawienie tylko wartosci liczbowych
df_nums = df.select_dtypes(include=['number'])
df_nums = df_nums.dropna(subset=['price']).reset_index(drop=True)

df_nums = df_nums.astype('float32')
display(df_nums.head())

Unnamed: 0,points,price,svd_0,svd_1,svd_2,svd_3,svd_4,svd_5,svd_6,svd_7,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
0,87.0,15.0,0.237418,-0.041084,-0.201485,0.149151,0.033628,-0.056647,-0.055995,0.03793,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
1,87.0,14.0,0.079108,-0.101362,0.064159,-0.020764,0.00891,0.008126,-0.034205,-0.061478,...,-0.006305,-0.006726,0.027798,-0.004193,-0.048093,-0.010835,0.045763,0.016072,-0.026057,0.03303
2,87.0,13.0,0.108375,-0.031683,0.125713,-0.06914,-0.066465,-0.033419,-0.024513,0.010203,...,-0.02076,0.017468,0.018908,-0.02481,0.016703,-0.013988,0.06241,0.003649,-0.027548,0.035012
3,87.0,65.0,0.056858,0.016821,0.003338,-0.074715,0.016867,-0.062519,-0.035536,-0.030929,...,-0.01186,-0.006513,0.016246,0.003879,-0.013565,-0.035242,-0.014264,0.005369,-0.000167,0.021132
4,87.0,15.0,0.157453,0.028841,0.048381,-0.05925,0.042049,-0.104221,-0.122209,0.006445,...,-0.009619,0.016572,-0.021579,-0.031811,0.020294,0.000683,0.02208,-0.000966,0.030221,-0.029566


## Using FAISS to find nearest wine rewiews (building Search Model)

In [28]:
def find_similar_items(query_vector: np.ndarray, k: int):
    D, I = index.search(query_vector.reshape(1, -1), k) # type: ignore
    return D, I

def get_wines_by_indices(indices: np.ndarray):
    wines_reviews_df = df.iloc[indices.flatten()]
    descriptions = wines_reviews_df['description'].tolist()
    display(descriptions)
    return wines_reviews_df

In [30]:
import numpy as np
import faiss
from config import *

df_nums_array = df_nums.to_numpy()
normy = np.linalg.norm(df_nums_array, axis=1)
indeksy_zerowe = np.where(normy == 0)[0]

if len(indeksy_zerowe) > 0:
    print(f"Uwaga: Znaleziono {len(indeksy_zerowe)} win z pustymi wektorami.")
    # Rozwiązanie: Dodajemy minimalną wartość (epsilon), żeby uniknąć dzielenia przez 0
    # Dzięki temu wektor będzie "prawie zerowy", ale normalizacja zadziała.
    df_nums_array[indeksy_zerowe] += 1e-10

# Upewniamy się, że tablica jest C-contiguous dla FAISS
df_nums_array = np.ascontiguousarray(df_nums_array)
faiss.normalize_L2(df_nums_array)

wymiar = df_nums_array.shape[1]
index = faiss.IndexFlatIP(wymiar)
index.add(df_nums_array) # type: ignore


D, I = find_similar_items(df_nums_array[1], k=5)
display(get_wines_by_indices(I))

print(f"Odległości: {D}")
print(f"Indeksy: {I}")

# Interpretacja odległości kosinusowej (ang. cosine similarity), u nas D to właśnie macierz podobieństw kosinusowych:
# 1.0 = identyczne (kąt 0 stopni).
# 0.0 = wektory prostopadłe (kąt 90 stopni).
# -1.0 = wektory przeciwne (kąt 180 stopni).

["This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'Crisp with acidity and yeasty on the palate with a sourdough tartness, this is a polished bubbly with interesting flavors of limes and strawberries. The mousse is a bit rough. Drink now.',
 "This gorgeous, full-bodied and compelling wine is made from 84% Chardonnay and 16% Pinot Noir. It has a rich golden color, slightly earthy aromas, plus bread dough and white-peach scents that lead to very opulent, sophisticated and nicely aged flavors that range from toasted walnut to honeyed peach to white pepper. While it's already nine years old, this will easily age through 2022.",
 'It reveals aromas of rose, tilled earth, and smoky mineral. The bright palate doles out crushed red cherry and black raspberry accented with herbs, white pepper and clove. Juicy flavor

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
49094,US,Crisp with acidity and yeasty on the palate wi...,Cuvée 20,87,28.0,California,Sonoma County,Sonoma,,,...,-0.018066,-0.003351,-0.019404,-0.022084,0.003461,-0.022179,-0.010955,0.021783,0.019451,0.006446
15188,US,"This gorgeous, full-bodied and compelling wine...",J. Schram,95,120.0,California,North Coast,North Coast,Jim Gordon,@gordone_cellars,...,-0.043883,0.028384,-0.024704,-0.014055,-0.022899,0.002964,0.004055,0.020119,-0.02932,0.003382
2519,Italy,"It reveals aromas of rose, tilled earth, and s...",Cannubi,94,70.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,...,0.007958,-0.005746,-0.046806,0.01841,0.003055,0.077244,-0.012154,-0.010603,-0.03404,0.01191
91725,US,"Salmon-pink in color, this 100% Chambourcin ro...",,86,18.0,Virginia,Virginia,,Alexander Peartree,,...,-0.021176,0.005219,-0.00789,0.020017,-0.003114,-0.00539,-0.038563,-0.002174,0.008372,0.00443


Odległości: [[0.99999994 0.9999931  0.9999919  0.9999915  0.9999907 ]]
Indeksy: [[    1 49094 15188  2519 91725]]
