# Data Loading

In [7]:
import pandas as pd
# Load the wine reviews CSV; the file's first column becomes the DataFrame index.
df = pd.read_csv(
    filepath_or_buffer="winemag-data-130k-v2.csv",
    index_col=0,
)

In [10]:
# Plain Python list holding the `description` column, in row order.
wine_descriptions = df.loc[:, 'description'].tolist()

## Check how accurately other models find the most similar descriptions

# Sentence Transformers (SBERT) with models all-mpnet-base-v2, all-MiniLM-L6-v2

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Encode every wine description with the all-mpnet-base-v2 sentence-transformer
# and write the resulting float32 matrix to disk.
wine_descriptions = df.loc[:, 'description'].tolist()

model = SentenceTransformer('all-mpnet-base-v2')

# copy=False: no extra copy is made when the encoder output is already float32.
embeddings = model.encode(wine_descriptions).astype(np.float32, copy=False)
np.save("embeddings_all-mpnet-base-v2.npy", embeddings)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Embed every wine description with all-mpnet-base-v2. Encoding the whole
# corpus can be slow, so a saved array that matches the current data is reused
# instead of being recomputed (and overwritten) on every run.
from pathlib import Path

wine_descriptions = df['description'].tolist()

# The model is always loaded so later cells can keep using `model`.
model = SentenceTransformer('all-mpnet-base-v2')

embeddings_path = Path("embeddings_all-mpnet-base-v2.npy")
try:
    embeddings = np.load(embeddings_path) if embeddings_path.exists() else None
except (OSError, ValueError, EOFError):
    # An unreadable file (e.g. left behind by an interrupted save) is not a usable cache.
    embeddings = None

# One row per description is required; any other shape was produced from different data.
if embeddings is None or embeddings.shape[:1] != (len(wine_descriptions),):
    embeddings = model.encode(wine_descriptions)
    embeddings = embeddings.astype(np.float32, copy=False)
    np.save(embeddings_path, embeddings)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# OpenAI API embeddings

Check how accurate embeddings from an LLM provider such as OpenAI will be.

In [2]:
%pip install openai

Collecting openai
  Using cached openai-2.15.0-py3-none-any.whl.metadata (29 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.12.1-py3-none-any.whl.metadata (4.3 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.10.0 (from openai)
  Using cached jiter-0.12.0-cp313-cp313-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Collecting annotated-types>=0.6.0 (fr


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import openai
import numpy as np
from tqdm import tqdm  # Pasek postępu
from private import API_KEY

# Settings for the embedding requests.
MODEL = "text-embedding-3-small"
# Number of texts sent per request.
# NOTE(review): confirm this stays within the API's per-request input limits.
BATCH_SIZE = 2000

# API client; the key comes from the local `private` module instead of being written in the notebook.
client = openai.OpenAI(api_key=API_KEY)

def get_embeddings_batched(texts, model=MODEL, batch_size=BATCH_SIZE):
    """Embed ``texts`` through the embeddings endpoint, one request per batch.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request; must be positive.

    Returns:
        A list of embedding vectors in the same order as ``texts``. Processing
        stops at the first failing batch, so the list can be shorter than
        ``texts``; a message reporting the shortfall is printed in that case.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which used to return [] without
        # embedding anything; a zero step raised an unrelated range() error.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]

        try:
            # Newlines are flattened to spaces before the text is sent.
            batch = [text.replace("\n", " ") for text in batch]

            response = client.embeddings.create(input=batch, model=model)

            # Each returned item carries the position of its input; sorting on it
            # keeps the vectors aligned with `batch` whatever order they arrive in.
            ordered = sorted(response.data, key=lambda item: item.index)
            all_embeddings.extend(item.embedding for item in ordered)

        except Exception as e:
            # Best effort: keep what was already embedded and stop at the failing batch.
            print(f"Błąd przy paczce {i}: {e}")
            break

    if len(all_embeddings) != len(texts):
        # Make the truncation visible so the caller does not treat the result as complete.
        print(
            f"Warning: only {len(all_embeddings)} of {len(texts)} texts were embedded; "
            "the returned list is incomplete."
        )

    return all_embeddings

# Embed every description, then persist the vectors only if the run was complete.
print(f"Embeddings from wine_descriptions {len(wine_descriptions)}")
embeddings_list = get_embeddings_batched(wine_descriptions)

# get_embeddings_batched stops at the first failing batch and returns what it has,
# so a shorter list means the rows would no longer line up with wine_descriptions.
if len(embeddings_list) != len(wine_descriptions):
    raise RuntimeError(
        f"Only {len(embeddings_list)} of {len(wine_descriptions)} descriptions were embedded; "
        "not saving a truncated array (partial results remain in embeddings_list)."
    )

# Build the array as float32 directly instead of materialising a float64 copy first.
embeddings = np.array(embeddings_list, dtype=np.float32)
np.save("embeddings_open_ai_api.npy", embeddings)


Embeddings from wine_descriptions 129971


In [12]:
# Write whatever `embeddings` currently holds to the OpenAI embeddings file
# (the same target path the embedding cell above saves to).
np.save(file="embeddings_open_ai_api.npy", arr=embeddings)