# Data Loading

In [1]:
import pandas as pd

# Read the wine-review dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="winemag-data-130k-v2.csv",
    index_col=0,
)

# Building Embeddings from TF-IDF

In [2]:
# Swap every '-' for '_' so hyphenated compounds (e.g. "stainless-steel")
# survive TF-IDF tokenization as a single word.
raw_descriptions = df['description']
wine_descriptions = raw_descriptions.str.replace('-', '_', regex=False)

# Preview the first five cleaned descriptions.
display(wine_descriptions.head(5).tolist())

["Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",
 "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless_steel fermented.',
 'Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey_drizzled guava and mango giving way to a slightly astringent, semidry finish.',
 "Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy count

Building the stop-word list for TF-IDF

In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extra stop words added on top of scikit-learn's built-in English list.
my_additional_stop_words = {
    'wine', 'drink', 'bottle', 'flavor', 'taste', 'like',
    'nose', 'palate', 'finish', 'aroma', 'notes', 'note',
    'vineyard', 'shows', 'alongside', 'offers', 'feels',
}

# Merge both sets and hand the vectorizer a plain list.
all_stop_words = list(ENGLISH_STOP_WORDS | my_additional_stop_words)

In [4]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from config import *

# Echo the minimum document frequency taken from config before fitting.
display(MIN_WORD_OCCURENCE)

# Build the TF-IDF matrix.
vectorizer = TfidfVectorizer(
    stop_words=all_stop_words,
    ngram_range=(1, 2),  # experiment: unigrams plus bigrams
    min_df=MIN_WORD_OCCURENCE,
    max_df=0.3,
    max_features=50000,
    # token_pattern=r'(?u)\b[\w-]{2,}\b'
)
tfidf_matrix = vectorizer.fit_transform(wine_descriptions)
print(f"Wymiary macierzy TF-IDF: {tfidf_matrix.shape}")

# Vocabulary terms, indexed the same way as the columns of tfidf_matrix.
terms = vectorizer.get_feature_names_out()

# Reduce the sparse TF-IDF matrix to DIMENSIONS dense components.
svd = TruncatedSVD(random_state=RANDOM_STATE, n_components=DIMENSIONS)
embeddings = svd.fit_transform(tfidf_matrix)
print(f"Wymiary embeddingów: {embeddings.shape}")

10

Wymiary macierzy TF-IDF: (129971, 44820)
Wymiary embeddingów: (129971, 128)


In [None]:
# Peek at the first 50 entries of the fitted vectorizer's vocabulary.
terms[:50]

array(['10', '100', '15', '20', '2015', '2016', '2017', '2018', '2019',
       '2020', '2022', '2025', 'accent', 'accented', 'accents', 'acidic',
       'acidity', 'acids', 'add', 'adds', 'aftertaste', 'age', 'aged',
       'aging', 'alcohol', 'almond', 'ample', 'anise', 'appeal',
       'appealing', 'appellation', 'apple', 'apples', 'approachable',
       'apricot', 'aromatic', 'astringent', 'attractive', 'backed',
       'background', 'baked', 'baking', 'balance', 'balanced', 'barrel',
       'bean', 'beautiful', 'beautifully', 'beef', 'berries'],
      dtype=object)

# Saving Embeddings

## Slow way to save embeddings in csv

In [5]:
# One column per SVD component: svd_0, svd_1, ...
col_names = [f"svd_{i}" for i in range(DIMENSIONS)]

# Wrap the embedding matrix in a float32 DataFrame and dump it as text CSV.
df_embeddings = pd.DataFrame(embeddings, columns=col_names).astype('float32')
df_embeddings.to_csv("embeddings_tf_idf_bigrams.csv", index=False)

display(df_embeddings.head())

Unnamed: 0,svd_0,svd_1,svd_2,svd_3,svd_4,svd_5,svd_6,svd_7,svd_8,svd_9,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
0,0.088711,-0.068887,-0.079682,-0.018129,-0.009653,-0.061104,0.007078,-0.007166,-0.035704,0.049989,...,0.00668,0.004052,0.009933,0.009689,-0.018292,0.003419,-0.022119,-0.020756,-0.007694,0.01215
1,0.154284,0.011388,0.158704,-0.078038,-0.090071,0.03702,0.055612,0.041271,-0.082306,-0.029474,...,-0.001607,0.007284,-0.00302,-0.009733,-0.022264,-0.001541,0.029729,0.006127,-0.003681,0.01889
2,0.065466,-0.094751,-0.029274,0.011576,-0.006152,0.008905,0.013507,-0.057787,0.006096,0.060042,...,0.014752,0.023071,0.005033,0.010739,-0.000433,-0.017879,0.004378,-0.004585,-0.01833,0.028156
3,0.051657,-0.056965,-0.044632,0.016064,0.039736,-0.021166,0.016663,-0.00441,-0.004401,-0.002667,...,-0.001929,0.034152,-0.014469,0.001856,-0.001094,-0.018169,-0.007891,-0.019679,-0.028118,-0.008796
4,0.045352,0.010833,-0.007543,0.011982,0.035781,0.040079,0.04908,-0.043692,0.01617,-0.02593,...,-0.008363,-0.020048,0.027488,0.009093,-0.023184,0.012048,0.001023,0.011309,-0.011669,-5e-06


## Efficient way to save embeddings

In [6]:
import numpy as np

# Downcast to float32 (no copy is made if the array already has that dtype);
# note that this rebinds `embeddings` for every later cell.
embeddings = embeddings.astype(np.float32, copy=False)

# Store the matrix in NumPy's binary .npy format.
np.save(file="embeddings_tf_idf_bigrams.npy", arr=embeddings)

## Comparing embedding file sizes

In [None]:
import os
import humanize

# Compare the on-disk size of the two embedding files this notebook writes
# (binary .npy vs. text .csv). The paths must match the names used when saving,
# otherwise a fresh run fails or reports sizes of unrelated, stale files.
for p in ["embeddings_tf_idf_bigrams.npy", "embeddings_tf_idf_bigrams.csv"]:
    size = os.path.getsize(p)
    # binary=False -> decimal units (kB/MB) rather than KiB/MiB.
    print(p, humanize.naturalsize(size, binary=False))

embeddings.npy 66.5 MB
embeddings.csv 202.7 MB


# Concatenating with df and saving

In [None]:
# Give both frames a fresh 0..n-1 index so the column-wise concat lines them up
# row by row instead of aligning on the original index labels.
df, df_embeddings = (
    frame.reset_index(drop=True) for frame in (df, df_embeddings)
)

# Append the embedding columns to the review columns and persist the result.
df_final = pd.concat(objs=[df, df_embeddings], axis=1)
df_final.to_csv(EMBEDED_FILEPATH, index=False)

# Building Embeddings from TF-IDF using a pipeline

In [None]:
# TF-IDF followed by TruncatedSVD, expressed as a single scikit-learn Pipeline.
# NOTE: the settings differ from the step-by-step cell: this variant enables a
# hyphen-aware token_pattern and does not set ngram_range.
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                max_features=50000,
                min_df=MIN_WORD_OCCURENCE,
                stop_words=all_stop_words,
                max_df=0.3,
                token_pattern=r'(?u)\b[\w-]{2,}\b',
            ),
        ),
        # The step is labelled 'pca' but the estimator is TruncatedSVD.
        (
            'pca',
            TruncatedSVD(n_components=DIMENSIONS, random_state=RANDOM_STATE),
        ),
    ]
)

# Fit both steps and obtain the reduced representation in one call.
svd_matrix = pipeline.fit_transform(wine_descriptions)
print("Shape of SVD matrix:", svd_matrix.shape)

Shape of SVD matrix: (129971, 128)


# Look for the most "informative" words in clusters using K-Means to check how well TF-IDF dealt with noise words (e.g. this, that)

In [None]:
import numpy as np
from sklearn.cluster import KMeans

def get_top_words(k_final, num_words=10):
    """Cluster the embeddings with K-Means and print each cluster's top TF-IDF terms.

    Reads the notebook-level ``embeddings``, ``tfidf_matrix``, ``terms`` and
    ``RANDOM_STATE``. For every cluster it averages the TF-IDF rows of the
    member documents and lists the terms with the highest mean weight.

    Args:
        k_final: Number of K-Means clusters to fit.
        num_words: Number of highest-weighted terms to list per cluster.
            ``0`` lists no terms.

    Returns:
        None. The report is written to stdout.
    """
    kmeans_final = KMeans(n_clusters=k_final, random_state=RANDOM_STATE, n_init=10)
    predicted_labels = kmeans_final.fit_predict(embeddings)

    # The cluster labels come from the embeddings, but the words are read from
    # the TF-IDF matrix, whose columns map directly onto vocabulary terms.
    for i in range(k_final):
        cluster_mask = (predicted_labels == i)
        cluster_tfidf_matrix = tfidf_matrix[cluster_mask]  # type: ignore

        # Cluster centroid in TF-IDF space: mean weight of each term over the
        # documents assigned to cluster i.
        cluster_centroid = np.asarray(cluster_tfidf_matrix.mean(axis=0)).flatten()

        # Sort descending first, then truncate. Slicing with [-num_words:]
        # would select *every* term when num_words == 0, because [-0:] is the
        # whole array.
        top_indices = cluster_centroid.argsort()[::-1][:num_words]
        top_terms = [terms[idx] for idx in top_indices]

        print(f"\nCluster {i}:")
        print(f"  (Liczba dokumentów: {np.sum(cluster_mask)})")
        print(f"  Top {num_words} terminów: {', '.join(top_terms)}")  # type: ignore

In [None]:
# Report the 15 highest-weighted terms for each of 20 K-Means clusters.
get_top_words(20, 15)


Cluster 0:
  (Liczba dokumentów: 5428)
  Top 15 terminów: crisp, acidity, fruity, fresh, bright, ready, aftertaste, character, texture, lively, attractive, fruits, light, ripe, tight

Cluster 1:
  (Liczba dokumentów: 6019)
  Top 15 terminów: cherry, tannins, red, berry, black, spice, clove, licorice, dried, leather, herb, firm, opens, raspberry, pepper

Cluster 2:
  (Liczba dokumentów: 5615)
  Top 15 terminów: berry, plum, spice, red, wild, cherry, tannins, earthy, oak, good, spicy, tannic, fruits, bright, baked

Cluster 3:
  (Liczba dokumentów: 3796)
  Top 15 terminów: bit, cherry, tannins, sweet, black, good, oak, red, dry, plum, green, spice, tart, raspberry, ripe

Cluster 4:
  (Liczba dokumentów: 8379)
  Top 15 terminów: black, cherry, pepper, tannins, plum, dark, spice, chocolate, currant, licorice, red, ripe, dried, rich, bottling

Cluster 5:
  (Liczba dokumentów: 5276)
  Top 15 terminów: soft, ripe, acidity, fruity, tannins, red, ready, fruits, texture, attractive, rounded, smo