# Data Loading

In [31]:
import pandas as pd
# Load the winemag dataset; index_col=0 uses the first CSV column as the DataFrame index.
df = pd.read_csv("winemag-data-130k-v2.csv", index_col=0)

## Building Embeddings from TF-IDF

In [None]:
# Swap every '-' for '_' in the raw description texts. '_' is a word character,
# so hyphenated compounds (e.g. "stainless-steel") survive TF-IDF tokenization
# as a single token instead of being split in two.
wine_descriptions = [text.replace('-', '_') for text in df['description'].tolist()]
display(wine_descriptions[:5])

["Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",
 "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless_steel fermented.',
 'Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey_drizzled guava and mango giving way to a slightly astringent, semidry finish.',
 "Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy count

Building the stop-word list for TF-IDF

In [38]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Candidate domain-specific stop words: generic tasting-note vocabulary.
# NOTE(review): this set is currently not applied -- the union below is commented out.
my_additional_stop_words = {'wine', 'drink', 'bottle', 'flavor', 'taste', 'like', 'nose', 'palate', 'finish', 'aroma', 'notes', 'note', 'vineyard', 'shows', 'alongside', 'offers', 'feels'}
#all_stop_words = ENGLISH_STOP_WORDS.union(my_additional_stop_words)
# Only scikit-learn's built-in English stop-word list is kept for now.
# NOTE(review): the vectorizers in the cells visible here pass stop_words='english'
# rather than all_stop_words, so this variable looks unused -- confirm.
all_stop_words = ENGLISH_STOP_WORDS

In [48]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): RANDOM_STATE used below is not defined in the visible cells;
# presumably it is provided by this wildcard import -- confirm, and prefer an
# explicit `from config import RANDOM_STATE`.
from config import *

# Number of SVD components, i.e. the dimensionality of the final embeddings.
dim_num = 128

# Build the TF-IDF matrix
vectorizer = TfidfVectorizer(
    max_features=50000,  # upper bound on the vocabulary size
    min_df=6,  # ignore terms that appear in fewer than 6 documents
    stop_words='english',  # scikit-learn's built-in English stop-word list
    max_df=0.3,  # ignore terms that appear in more than 30% of documents
    #ngram_range=(1, 2) # bigram experiment
    )
tfidf_matrix = vectorizer.fit_transform(wine_descriptions)
# Vocabulary terms, aligned with the columns of tfidf_matrix.
terms = vectorizer.get_feature_names_out()

print(f"Wymiary macierzy TF-IDF: {tfidf_matrix.shape}")
# Reduce the TF-IDF matrix to dim_num dense components per document.
svd = TruncatedSVD(n_components=dim_num, random_state=RANDOM_STATE)
embeddings = svd.fit_transform(tfidf_matrix)

print(f"Wymiary embeddingów: {embeddings.shape}")


Wymiary macierzy TF-IDF: (129971, 12377)
Wymiary embeddingów: (129971, 128)


Saving Embeddings

In [52]:
# One column per SVD component: svd_0 ... svd_{dim_num - 1}.
col_names = ["svd_" + str(component) for component in range(dim_num)]

# Wrap the embedding array in a float32 DataFrame and persist it without the index.
df_embeddings = pd.DataFrame(embeddings, columns=col_names).astype('float32')
df_embeddings.to_csv("embeddings.csv", index=False)

display(df_embeddings.head())

Unnamed: 0,svd_0,svd_1,svd_2,svd_3,svd_4,svd_5,svd_6,svd_7,svd_8,svd_9,...,svd_118,svd_119,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127
0,0.135796,-0.017455,0.147128,0.093195,-0.065306,0.053321,-0.000996,-0.012069,-0.018078,0.057595,...,0.029798,0.017682,-0.022647,0.004759,0.012863,-0.043871,-0.042991,-0.003018,0.010959,0.011615
1,0.237418,-0.041084,-0.201485,0.149151,0.033628,-0.056647,-0.055995,0.03793,-0.130253,-0.10381,...,0.002486,0.005251,0.040565,-0.011704,-0.002637,0.012367,-0.018191,-0.003912,0.031842,-0.010622
2,0.079108,-0.101362,0.064159,-0.020764,0.00891,0.008126,-0.034205,-0.061478,-0.002667,0.060124,...,-0.006305,-0.006726,0.027798,-0.004193,-0.048093,-0.010835,0.045763,0.016072,-0.026057,0.03303
3,0.108375,-0.031683,0.125713,-0.06914,-0.066465,-0.033419,-0.024513,0.010203,0.012309,-0.013557,...,-0.02076,0.017468,0.018908,-0.02481,0.016703,-0.013988,0.06241,0.003649,-0.027548,0.035012
4,0.056858,0.016821,0.003338,-0.074715,0.016867,-0.062519,-0.035536,-0.030929,0.00714,-0.020642,...,-0.01186,-0.006513,0.016246,0.003879,-0.013565,-0.035242,-0.014264,0.005369,-0.000167,0.021132


Concatenating the embeddings with the original DataFrame and saving

In [53]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position rather than by whatever labels they carried before.
df_embeddings = df_embeddings.reset_index(drop=True)
df = df.reset_index(drop=True)

# pd.concat(axis=1) outer-joins on the index: if the row counts differ (e.g. the
# embeddings were computed from a stale or filtered frame), the shorter side is
# silently padded with NaN. Fail loudly instead of writing a corrupt CSV.
if len(df) != len(df_embeddings):
    raise ValueError(
        f"Row count mismatch: df has {len(df)} rows, "
        f"df_embeddings has {len(df_embeddings)}"
    )

# Original review columns followed by the svd_* embedding columns.
df_final = pd.concat([df, df_embeddings], axis=1)
df_final.to_csv("winemag-data-130k-v2-tfidf-svd.csv", index=False)

In [35]:
# Pipeline variant of the TF-IDF -> TruncatedSVD embedding step.
# NOTE(review): not an exact replica of the step-by-step cell -- this one uses
# min_df=4 (vs 6) and a custom token_pattern, so the vocabulary can differ.
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    (
        'tfidf',
        # Tokens are runs of 2+ word characters or hyphens.
        TfidfVectorizer(
            max_features=50000,
            min_df=4,
            max_df=0.3,
            stop_words='english',
            token_pattern=r'(?u)\b[\w-]{2,}\b',
        ),
    ),
    (
        # The step is registered as 'pca' although the estimator is a TruncatedSVD.
        'pca',
        TruncatedSVD(n_components=128, random_state=RANDOM_STATE),
    ),
])

# Fit both steps on the descriptions and get the reduced document matrix.
svd_matrix = pipeline.fit_transform(wine_descriptions)

# Report the (documents, components) shape of the result.
print("Shape of SVD matrix:", svd_matrix.shape)

Shape of SVD matrix: (129971, 128)


# Look for the most "informative" words in each cluster using K-Means to see how well TF-IDF dealt with noise words (e.g. "this", "that")

In [40]:
import numpy as np
from sklearn.cluster import KMeans

def get_top_words(k_final, num_words=10):
    """Cluster the embeddings with K-Means and print each cluster's top TF-IDF terms.

    Reads the notebook-level variables ``embeddings``, ``tfidf_matrix`` and
    ``terms``. Clustering is run on ``embeddings``; the predicted labels are then
    used to select rows of ``tfidf_matrix``, whose columns map back to words, and
    terms are ranked by their mean TF-IDF weight within the cluster.

    Args:
        k_final: Number of K-Means clusters.
        num_words: Number of top-ranked terms to print per cluster.

    Returns:
        None. The per-cluster report is printed.
    """
    km = KMeans(n_clusters=k_final, random_state=42, n_init=10)
    labels = km.fit_predict(embeddings)

    for i in range(k_final):
        cluster_mask = labels == i
        # TF-IDF rows of the documents assigned to cluster i.
        member_rows = tfidf_matrix[cluster_mask]  # type: ignore

        # Cluster centroid in term space: the mean TF-IDF weight of every term.
        mean_weights = np.asarray(member_rows.mean(axis=0)).ravel()

        # Ascending sort; take the last num_words positions and reverse them so
        # the heaviest term comes first.
        ascending = mean_weights.argsort()
        top_terms = [terms[pos] for pos in ascending[-num_words:][::-1]]

        print(f"\nCluster {i}:")
        print(f"  (Liczba dokumentów: {np.sum(cluster_mask)})")
        print(f"  Top {num_words} terminów: {', '.join(top_terms)}") # type: ignore

In [41]:
# Report the 15 highest-weighted terms for each of 20 K-Means clusters.
get_top_words(20, 15)


Cluster 0:
  (Liczba dokumentów: 6384)
  Top 15 terminów: acidity, crisp, fruity, character, drink, aftertaste, fruits, ripe, texture, attractive, fresh, soft, bright, tight, citrus

Cluster 1:
  (Liczba dokumentów: 5728)
  Top 15 terminów: berry, plum, herbal, finish, feels, palate, earthy, notes, oaky, oak, rubbery, spicy, baked, tannic, roasted

Cluster 2:
  (Liczba dokumentów: 5695)
  Top 15 terminów: sweet, acidity, finish, soft, ripe, like, vanilla, honey, rich, cherry, orange, spice, tastes, notes, palate

Cluster 3:
  (Liczba dokumentów: 3427)
  Top 15 terminów: ready, drink, acidity, fruity, ripe, fruits, soft, crisp, attractive, light, character, texture, fresh, rich, tannins

Cluster 4:
  (Liczba dokumentów: 2730)
  Top 15 terminów: simple, sweet, soft, cherry, fresh, finish, acidity, clean, citrus, raspberry, white, palate, fruity, light, peach

Cluster 5:
  (Liczba dokumentów: 3974)
  Top 15 terminów: pinot, noir, cherry, silky, cola, dry, raspberry, acidity, drink, cherr