# Data loading

In [2]:
import pandas as pd
df = pd.read_csv("winemag-data-130k-v2.csv", index_col=0)
df = df.drop(["taster_twitter_handle"], axis=1)

In [6]:
display(df.head())
display(df.at[0, "description"])

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

# Tokenizing

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

An attempt to create a sensible tokenizer

In [None]:
import spacy
import string
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")

stop_words_set = set(stopwords.words('english'))
my_custom_stops = {",", "the", "aromas", "\n", "palate", "it", "that", "this", "some", " ", "and", "a", "notes", "rind", "way", "bottling", "you", "typical", "finish", "case", "mouth", "food", "an", "flavor", "its", "touch", "wine", '(', '"', ')', '$', "]", "{", "}", "[", "-", "_", "&", "\s", "em", "s", "”", "—" }
stop_words_set.update(my_custom_stops)
stop_words_set.update(string.punctuation)

def tokenize_with_phrases(text):
  text = text.lower()
  doc = nlp(text)
  results = set()

  # noun_chunk is a combination of adjective and noun for example "tropical friut"
  for chunk in doc.noun_chunks:
    # We change "tropical fruit" into "tropical_fruit" at the same time we want to rember the word "fruit"
    words = chunk.text.split()
    tokens = [token for token in words if token not in my_custom_stops]
    if(len(tokens) == 0):
      continue

    for token in chunk:
      if token.pos_ in ["NOUN", "PROPN"] and token.text not in my_custom_stops:
        results.add(token.text)

    # there is problem where a token is ""
    results.add("_".join(tokens))
  display(results)
  return results

text_example =  """
Aromas include tropical fruit, broom, brimstone and dried herb.
The palate isn't overly expressive, offering unripened apple,
citrus and dried sage alongside brisk acidity.
"""
tokens = tokenize_with_phrases(text_example)


  my_custom_stops = {"the", "aromas", "\n", "palate", "it", "that", "this", "some", " ", "and", "a", "notes", "rind", "way", "bottling", "you", "typical", "finish", "case", "mouth", "food", "an", "flavor", "its", "touch", "wine", '(', '"', ')', '$', "]", "{", "}", "[", "-", "_", "&", "\s", "em", "s", "”", "—" }


{'acidity',
 'apple',
 'brimstone',
 'brisk_acidity',
 'broom',
 'citrus',
 'dried_herb',
 'dried_sage',
 'fruit',
 'herb',
 'sage',
 'tropical_fruit',
 'unripened_apple'}

In [None]:
n = 10
df_description = df['description'].head(n)
display(df_description)
df_description['informative_set'] = [tokenize_with_phrases(description) for description in df_description]
display(df_description)

Unnamed: 0,description
0,"Aromas include tropical fruit, broom, brimston..."
1,"This is ripe and fruity, a wine that is smooth..."
2,"Tart and snappy, the flavors of lime flesh and..."
3,"Pineapple rind, lemon pith and orange blossom ..."
4,"Much like the regular bottling from 2012, this..."
5,Blackberry and raspberry aromas show a typical...
6,"Here's a bright, informal red that opens with ..."
7,This dry and restrained wine offers spice in p...
8,Savory dried thyme notes accent sunnier flavor...
9,This has great depth of flavor with its fresh ...


{'acidity',
 'apple',
 'brimstone',
 'brisk_acidity',
 'broom',
 'citrus',
 'dried_herb',
 'dried_sage',
 'fruit',
 'herb',
 'sage',
 'tropical_fruit',
 'unripened_apple'}

{'acidity',
 'berry',
 'firm_tannins',
 'fruits',
 'juicy_red_berry_fruits',
 'tannins'}

{'acidity',
 'crisp_acidity',
 'dominate',
 'flavors',
 'flesh',
 'green_pineapple',
 'lime',
 'lime_flesh',
 'pineapple',
 'tart_snappy,_flavors',
 'wine'}

{'blossom',
 'guava',
 'honey',
 'honey-drizzled_guava',
 'lemon',
 'lemon_pith',
 'mango',
 'orange',
 'orange_blossom',
 'pineapple',
 'pith',
 'slightly_astringent,_semidry'}

{'characteristics',
 'companion',
 'country',
 'good_companion',
 'hearty_winter_stew',
 'pleasantly_unfussy_country_wine',
 'regular',
 'rustic,_earthy,_herbal_characteristics',
 'stew',
 'wine',
 'winter'}

{'acidity',
 'blackberry',
 'dark_plum_fruit',
 'flavors',
 'fruit',
 'green_herbs',
 'herbal_flavors',
 'herbs',
 'navarran_whiff',
 'plum',
 'spicy',
 'tomatoey',
 'tomatoey_acidity',
 'whiff'}

{'acidity',
 'berry',
 'bright,_informal_red',
 'candied_berry',
 'fresh_acidity',
 'herb',
 'pepper',
 'red',
 'savory_herb',
 'soft_tannins',
 'tannins',
 'white_pepper'}

{'acidity',
 'dry_restrained_wine',
 'firm_texture',
 'profusion',
 'spice',
 'texture',
 'wine'}

{',_off-dry_wine',
 'accent_sunnier_flavors',
 'elegant,_sprightly_footprint',
 'flavors',
 'footprint',
 'fruity',
 'peach',
 'preserved_peach',
 'sunnier',
 'wine'}

{'acidity',
 'apple',
 'crisp_texture',
 'depth',
 'fresh_apple',
 'fruits',
 'great_depth',
 'spice',
 'texture'}

Unnamed: 0,description
0,"Aromas include tropical fruit, broom, brimston..."
1,"This is ripe and fruity, a wine that is smooth..."
2,"Tart and snappy, the flavors of lime flesh and..."
3,"Pineapple rind, lemon pith and orange blossom ..."
4,"Much like the regular bottling from 2012, this..."
5,Blackberry and raspberry aromas show a typical...
6,"Here's a bright, informal red that opens with ..."
7,This dry and restrained wine offers spice in p...
8,Savory dried thyme notes accent sunnier flavor...
9,This has great depth of flavor with its fresh ...


Combine all the descriptions into one large text and then tokenize it.

In [None]:
def tokenize_with_phrases_large_text(text, text_chunk_size=100000):
  text = text.lower()
  text_chunks = [text[i:i+text_chunk_size] for i in range(0, len(text), text_chunk_size)]
  docs = nlp.pipe(text_chunks, batch_size=50, n_process=1)
  results = set()
  for doc in docs:
    # noun_chunk is a combination of adjective and noun for example "tropical friut"
    for chunk in doc.noun_chunks:
      # We change "tropical fruit" into "tropical_fruit" at the same time we want to rember the word "fruit"
      words = chunk.text.split()
      tokens = [token for token in words if token not in my_custom_stops]
      if(len(tokens) == 0):
        continue

      for token in chunk:
        if token.pos_ in ["NOUN", "PROPN"] and token.text not in my_custom_stops:
          results.add(token.text)

      results.add("_".join(tokens))
    display(results)
  return results

large_text = " ".join(df['description'].values.flatten())
tokens = tokenize_with_phrases_large_text(large_text)
display(tokens)

In [None]:
display(len(tokens))

NameError: name 'tokens' is not defined

In [None]:
import spacy

texts_generator = df['description']

def process_large_dataset(texts_iterable):
    results = set()
    docs = nlp.pipe(texts_iterable, batch_size=50, n_process=1)

    for doc in docs:

        for chunk in doc.noun_chunks:
            tokens_in_chunk = [
                token.text.lower()
                for token in chunk
                if token.text.lower() not in my_custom_stops
            ]

            if not tokens_in_chunk:
                continue
            results.add("_".join(tokens_in_chunk))

            for token in chunk:
                if token.pos_ in ["NOUN", "PROPN"] and token.text.lower() not in my_custom_stops:
                    results.add(token.text.lower())

    return results


final_tokens = process_large_dataset(texts_generator)

print(f"Znaleziono {len(final_tokens)} unikalnych tokenów/fraz.")

Znaleziono 263725 unikalnych tokenów/fraz.


In [None]:
import pickle

with open("final_tokens.pkl", "wb") as f:
    pickle.dump(final_tokens, f)

In [None]:
path = "wyniki.txt"

with open(path, "w", encoding="utf-8") as f:
    for element in sorted(final_tokens):
        f.write(f"{element}\n")