In [1]:
!pip install gensim
!pip install nltk



In [2]:
import os
import pandas as pd
import numpy as np
import string
# from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
from nltk.tokenize import sent_tokenize

sample_text = "This is a test. Let's see if this works."
print(sent_tokenize(sample_text))


['This is a test.', "Let's see if this works."]


In [5]:
import kagglehub

# Download dataset
path = kagglehub.dataset_download("zynicide/wine-reviews")
print("Path to dataset files:", path)

# List files in the dataset directory
print(os.listdir(path))

Downloading from https://www.kaggle.com/api/v1/datasets/download/zynicide/wine-reviews?dataset_version_number=4...


100%|██████████| 50.9M/50.9M [00:00<00:00, 65.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/zynicide/wine-reviews/versions/4
['winemag-data-130k-v2.csv', 'winemag-data-130k-v2.json', 'winemag-data_first150k.csv']


First, import the wine dataset.

In [6]:
# Path to the CSV file
csv_path = os.path.join(path, "winemag-data-130k-v2.csv")

# Load the CSV file
df_wine = pd.read_csv(csv_path)

# Preview the data
print(df_wine.head())
print(df_wine.columns)
print(df_wine.info())
print(df_wine.isnull().sum())

   Unnamed: 0   country                                        description  \
0           0     Italy  Aromas include tropical fruit, broom, brimston...   
1           1  Portugal  This is ripe and fruity, a wine that is smooth...   
2           2        US  Tart and snappy, the flavors of lime flesh and...   
3           3        US  Pineapple rind, lemon pith and orange blossom ...   
4           4        US  Much like the regular bottling from 2012, this...   

                          designation  points  price           province  \
0                        Vulkà Bianco      87    NaN  Sicily & Sardinia   
1                            Avidagos      87   15.0              Douro   
2                                 NaN      87   14.0             Oregon   
3                Reserve Late Harvest      87   13.0           Michigan   
4  Vintner's Reserve Wild Child Block      87   65.0             Oregon   

              region_1           region_2         taster_name  \
0              

Then, the food dataset.

In [7]:
# Download latest version
path2 = kagglehub.dataset_download("snap/amazon-fine-food-reviews")

print("Path to dataset files:", path2)

Downloading from https://www.kaggle.com/api/v1/datasets/download/snap/amazon-fine-food-reviews?dataset_version_number=2...


100%|██████████| 242M/242M [00:05<00:00, 47.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2


In [8]:
# Path to the CSV file
csv_path2 = os.path.join(path2, "Reviews.csv")

# Load the CSV file
df_food = pd.read_csv(csv_path2)

# Preview the data
print(df_food.head())
print(df_food.columns)
print(df_food.info())
print(df_food.isnull().sum())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

### 1. Training our Word Embeddings

First, we need to train a Word2Vec model on all the words in our corpus. We will process our wine and food terms separately - some of the wine terms will be standardized to account for commonalities in the colorful language of the world of wine.

In [9]:
wine_reviews_list = list(df_wine['description'])
food_reviews_list = list(df_food['Text'])

To begin, we need to tokenize the terms in our corpus (wine and food).

In [10]:
full_wine_reviews_list = [str(r) for r in wine_reviews_list]
full_wine_corpus = ' '.join(full_wine_reviews_list)
wine_sentences_tokenized = sent_tokenize(full_wine_corpus)

full_food_reviews_list = [str(r) for r in food_reviews_list]
full_food_corpus = ' '.join(full_food_reviews_list)
food_sentences_tokenized = sent_tokenize(full_food_corpus)

print(wine_sentences_tokenized[:2])
print(food_sentences_tokenized[:2])

['Aromas include tropical fruit, broom, brimstone and dried herb.', "The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."]
['I have bought several of the Vitality canned dog food products and have found them all to be of good quality.', 'The product looks more like a stew than a processed meat and it smells better.']


Next, the text in each sentence is normalized (tokenize, remove punctuation and remove stopwords).

In [11]:
stop_words = set(stopwords.words('english'))

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

normalized_wine_sentences = []
for s in wine_sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_wine_sentences.append(normalized_text)

normalized_food_sentences = []
for s in food_sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_food_sentences.append(normalized_text)

Not all of the terms we are interested in are single words. Some of the terms are phrases, consisting of two (or more!) words. An example of this might be 'high tannin'. We can use gensim's Phrases feature to extract all the most relevant bi- and tri-grams from our corpus.

We will train a separate trigram model for wine and for food.

In [13]:
# first, take care of the wine trigrams
wine_bigram_model = Phrases(normalized_wine_sentences, min_count=100)
wine_bigrams = [wine_bigram_model[line] for line in normalized_wine_sentences]
wine_trigram_model = Phrases(wine_bigrams, min_count=50)
phrased_wine_sentences = [wine_trigram_model[line] for line in wine_bigrams]
wine_trigram_model.save('wine_trigrams.pkl')


In [14]:
### now, do the same for food
food_bigram_model = Phrases(normalized_food_sentences, min_count=100)
food_bigrams = [food_bigram_model[sent] for sent in normalized_food_sentences]
food_trigram_model = Phrases(food_bigrams, min_count=50)
phrased_food_sentences = [food_trigram_model[sent] for sent in food_bigrams]
food_trigram_model.save('food_trigrams.pkl')

If the trigram model has already been trained, simply retrieve it.

In [15]:
wine_trigram_model = Phraser.load('wine_trigrams.pkl')
food_trigram_model = Phraser.load('food_trigrams.pkl')

Now for the most important part: leveraging existing wine theory, the work of others like Bernard Chen, wine descriptor mappings and the UC Davis wine wheel, the top 5000 most frequent wine terms were reviewed to (i) determine whether they are a descriptor that can be derived by blind tasting, and (ii) whether they are informative (judgments like 'tasty' and 'great' are not considered to be informative). The roughly 1000 descriptors that remain were then mapped onto a normalized descriptor, a category and a class:

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
descriptor_mapping = pd.read_csv('/content/drive/My Drive/Final Project/wine_food_pairing/descriptor_mapping.csv', encoding='latin1').set_index('raw descriptor')

def return_mapped_descriptor(word, mapping):
    if word in list(mapping.index):
        normalized_word = mapping.at[word, 'level_3']
        return normalized_word
    else:
        return word

normalized_wine_sentences = []
for sent in phrased_wine_sentences:
    normalized_wine_sentence = []
    for word in sent:
        normalized_word = return_mapped_descriptor(word, descriptor_mapping)
        normalized_wine_sentence.append(str(normalized_word))
    normalized_wine_sentences.append(normalized_wine_sentence)

We will go through the same process for food, but without normalizing the nonaroma descriptors.

In [19]:
aroma_descriptor_mapping = descriptor_mapping.loc[descriptor_mapping['type'] == 'aroma']
normalized_food_sentences = []
for sent in phrased_food_sentences:
    normalized_food_sentence = []
    for word in sent:
        normalized_word = return_mapped_descriptor(word, aroma_descriptor_mapping)
        normalized_food_sentence.append(str(normalized_word))
    normalized_food_sentences.append(normalized_food_sentence)

Now, let's combine the wine dataset with our food dataset so we can train our embeddings. We want to make sure that the food and wine embeddings are calculated in the same feature space so that we can compute similarity vectors later on.

In [20]:
normalized_sentences = normalized_wine_sentences + normalized_food_sentences

We are ready to train our Word2Vec model!

In [None]:
wine_word2vec_model = Word2Vec(
    sentences=normalized_sentences,
    vector_size=300,  # Use vector_size instead of size
    min_count=8,
    epochs=15  # Use epochs instead of iter
)

print(wine_word2vec_model)

# Save the model
wine_word2vec_model.save('/content/drive/My Drive/Final Project/wine_food_pairing/food_word2vec_model.bin')
wine_word2vec_model.save('food_word2vec_model.bin')

In [40]:
wine_word2vec_model.save('food_word2vec_model.bin')

In [41]:
# if the word2vec model has already been trained, simply load it
wine_word2vec_model = Word2Vec.load("food_word2vec_model.bin")

### 2. Preprocessing our Wine Dataset

We can now turn our attention to our wine dataset. Descriptions for a single wine are unlikely to contain sufficient information about all the nonaromas and aromas to yield consistent and reliable pairing recommendations. As such, we will produce recommendations at the grape variety & subregion level.

First, let's normalize the names of the grape varieties in our dataset.

In [42]:
# Display the unique values in the 'variety' column
unique_varieties = df_wine['variety'].unique()
print(f"Unique grape varieties ({len(unique_varieties)} total):")
print(unique_varieties)

Unique grape varieties (708 total):
['White Blend' 'Portuguese Red' 'Pinot Gris' 'Riesling' 'Pinot Noir'
 'Tempranillo-Merlot' 'Frappato' 'Gewürztraminer' 'Cabernet Sauvignon'
 'Nerello Mascalese' 'Chardonnay' 'Malbec' 'Tempranillo Blend' 'Meritage'
 'Red Blend' 'Merlot' "Nero d'Avola" 'Chenin Blanc' 'Gamay'
 'Sauvignon Blanc' 'Viognier-Chardonnay' 'Primitivo' 'Catarratto'
 'Inzolia' 'Petit Verdot' 'Monica' 'Bordeaux-style White Blend' 'Grillo'
 'Sangiovese' 'Cabernet Franc' 'Champagne Blend'
 'Bordeaux-style Red Blend' 'Aglianico' 'Petite Sirah' 'Touriga Nacional'
 'Carmenère' 'Albariño' 'Petit Manseng' 'Rosé' 'Zinfandel' 'Vernaccia'
 'Rosato' 'Grüner Veltliner' 'Viognier' 'Vermentino' 'Grenache Blanc'
 'Syrah' 'Nebbiolo' 'Shiraz-Cabernet Sauvignon' 'Pinot Blanc'
 'Alsace white blend' 'Barbera' 'Rhône-style Red Blend' 'Portuguese White'
 'Graciano' 'Tannat-Cabernet' 'Sauvignon' 'Sangiovese Grosso' 'Torrontés'
 'Prugnolo Gentile' 'G-S-M' 'Verdejo' 'Fumé Blanc' 'Furmint'
 'Pinot Bianco'

In [43]:
variety_mapping = {
    # Whites
    'Pinot Gris': 'Pinot Grigio',
    'Pinot Grigio/Gris': 'Pinot Grigio',
    'Grüner Veltliner': 'Gruner Veltliner',
    'Fumé Blanc': 'Sauvignon Blanc',
    'Garganega': 'Soave',
    'Verdejo-Viura': 'Verdejo',
    'Riesling-Chardonnay': 'White Blend',
    'Sauvignon Blanc-Semillon': 'White Bordeaux Blend',
    'Semillon-Sauvignon Blanc': 'White Bordeaux Blend',
    'Trebbiano Spoletino': 'Trebbiano',
    'Trebbiano di Lugana': 'Trebbiano',
    'Malvasia Bianca': 'Malvasia',
    'Verdelho': 'Verdelho',
    'Picpoul': 'Piquepoul',
    'Alvarinho': 'Albarino',
    'Verdicchio': 'Verdicchio',
    'Marsanne-Roussanne': 'Rhone White Blend',
    'Chardonnay-Sauvignon Blanc': 'White Blend',
    'Sauvignon Blanc-Chenin Blanc': 'White Blend',
    'Chenin Blanc-Chardonnay': 'White Blend',
    'Viognier-Chardonnay': 'White Blend',
    'Grenache Blanc': 'Rhone White Blend',
    'Assyrtiko': 'Assyrtiko',
    'Müller-Thurgau': 'Muller-Thurgau',
    'Sylvaner': 'Silvaner',
    'Zibibbo': 'Muscat of Alexandria',
    'Muscat Blanc à Petits Grains': 'Muscat',
    'Prosecco': 'Glera',
    'Pinot Bianco': 'Pinot Blanc',
    'Sémillon': 'Semillon',

    # Reds
    'Shiraz': 'Syrah',
    'Syrah-Grenache': 'Rhone Red Blend',
    'Grenache-Syrah': 'Rhone Red Blend',
    'Garnacha': 'Grenache',
    'Cabernet Sauvignon-Merlot': 'Bordeaux Blend',
    'Merlot-Cabernet Sauvignon': 'Bordeaux Blend',
    'Petit Verdot': 'Petit Verdot',
    'Tempranillo-Cabernet Sauvignon': 'Tempranillo Blend',
    'Malbec-Cabernet Franc': 'Bordeaux Blend',
    'Tinta del Pais': 'Tempranillo',
    'Tinta Fina': 'Tempranillo',
    'Aragonês': 'Tempranillo',
    'Cabernet Sauvignon-Syrah': 'Cabernet-Syrah Blend',
    'Cabernet Sauvignon-Carmenère': 'Cabernet-Carmenere Blend',
    'Monastrell': 'Mourvedre',
    'Zinfandel': 'Primitivo',
    'Blaufränkisch': 'Blaufrankisch',
    'Pinot Nero': 'Pinot Noir',
    'Spätburgunder': 'Pinot Noir',
    'Ribolla Gialla': 'Ribolla Gialla',
    'Frappato': 'Frappato',
    'Nero d\'Avola': 'Nero d\'Avola',
    'Aglianico': 'Aglianico',
    'Barbera-Nebbiolo': 'Barbera',
    'Cesanese d\'Affile': 'Cesanese',
    'Lagrein': 'Lagrein',

    # Sparkling
    'Champagne Blend': 'Champagne',
    'Sparkling Blend': 'Sparkling Wine',
    'Portuguese Sparkling': 'Sparkling Wine',

    # Rosés
    'Rosé': 'Rose',
    'Rosado': 'Rose',
    'Portuguese Rosé': 'Rose',

    # Fortified and Sweet Wines
    'Sherry': 'Sherry',
    'Port': 'Port',
    'Madeira Blend': 'Madeira',
    'Pedro Ximénez': 'Pedro Ximenez',
    'Moscatel de Alejandría': 'Muscat of Alexandria',
    'Tokaji': 'Tokaji',

    # Others and Rare Varieties
    'Roussanne': 'Rhone White Blend',
    'Marsanne': 'Rhone White Blend',
    'Carmenère': 'Carmenere',
    'Albariño': 'Albarino',
    'Gewürztraminer': 'Gewurztraminer',
    'Vermentino': 'Vermentino',
    'Viognier': 'Viognier',
    'Cortese': 'Cortese (Gavi)',
    'Nerello Mascalese': 'Nerello Mascalese',
    'Dolcetto': 'Dolcetto',
    'Cinsault': 'Cinsault',
    'Carignan': 'Carignan',
    'Savagnin': 'Savagnin',
    'Tannat': 'Tannat',
    'Malbec': 'Malbec',
    'Petit Manseng': 'Petit Manseng',
    'Grenache': 'Grenache',
    'Pinotage': 'Pinotage',
    'Negroamaro': 'Negroamaro',
    'Falanghina': 'Falanghina',
    'Vernaccia': 'Vernaccia',
    'Primitivo': 'Zinfandel',
    'Cabernet Franc': 'Cabernet Franc',
    'Cabernet Sauvignon': 'Cabernet Sauvignon',
    'Chardonnay': 'Chardonnay',
    'Merlot': 'Merlot',
    'Sangiovese': 'Sangiovese',
    'Nebbiolo': 'Nebbiolo',
    'Gamay': 'Gamay',
}

def consolidate_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name


df_wine_clean = df_wine.copy()
df_wine_clean['variety'] = df_wine_clean['variety'].apply(lambda x: variety_mapping.get(x, x))
df_wine_clean.rename(columns={'region_1': 'Subregion', 'region_2': 'Region'}, inplace=True)

Next, we need to define the set of geography subregions we will use to define our wines. Not too general, not too specific... just right.

In [44]:
# Check the updated column names
print(df_wine_clean.columns)

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'Subregion', 'Region', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')


In [45]:
order_of_geographies = ['Subregion', 'Region', 'province', 'country']

# Replace NaN and invalid values with 'none'
def replace_nan_for_zero(value):
    if str(value).lower() in ['0', 'nan', 'none']:
        return 'none'
    else:
        return value

for o in order_of_geographies:
    df_wine_clean[o] = df_wine_clean[o].apply(replace_nan_for_zero)

# Verify there are no NaN values
print(df_wine_clean[order_of_geographies].isnull().sum())

Subregion    0
Region       0
province     0
country      0
dtype: int64


In [46]:
print(df_wine_clean.columns)

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'Subregion', 'Region', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')


In [47]:
df_wine_clean['geo_normalized'] = (
    df_wine_clean['Subregion'] + ', ' +
    df_wine_clean['Region'] + ', ' +
    df_wine_clean['province'] + ', ' +
    df_wine_clean['country']
)

In [48]:
# Group by 'Variety', 'Country', 'Province', 'Region', and 'Subregion', and count occurrences
variety_geo = df_wine_clean.groupby(['variety', 'country', 'province', 'Region', 'Subregion']).size().reset_index(name='count')

# Filter for groups where count > 1
variety_geo_sliced = variety_geo.loc[variety_geo['count'] > 1]

# Create a new DataFrame with the relevant columns
vgeos_df = pd.DataFrame(variety_geo_sliced, columns=['variety', 'country', 'province', 'Region', 'Subregion', 'count'])

# Save to CSV
vgeos_df.to_csv('varieties_all_geos.csv', index=False)

# Preview the result
print(vgeos_df.head())

       variety country          province Region            Subregion  count
0     Abouriou  France  Southwest France   none  Côtes du Marmandais      2
4  Agiorgitiko  Greece           Corinth   none                 none      2
5  Agiorgitiko  Greece             Nemea   none                 none     51
7  Agiorgitiko  Greece       Peloponnese   none                 none      7
8    Aglianico   Italy       Italy Other   none        Vino Spumante      2


In [49]:
import os
print(os.getcwd())

/content


In [50]:
print(os.listdir())

['.config', 'drive', 'food_trigrams.pkl', 'food_word2vec_model.bin.syn1neg.npy', 'food_word2vec_model.bin', 'food_word2vec_model.bin.wv.vectors.npy', 'wine_trigrams.pkl', 'varieties_all_geos.csv', 'sample_data']


In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
# Define the path to the wine_food_pairing folder
file_path = '/content/drive/My Drive/Final Project/wine_food_pairing/varieties_all_geos.csv'

# Save the file to the specified folder
vgeos_df.to_csv(file_path, index=False)

print(f"File saved to: {file_path}")

File saved to: /content/drive/My Drive/Final Project/wine_food_pairing/varieties_all_geos.csv


In [53]:
df_wine_merged = pd.merge(
    left=df_wine_clean,
    right=vgeos_df,
    left_on=['variety', 'country', 'province', 'Region', 'Subregion'],
    right_on=['variety', 'country', 'province', 'Region', 'Subregion']
)

# Drop unnecessary columns
columns_to_drop = [
    'Unnamed: 0', 'designation', 'price', 'province', 'Region',
    'Subregion', 'taster_name', 'taster_twitter_handle', 'winery', 'count'
]
df_wine_merged.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Verify the resulting shape
print("Merged DataFrame shape:", df_wine_merged.shape)

Merged DataFrame shape: (126612, 6)


We only want to keep wine types (location + variety) that appear frequently enough in our dataset.

In [54]:
print(df_wine_merged.columns)

Index(['country', 'description', 'points', 'title', 'variety',
       'geo_normalized'],
      dtype='object')


In [55]:
variety_geos = df_wine_merged.groupby(['variety', 'geo_normalized']).size()
at_least_n_types = variety_geos[variety_geos > 30].reset_index()
df_wine_merged_filtered = pd.merge(df_wine_merged, at_least_n_types, left_on=['variety', 'geo_normalized'], right_on=['variety', 'geo_normalized'])
df_wine_merged_filtered = df_wine_merged_filtered[['title', 'variety', 'geo_normalized', 'description']]
print(df_wine_merged_filtered.shape)

(92514, 4)


Now, we will extract 7 vectors for every wine:

- aroma vector (the aggregate of all the aroma descriptors in a wine)
- nonaroma vectors (an aggregate vector for only aroma & non-aroma descriptors matching the core tastes below):
    - sweetness
    - acid
    - salt
    - piquant
    - fat
    - bitter
    
 In our descriptor file, we have defined which normalized descriptors pertain to each nonaroma.

In [56]:
file_path2 = '/content/drive/My Drive/Final Project/wine_food_pairing/descriptor_mapping_tastes.csv'

In [57]:
wine_reviews = list(df_wine_merged_filtered['description'])

descriptor_mapping = pd.read_csv(file_path2, encoding='latin1').set_index('raw descriptor')

core_tastes = ['aroma', 'weight', 'sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']
descriptor_mappings = dict()
for c in core_tastes:
    if c=='aroma':
        descriptor_mapping_filtered=descriptor_mapping.loc[descriptor_mapping['type']=='aroma']
    else:
        descriptor_mapping_filtered=descriptor_mapping.loc[descriptor_mapping['primary taste']==c]
    descriptor_mappings[c] = descriptor_mapping_filtered


def return_descriptor_from_mapping(descriptor_mapping, word, core_taste):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['combined'][word]
        return descriptor_to_return
    else:
        return None

review_descriptors = []
for review in wine_reviews:
    taste_descriptors = []
    normalized_review = normalize_text(review)
    phrased_review = wine_trigram_model[normalized_review]
#     print(phrased_review)

    for c in core_tastes:
        descriptors_only = [return_descriptor_from_mapping(descriptor_mappings[c], word, c) for word in phrased_review]
        no_nones = [str(d).strip() for d in descriptors_only if d is not None]
        descriptorized_review = ' '.join(no_nones)
        taste_descriptors.append(descriptorized_review)
    review_descriptors.append(taste_descriptors)


Now we will take the list of descriptors for each wine and its aroma/nonaroma vectors and compute a TF-IDF weighted embedding for each. We will store the results in a dataframe.

In [58]:
taste_descriptors = []
taste_vectors = []

for n, taste in enumerate(core_tastes):
    print(taste)
    taste_words = [r[n] for r in review_descriptors]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(taste_words)
    dict_of_tfidf_weightings = dict(zip(X.get_feature_names_out(), X.idf_))

    wine_review_descriptors = []
    wine_review_vectors = []

    for d in taste_words:
        descriptor_count = 0
        weighted_review_terms = []
        terms = d.split(' ')
        for term in terms:
            if term in dict_of_tfidf_weightings.keys():
                tfidf_weighting = dict_of_tfidf_weightings[term]
                try:
                    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
                    weighted_word_vector = tfidf_weighting * word_vector
                    weighted_review_terms.append(weighted_word_vector)
                    descriptor_count += 1
                except:
                    continue
            else:
                continue
        try:
            review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
            review_vector = review_vector[0]
        except:
            review_vector = np.nan
#         terms_and_vec = [terms, review_vector]
        wine_review_vectors.append(review_vector)
        wine_review_descriptors.append(terms)

    taste_vectors.append(wine_review_vectors)
    taste_descriptors.append(wine_review_descriptors)


taste_vectors_t = list(map(list, zip(*taste_vectors)))
taste_descriptors_t = list(map(list, zip(*taste_descriptors)))

df_review_vecs = pd.DataFrame(taste_vectors_t, columns=core_tastes)

columns_taste_descriptors = [a + '_descriptors' for a in core_tastes]
df_review_descriptors = pd.DataFrame(taste_descriptors_t, columns=columns_taste_descriptors)

df_wine_vecs = pd.concat([df_wine_merged_filtered, df_review_descriptors, df_review_vecs], axis=1)
df_wine_vecs.head(5)

aroma
weight
sweet
acid
salt
piquant
fat
bitter


Unnamed: 0,title,variety,geo_normalized,description,aroma_descriptors,weight_descriptors,sweet_descriptors,acid_descriptors,salt_descriptors,piquant_descriptors,fat_descriptors,bitter_descriptors,aroma,weight,sweet,acid,salt,piquant,fat,bitter
0,Nicosia 2013 Vulkà Bianco (Etna),White Blend,"Etna, none, Sicily & Sardinia, Italy","Aromas include tropical fruit, broom, brimston...","[tropical_fruit, fruit, herb, apple, citrus, s...",[],"[dry, dry]",[high_acid],[],[],[],[],"[1.5314503, -1.1948165, 1.6763849, -2.3928611,...",,"[-1.0736573, -5.8961477, 2.4550738, -2.3947525...","[1.18786, 0.2316838, 0.5854382, -0.867404, 0.8...",,,,
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,"none, none, Douro, Portugal","This is ripe and fruity, a wine that is smooth...","[ripe, fruit, juicy, berry, fruit]",[],[],[high_acid],[],[],[],"[low_tannin, high_tannin]","[-1.0531403, 0.039513946, 3.635075, -5.4161215...",,,"[1.18786, 0.2316838, 0.5854382, -0.867404, 0.8...",,,,
2,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Grigio,"Willamette Valley, Willamette Valley, Oregon, US","Tart and snappy, the flavors of lime flesh and...","[lime, rind, pineapple, steel]",[],[],"[high_acid, high_acid, high_acid]",[],[],[],[],"[3.389069, -2.2388694, 2.683044, -0.59714675, ...",,,"[1.18786, 0.2316838, 0.5854382, -0.867404, 0.8...",,,,
3,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,"Willamette Valley, Willamette Valley, Oregon, US","Much like the regular bottling from 2012, this...","[earth, herb]",[full_bodied],[],[],[],[],[],"[high_tannin, high_tannin, high_tannin]","[3.1985364, 0.6735065, 3.2442057, -3.170793, 0...","[0.58595484, 2.0585175, 1.2196677, -4.080289, ...",,,,,,
4,Trimbach 2012 Gewurztraminer (Alsace),Gewurztraminer,"Alsace, none, Alsace, France",This dry and restrained wine offers spice in p...,[spice],[],[dry],[],[],[],[],[high_tannin],"[2.3782816, -2.6584692, 0.051082402, -0.109282...",,"[-1.0736573, -5.8961477, 2.4550738, -2.3947525...",,,,,


If we don't have a nonaroma embedding for one of the wines, we will simply take the average nonaroma embedding for all the wines in the dataset.

In [59]:
# pull the average embedding for the wine attribute across all wines.
avg_taste_vecs = dict()
for t in core_tastes:
    # look at the average embedding for a taste, across all wines that have descriptors for that taste
    review_arrays = df_wine_vecs[t].dropna()
    average_taste_vec = np.average(review_arrays)
    avg_taste_vecs[t] = average_taste_vec

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Now, let's find the average embedding for each type of wine (aromas and all nonaromas). We have defined the different types of wines by grape variety and geography, keeping only those with a sufficiently large sample size.

For each variety, we will pull (i) a 300-dimensional aroma vector, and (ii) 7 non-aroma scalars.

In [60]:
print(df_wine_vecs.columns)

Index(['title', 'variety', 'geo_normalized', 'description',
       'aroma_descriptors', 'weight_descriptors', 'sweet_descriptors',
       'acid_descriptors', 'salt_descriptors', 'piquant_descriptors',
       'fat_descriptors', 'bitter_descriptors', 'aroma', 'weight', 'sweet',
       'acid', 'salt', 'piquant', 'fat', 'bitter'],
      dtype='object')


In [61]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
from collections import Counter

# Reset NaN values in taste vectors to ensure clean data processing
core_tastes = ['aroma', 'weight', 'sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']

# Function to replace NaN values with average or fallback vectors
def replace_nan_with_avg(df, core_tastes, avg_vecs):
    for taste in core_tastes:
        valid_vectors = [x for x in df[taste] if isinstance(x, np.ndarray)]
        if valid_vectors:
            avg_vecs[taste] = np.mean(valid_vectors, axis=0)  # Calculate average
        else:
            avg_vecs[taste] = np.zeros(300)  # Fallback to zeros if no valid vectors

        df[taste] = df[taste].apply(lambda x: avg_vecs[taste] if not isinstance(x, np.ndarray) else x)
    return df

# Subset wine vectors for PCA
def subset_wine_vectors(list_of_varieties, wine_attribute):
    wine_variety_vectors = []
    for v in list_of_varieties:
        subset = df_wine_vecs[
            (df_wine_vecs['variety'] == v[0]) & (df_wine_vecs['geo_normalized'] == v[1])
        ]
        if subset.empty:
            continue

        taste_vecs = subset[wine_attribute].tolist()
        taste_vecs = [
            avg_taste_vecs[wine_attribute] if not isinstance(x, np.ndarray) else x for x in taste_vecs
        ]
        average_variety_vec = np.mean(taste_vecs, axis=0)

        descriptor_colname = wine_attribute + '_descriptors'
        all_descriptors = [
            descriptor for descriptors_list in subset[descriptor_colname]
            for descriptor in descriptors_list
        ]
        word_freqs = Counter(all_descriptors)
        most_common_words = word_freqs.most_common(50)
        top_n_words = [(i[0], "{:.2f}".format(i[1] / len(taste_vecs))) for i in most_common_words]
        wine_variety_vector = [v, average_variety_vec, top_n_words]

        wine_variety_vectors.append(wine_variety_vector)

    return wine_variety_vectors

# PCA processing for wine varieties
def pca_wine_variety(list_of_varieties, wine_attribute, pca=True):
    wine_var_vectors = subset_wine_vectors(list_of_varieties, wine_attribute)

    # Ensure valid data before applying PCA
    wine_varieties = [w[0] for w in wine_var_vectors]
    wine_var_vec = np.array([w[1] for w in wine_var_vectors])

    if len(wine_var_vec) == 0 or len(wine_var_vec.shape) < 2:
        print(f"No valid data for {wine_attribute}. Skipping...")
        return pd.DataFrame(), pd.DataFrame()

    if pca:
        pca_model = PCA(n_components=1)
        wine_var_vec = pca_model.fit_transform(wine_var_vec)
        wine_var_vec = pd.DataFrame(wine_var_vec, index=wine_varieties)
    else:
        wine_var_vec = pd.DataFrame(wine_var_vec, index=wine_varieties)

    wine_descriptors = pd.DataFrame([w[2] for w in wine_var_vectors], index=wine_varieties)
    return wine_var_vec, wine_descriptors

# Initialize variables
avg_taste_vecs = {}
normalized_geos = list(set(zip(df_wine_vecs['variety'], df_wine_vecs['geo_normalized'])))

# Replace NaN values with averages
df_wine_vecs = replace_nan_with_avg(df_wine_vecs, core_tastes, avg_taste_vecs)

# Generate taste vectors and descriptors
taste_dataframes = []
aroma_vec, aroma_descriptors = pca_wine_variety(normalized_geos, 'aroma', pca=False)
taste_dataframes.append(aroma_vec)

for taste in core_tastes[1:]:
    print(f"Processing taste: {taste}")
    pca_df, descriptors_df = pca_wine_variety(normalized_geos, taste, pca=True)
    taste_dataframes.append(pca_df)

# Combine taste dataframes
all_taste_data = pd.concat(taste_dataframes, axis=1)

# Rename columns to match core tastes
if all_taste_data.shape[1] == len(core_tastes):
    all_taste_data.columns = core_tastes
else:
    print(f"Column mismatch: Expected {len(core_tastes)} but got {all_taste_data.shape[1]}.")

# Final output
print(all_taste_data.head())

Processing taste: weight
Processing taste: sweet
Processing taste: acid
Processing taste: salt


  self.explained_variance_ratio_ = self.explained_variance_ / total_var


Processing taste: piquant
Processing taste: fat
Processing taste: bitter
Column mismatch: Expected 8 but got 307.
                                                         0         1    \
(Malbec, Valle de Uco, none, Mendoza Province, ...  0.651970  0.341475   
(Glera, Prosecco Superiore di Cartizze, none, V...  3.206893 -2.079213   
(Red Blend, Valencia, none, Levante, Spain)         0.763971  0.447492   
(Rose, Navarra, none, Northern Spain, Spain)        3.036833 -1.838407   
(Sauvignon Blanc, Lake County, none, California...  3.398164 -2.984707   

                                                         2         3    \
(Malbec, Valle de Uco, none, Mendoza Province, ...  4.106539 -4.985038   
(Glera, Prosecco Superiore di Cartizze, none, V...  1.795597 -3.691960   
(Red Blend, Valencia, none, Levante, Spain)         3.892898 -4.701584   
(Rose, Navarra, none, Northern Spain, Spain)        2.928916 -5.142679   
(Sauvignon Blanc, Lake County, none, California...  2.494531 -2.882778 

In [62]:
normalized_geos = list(set(zip(df_wine_vecs['variety'], df_wine_vecs['geo_normalized'])))

def subset_wine_vectors(list_of_varieties, wine_attribute):
    wine_variety_vectors = []
    for v in list_of_varieties:

        one_var_only = df_wine_vecs.loc[(df_wine_vecs['variety'] == v[0]) &
                                                (df_wine_vecs['geo_normalized'] == v[1])]
        if len(list(one_var_only.index)) < 1 or str(v[1][-1]) == '0':
            continue
        else:
            taste_vecs = list(one_var_only[wine_attribute])
            taste_vecs = [avg_taste_vecs[wine_attribute] if 'numpy' not in str(type(x)) else x for x in taste_vecs]
            average_variety_vec = np.average(taste_vecs, axis=0)

            descriptor_colname = wine_attribute + '_descriptors'
            all_descriptors = [i[0] for i in list(one_var_only[descriptor_colname])]
            word_freqs = Counter(all_descriptors)
            most_common_words = word_freqs.most_common(50)
            top_n_words = [(i[0], "{:.2f}".format(i[1]/len(taste_vecs))) for i in most_common_words]
            top_n_words = [i for i in top_n_words if len(i[0])>2]
            wine_variety_vector = [v, average_variety_vec, top_n_words]

            wine_variety_vectors.append(wine_variety_vector)

    return wine_variety_vectors


def pca_wine_variety(list_of_varieties, wine_attribute, pca=True):
    wine_var_vectors = subset_wine_vectors(normalized_geos, wine_attribute)

    wine_varieties = [str(w[0]).replace('(', '').replace(')', '').replace("'", '').replace('"', '') for w in wine_var_vectors]
    wine_var_vec = [w[1] for w in wine_var_vectors]
    if pca:
        pca = PCA(1)
        wine_var_vec = pca.fit_transform(wine_var_vec)
        wine_var_vec = pd.DataFrame(wine_var_vec, index=wine_varieties)
    else:
        wine_var_vec = pd.Series(wine_var_vec, index=wine_varieties)
    wine_var_vec.sort_index(inplace=True)

    wine_descriptors = pd.DataFrame([w[2] for w in wine_var_vectors], index=wine_varieties)
    wine_descriptors = pd.melt(wine_descriptors.reset_index(), id_vars='index')
    wine_descriptors.sort_index(inplace=True)

    return wine_var_vec, wine_descriptors

taste_dataframes = []
# generate the dataframe of aromas vectors as output,
aroma_vec, aroma_descriptors = pca_wine_variety(normalized_geos, 'aroma', pca=False)
taste_dataframes.append(aroma_vec)

# generate the dataframes of nonaroma scalars
for tw in core_tastes[1:]:
    pca_w_dataframe, nonaroma_descriptors = pca_wine_variety(normalized_geos, tw, pca=True)
    taste_dataframes.append(pca_w_dataframe)

# combine all the dataframes created above into one
all_nonaromas = pd.concat(taste_dataframes, axis=1)
all_nonaromas.columns = core_tastes

  self.explained_variance_ratio_ = self.explained_variance_ / total_var
  self.explained_variance_ratio_ = self.explained_variance_ / total_var
  self.explained_variance_ratio_ = self.explained_variance_ / total_var


In [63]:
# save the 50 top descriptors for each wine variety as a CSV file. We will us this later to dig deeper into our proposed wine recommendations.

aroma_descriptors_copy = aroma_descriptors.copy()
aroma_descriptors_copy.set_index('index', inplace=True)
aroma_descriptors_copy.dropna(inplace=True)

aroma_descriptors_copy = pd.DataFrame(aroma_descriptors_copy['value'].tolist(), index=aroma_descriptors_copy.index)
aroma_descriptors_copy.columns = ['descriptors', 'relative_frequency']
aroma_descriptors_copy.to_csv('wine_variety_descriptors.csv')

In [64]:
aroma_descriptors_copy.head()

Unnamed: 0_level_0,descriptors,relative_frequency
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"Malbec, Valle de Uco, none, Mendoza Province, Argentina",ripe,0.17
"Glera, Prosecco Superiore di Cartizze, none, Veneto, Italy",fruit,0.16
"Red Blend, Valencia, none, Levante, Spain",earth,0.13
"Rose, Navarra, none, Northern Spain, Spain",citrus,0.08
"Sauvignon Blanc, Lake County, none, California, US",citrus,0.11


At the moment, it's hard to interpret the nonaroma scalars. To allow for greater interpretability, we will normalize the nonaroma scalars between 0 and 1.

In [65]:
all_nonaromas['bitter'].unique()

array([0.])

In [66]:
def normalize(df, cols_to_normalize):
    for feature_name in cols_to_normalize:
        print(f"Normalizing: {feature_name}")
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()

        # Handle constant columns to avoid division by zero
        if max_value == min_value:
            print(f"Column '{feature_name}' has constant values. Assigning 0 to all entries.")
            df[feature_name] = 0  # Assign 0 to all rows in constant columns
        else:
            # Apply normalization
            df[feature_name] = df[feature_name].apply(lambda x: (x - min_value) / (max_value - min_value))
    return df

# Normalize non-aroma columns
all_nonaromas_normalized = normalize(all_nonaromas, cols_to_normalize=core_tastes[1:])

# Save the normalized data to a CSV file
output_file = 'wine_aromas_nonaromas.csv'
all_nonaromas_normalized.to_csv(output_file, index=False)
print(f"Normalized data saved to {output_file}")

Normalizing: weight
Normalizing: sweet
Normalizing: acid
Normalizing: salt
Column 'salt' has constant values. Assigning 0 to all entries.
Normalizing: piquant
Normalizing: fat
Normalizing: bitter
Column 'bitter' has constant values. Assigning 0 to all entries.
Normalized data saved to wine_aromas_nonaromas.csv


### 3. Preparing our Food Dataset

Now that we have our wine aroma vectors and the nonaroma scalars, we can turn our attention to food.

We will want to generate nonaroma vectors for any type of food that we want a wine pairing with. For food, we don't have the luxury of being able to define nonaroma vs. aroma descriptors, so the approach we take will be slightly different:

The aroma vector will be the full food embedding.

We will define an embedding for each of our core nonaromas (sweet, acid, salt, piquant, fat and bitter), and the weight/body of the food. We will define the maximum distance between each of the nonaroma embeddings and a range of commonly appearing foods. The foods that least and most resemble each nonaroma will eventually allow us to create a normalized scale between 0 (very dissimilar) and 1 (very similar) to say how much a food reflects each nonaroma.

First, let's load this list of common foods.

In [67]:
file_path3 = '/content/drive/My Drive/Final Project/wine_food_pairing/list_of_foods.csv'

In [68]:
foods = pd.read_csv(file_path3)
foods_list = list(foods['Food'])
foods_list_normalized = [normalize_text(f) for f in foods_list]
foods_list_preprocessed = [food_trigram_model[f][0] for f in foods_list_normalized]
foods_list_preprocessed = list(set(foods_list_preprocessed))

Load the word embedding for each food in the list of sample foods, and save to a dictionary.

In [69]:
foods_vecs = dict()

word_vectors = wine_word2vec_model.wv
for f in foods_list_preprocessed:
    try:
        food_vec = word_vectors[f]
        foods_vecs[f] = food_vec
    except:
        continue

Now, we can define the nonaroma embeddings + the weight embedding as the average of foods that represent each nonaroma characteristic.

In [70]:
from scipy import spatial

core_tastes_revised = {'weight': ['heavy', 'cassoulet', 'cassoulet', 'full_bodied', 'thick', 'milk', 'fat', 'mincemeat', 'steak', 'bold', 'pizza', 'pasta', 'creamy', 'bread'],
                       'sweet': ['sweet', 'sugar', 'cake', 'mango', 'stevia'],
                       'acid': ['acid', 'sour', 'vinegar', 'yoghurt', 'cevich', 'cevich'],
                       'salt': ['salty', 'salty', 'parmesan', 'oyster', 'pizza', 'bacon', 'cured_meat', 'sausage', 'potato_chip'],
                       'piquant': ['spicy'],
                       'fat': ['fat', 'fried', 'creamy', 'cassoulet', 'foie_gras', 'buttery', 'cake', 'foie_gras', 'sausage', 'brie', 'carbonara'],
                       'bitter': ['bitter', 'kale']
                      }

average_taste_vecs = dict()
core_tastes_distances = dict()
for taste, keywords in core_tastes_revised.items():

    all_keyword_vecs = []
    for keyword in keywords:
        c_vec = word_vectors[keyword]
        all_keyword_vecs.append(c_vec)

    avg_taste_vec = np.average(all_keyword_vecs, axis=0)
    average_taste_vecs[taste] = avg_taste_vec

    taste_distances = dict()
    for k, v in foods_vecs.items():
        similarity = 1- spatial.distance.cosine(avg_taste_vec, v)
        taste_distances[k] = similarity

    core_tastes_distances[taste] = taste_distances

We can now find out which foods most and least resemble each nonaroma.

In [71]:
food_nonaroma_infos = dict()
# for each core taste, identify the food item that is farthest and closest. We will need this to create a normalized scale between 0 and 1
for key, value in core_tastes_revised.items():
    dict_taste = dict()
    farthest = min(core_tastes_distances[key], key=core_tastes_distances[key].get)
    farthest_distance = core_tastes_distances[key][farthest]
    closest = max(core_tastes_distances[key], key=core_tastes_distances[key].get)
    closest_distance = core_tastes_distances[key][closest]
    print(key, farthest, closest)
    dict_taste['farthest'] = farthest_distance
    dict_taste['closest'] = closest_distance
    dict_taste['average_vec'] = average_taste_vecs[key]
    food_nonaroma_infos[key] = dict_taste

weight kidney pizza
sweet entre honey
acid hungri tart
salt nectar bacon
piquant sole curri
fat pain foie_gras
bitter biscuit kale


Now, let's save the average embedding for each nonaroma, as well as the minimum and maximum distance to each nonaroma embedding - we will use these to scale the nonaroma scalars that we obtain for any foods we try to pair wine with.

In [72]:
df_food_nonaroma_infos = pd.DataFrame(food_nonaroma_infos).T
df_food_nonaroma_infos.to_csv('average_nonaroma_vectors.csv')

We have all the pieces we need to build our wine recommendations. We will continue with this in a separate notebook.

In [73]:
import os

# Check current working directory
print("Current working directory:", os.getcwd())

# List all files in the current directory
print("Files in the current directory:")
print(os.listdir())

Current working directory: /content
Files in the current directory:
['.config', 'wine_variety_descriptors.csv', 'drive', 'food_trigrams.pkl', 'food_word2vec_model.bin.syn1neg.npy', 'wine_aromas_nonaromas.csv', 'food_word2vec_model.bin', 'average_nonaroma_vectors.csv', 'food_word2vec_model.bin.wv.vectors.npy', 'wine_trigrams.pkl', 'varieties_all_geos.csv', 'sample_data']


In [74]:
from google.colab import drive
drive.mount('/content/drive')

# Check files in your Google Drive folder
print("Files in the Drive folder:")
print(os.listdir('/content/drive/My Drive/Final Project/wine_food_pairing'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files in the Drive folder:
['varieties_all_geos_normalized.csv', 'list_of_foods.csv', 'README.md', 'Wine Food Pairings.ipynb', 'requirements.txt', 'descriptor_mapping_tastes.csv', 'descriptor_mapping.csv', 'archive-8', 'wine_trigrams.pkl', 'food_trigrams.pkl', 'food_word2vec_model.bin.wv.vectors.npy', 'food_word2vec_model.bin.syn1neg.npy', 'food_word2vec_model.bin', 'wine_variety_descriptors.csv', 'wine_aromas_nonaromas.csv', 'average_nonaroma_vectors.csv', 'varieties_all_geos.csv', 'Wine Food Pairing Data Prep.ipynb']


In [75]:
import os
import shutil

# Define source and target directories
source_dir = '/content'
target_dir = '/content/drive/My Drive/Final Project/wine_food_pairing'

# Ensure the target directory exists
os.makedirs(target_dir, exist_ok=True)

# List all files and directories in the source directory
files_to_move = [f for f in os.listdir(source_dir) if f != 'drive']

# Move files to the target directory
for file_name in files_to_move:
    source_path = os.path.join(source_dir, file_name)
    target_path = os.path.join(target_dir, file_name)

    try:
        # Move only if it's a file or directory other than 'drive'
        if os.path.isfile(source_path) or os.path.isdir(source_path):
            shutil.move(source_path, target_path)
            print(f"Moved: {file_name}")
    except Exception as e:
        print(f"Error moving {file_name}: {e}")

print("All files have been moved.")

Moved: .config
Moved: wine_variety_descriptors.csv
Moved: food_trigrams.pkl
Moved: food_word2vec_model.bin.syn1neg.npy
Moved: wine_aromas_nonaromas.csv
Moved: food_word2vec_model.bin
Moved: average_nonaroma_vectors.csv
Moved: food_word2vec_model.bin.wv.vectors.npy
Moved: wine_trigrams.pkl
Moved: varieties_all_geos.csv
Moved: sample_data
All files have been moved.


In [77]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import os

# File paths (update these as needed)
file_path_aromas_nonaromas = '/content/drive/My Drive/Final Project/wine_aromas_nonaromas.csv'
file_path_descriptors = '/content/drive/My Drive/Final Project/wine_variety_descriptors.csv'
file_path_nonaroma_vectors = '/content/drive/My Drive/Final Project/average_nonaroma_vectors.csv'
file_path_word2vec_model = '/content/drive/My Drive/Final Project/food_word2vec_model.bin'

# Step 1: Validate `wine_aromas_nonaromas.csv`
print("=== Validating wine_aromas_nonaromas.csv ===")
try:
    wine_aromas_nonaromas = pd.read_csv(file_path_aromas_nonaromas)
    print(wine_aromas_nonaromas.info())
    print(wine_aromas_nonaromas.head())
    # Check if necessary columns exist
    expected_columns = ['aroma', 'weight', 'sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']
    missing_columns = [col for col in expected_columns if col not in wine_aromas_nonaromas.columns]
    if missing_columns:
        print(f"Missing columns: {missing_columns}")
    else:
        print("All expected columns are present.")
except Exception as e:
    print(f"Error reading wine_aromas_nonaromas.csv: {e}")

# Step 2: Validate `wine_variety_descriptors.csv`
print("\n=== Validating wine_variety_descriptors.csv ===")
try:
    wine_descriptors = pd.read_csv(file_path_descriptors)
    print(wine_descriptors.info())
    print(wine_descriptors.head())
    # Check if necessary columns exist
    expected_columns = ['index', 'descriptors', 'relative_frequency']
    missing_columns = [col for col in expected_columns if col not in wine_descriptors.columns]
    if missing_columns:
        print(f"Missing columns: {missing_columns}")
    else:
        print("All expected columns are present.")
except Exception as e:
    print(f"Error reading wine_variety_descriptors.csv: {e}")

# Step 3: Validate `average_nonaroma_vectors.csv`
print("\n=== Validating average_nonaroma_vectors.csv ===")
try:
    average_nonaroma_vectors = pd.read_csv(file_path_nonaroma_vectors)
    print(average_nonaroma_vectors.info())
    print(average_nonaroma_vectors.head())
    # Check if necessary columns exist
    expected_columns = ['farthest', 'closest', 'average_vec']
    missing_columns = [col for col in expected_columns if col not in average_nonaroma_vectors.columns]
    if missing_columns:
        print(f"Missing columns: {missing_columns}")
    else:
        print("All expected columns are present.")
except Exception as e:
    print(f"Error reading average_nonaroma_vectors.csv: {e}")

# Step 4: Validate Word2Vec Model
print("\n=== Validating Word2Vec Model ===")
try:
    if os.path.exists(file_path_word2vec_model):
        word2vec_model = Word2Vec.load(file_path_word2vec_model)
        print(f"Word2Vec Model Loaded Successfully. Vocabulary Size: {len(word2vec_model.wv.key_to_index)}")
        # Check if specific wine/food terms exist in vocabulary
        test_terms = ['wine', 'food', 'acid', 'sweet']
        for term in test_terms:
            if term in word2vec_model.wv.key_to_index:
                print(f"'{term}' exists in the Word2Vec vocabulary.")
            else:
                print(f"'{term}' does NOT exist in the Word2Vec vocabulary.")
    else:
        print(f"Word2Vec model file not found: {file_path_word2vec_model}")
except Exception as e:
    print(f"Error loading Word2Vec model: {e}")

# Step 5: Validate Indices and Data Compatibility
print("\n=== Validating Indices and Data Compatibility ===")

# Ensure `wine_aromas_nonaromas` and `average_nonaroma_vectors` have compatible indices
try:
    if not wine_aromas_nonaromas.index.equals(average_nonaroma_vectors.index):
        print("Indices between `wine_aromas_nonaromas` and `average_nonaroma_vectors` are NOT aligned.")
    else:
        print("Indices between `wine_aromas_nonaromas` and `average_nonaroma_vectors` are aligned.")
except Exception as e:
    print(f"Error comparing indices: {e}")

# Ensure that descriptor frequencies can match with wine variety indices
try:
    if 'index' in wine_descriptors.columns and 'index' in wine_aromas_nonaromas.columns:
        unmatched_indices = set(wine_descriptors['index']) - set(wine_aromas_nonaromas['index'])
        if unmatched_indices:
            print(f"Unmatched indices between descriptors and wine data: {unmatched_indices}")
        else:
            print("All indices match between descriptors and wine data.")
    else:
        print("Index column missing in one or both files.")
except Exception as e:
    print(f"Error validating indices between files: {e}")

=== Validating wine_aromas_nonaromas.csv ===
Error reading wine_aromas_nonaromas.csv: [Errno 2] No such file or directory: '/content/drive/My Drive/Final Project/wine_aromas_nonaromas.csv'

=== Validating wine_variety_descriptors.csv ===
Error reading wine_variety_descriptors.csv: [Errno 2] No such file or directory: '/content/drive/My Drive/Final Project/wine_variety_descriptors.csv'

=== Validating average_nonaroma_vectors.csv ===
Error reading average_nonaroma_vectors.csv: [Errno 2] No such file or directory: '/content/drive/My Drive/Final Project/average_nonaroma_vectors.csv'

=== Validating Word2Vec Model ===
Word2Vec model file not found: /content/drive/My Drive/Final Project/food_word2vec_model.bin

=== Validating Indices and Data Compatibility ===
Error comparing indices: name 'wine_aromas_nonaromas' is not defined
Error validating indices between files: name 'wine_descriptors' is not defined


In [78]:
import os

# Define the target directory
target_dir = '/content/drive/My Drive/Final Project/wine_food_pairing'

# List all files in the directory
print("Files in the directory:")
print(os.listdir(target_dir))

Files in the directory:
['varieties_all_geos_normalized.csv', 'list_of_foods.csv', 'README.md', 'Wine Food Pairings.ipynb', 'requirements.txt', 'descriptor_mapping_tastes.csv', 'descriptor_mapping.csv', 'archive-8', 'wine_trigrams.pkl', 'food_trigrams.pkl', 'food_word2vec_model.bin.wv.vectors.npy', 'food_word2vec_model.bin.syn1neg.npy', 'food_word2vec_model.bin', 'wine_variety_descriptors.csv', 'wine_aromas_nonaromas.csv', 'average_nonaroma_vectors.csv', 'varieties_all_geos.csv', 'Wine Food Pairing Data Prep.ipynb', '.config', 'sample_data']


In [79]:
# File paths
file_path_aromas_nonaromas = '/content/drive/My Drive/Final Project/wine_food_pairing/wine_aromas_nonaromas.csv'
file_path_descriptors = '/content/drive/My Drive/Final Project/wine_food_pairing/wine_variety_descriptors.csv'
file_path_nonaroma_vectors = '/content/drive/My Drive/Final Project/wine_food_pairing/average_nonaroma_vectors.csv'
file_path_word2vec_model = '/content/drive/My Drive/Final Project/wine_food_pairing/food_word2vec_model.bin'

In [80]:
import pandas as pd

# Load and preview the files
df_aromas_nonaromas = pd.read_csv(file_path_aromas_nonaromas)
print("Aromas/Nonaromas File Preview:")
print(df_aromas_nonaromas.head())

df_descriptors = pd.read_csv(file_path_descriptors)
print("\nWine Descriptors File Preview:")
print(df_descriptors.head())

df_nonaroma_vectors = pd.read_csv(file_path_nonaroma_vectors)
print("\nAverage Nonaroma Vectors File Preview:")
print(df_nonaroma_vectors.head())

Aromas/Nonaromas File Preview:
                                               aroma    weight     sweet  \
0  [ 8.83543372e-01  2.03477836e+00  4.00856638e+...  0.652852  0.704191   
1  [ 6.6942054e-01  1.1554404e+00  4.5183077e+00 ...  0.374007  0.823238   
2  [ 1.5175816   0.6431955   3.5210435  -4.243507...  0.400942  0.628509   
3  [ 9.42802966e-01  6.84352636e-01  3.85812044e+...  0.381809  0.678276   
4  [ 1.0192169e+00  6.6217339e-01  4.5112958e+00 ...  0.445992  0.766660   

       acid  salt   piquant       fat  bitter  
0  0.030032     0  0.556245  0.860050       0  
1  0.038633     0  0.699985  0.861359       0  
2  0.034513     0  0.578870  0.779324       0  
3  0.030032     0  0.801150  0.860050       0  
4  0.041743     0  0.649219  0.861077       0  

Wine Descriptors File Preview:
                                               index descriptors  \
0  Malbec, Valle de Uco, none, Mendoza Province, ...        ripe   
1  Glera, Prosecco Superiore di Cartizze, none, V...    

In [81]:
from gensim.models import Word2Vec

# Load the model
word2vec_model = Word2Vec.load(file_path_word2vec_model)
print(f"Model Vocabulary Size: {len(word2vec_model.wv.key_to_index)}")

# Test some words in the vocabulary
test_words = ['wine', 'food', 'sweet', 'acid']
for word in test_words:
    print(f"{word} in vocabulary: {word in word2vec_model.wv.key_to_index}")

Model Vocabulary Size: 36695
wine in vocabulary: True
food in vocabulary: True
sweet in vocabulary: True
acid in vocabulary: True


In [82]:
# Ensure indices between wine_aromas_nonaromas and descriptors match
if 'index' in df_aromas_nonaromas.columns and 'index' in df_descriptors.columns:
    unmatched_indices = set(df_descriptors['index']) - set(df_aromas_nonaromas['index'])
    if unmatched_indices:
        print(f"Unmatched indices: {unmatched_indices}")
    else:
        print("All indices match between wine_aromas_nonaromas and descriptors.")