In [3]:
import json
import os

In [6]:
# Read the sample file back into `wines`.
# NOTE(review): this cell reads data_cleaned_sample.json, which is only written
# by the last cell of this notebook, so it looks like a leftover scratch cell
# that cannot run first on a fresh checkout. Confirm whether it is still needed.
with open("../../../data_cleaned_sample.json") as sample_in:
    wines = json.load(sample_in)

# Write a one-record extract of that sample to a second file.
print("Saving sample...")
with open("../../../data_cleaned_sample_2.json", "w") as sample_out:
    json.dump(wines[:1], sample_out)
print("Sample saved!")

# Report how many records the sample itself held.
print(f"Found {len(wines)} wines.")

Saving sample...
Sample saved!
Found 10 wines.


## Load the data

In [2]:
# Parse the whole raw export into memory; every later cell works on `wines`.
with open("../../../data.json") as raw_file:
    wines = json.load(raw_file)

# Record count, as a quick sanity check on the load.
print(f"Found {len(wines)} wines.")

Found 77011 wines.


## Remove useless information

* Removed `prices` column because it is the same as `price`.
* Removed `seo_name` as we won't use it.
* Removed `background_image` as we won't use it.
* Removed `status` as we won't use it.
* Removed `image` as we won't use it.
* Removed `grapes` as it is consistently empty.
* Removed `has_valid_ratings` as we can deduce it from `statistics`.
* Removed `style` as we won't use it.
* Removed `vintage_type` as I suspect it to be a duplicate of `vintage`.

In [4]:
def _drop_keys(record, keys):
    """Delete each of `keys` from the dict `record`; absent keys are ignored."""
    for key in keys:
        # A default of None turns pop() into a no-op when the key is missing.
        record.pop(key, None)


def _move_key(source, source_key, target, target_key):
    """Move `source[source_key]` to `target[target_key]`.

    When the source key is absent, the target key is only initialised to None
    if it does not exist yet. This keeps the cell safe to re-run: a second pass
    no longer overwrites already-moved values with None.
    """
    if source_key in source:
        target[target_key] = source.pop(source_key)
    else:
        target.setdefault(target_key, None)


def _clean_price(wine):
    """Flatten `wine['price']` in place and drop the price fields we don't use.

    * `price.url` is moved to `wine['vintage']['wine']['url']`.
    * `price.bottle_type.volume_ml` becomes `price.bottle_volume_ml`.
    * `price.amount` is renamed to `price.amount_in_euros`.

    A record without a usable `price` dict is left alone (apart from making
    sure the wine has a `url` key) instead of raising KeyError.
    """
    wine_info = wine['vintage']['wine']
    price = wine.get('price')
    if not isinstance(price, dict):
        wine_info.setdefault('url', None)
        return

    _move_key(price, 'url', wine_info, 'url')

    if 'bottle_type' in price:
        # `or {}` also covers a bottle_type that is present but null.
        bottle_type = price.pop('bottle_type') or {}
        price['bottle_volume_ml'] = bottle_type.get('volume_ml')
    else:
        price.setdefault('bottle_volume_ml', None)

    _move_key(price, 'amount', price, 'amount_in_euros')
    _drop_keys(price, ('currency', 'id', 'type', 'sku', 'visibility', 'bottle_type_id'))


def _clean_wine(wine):
    """Strip the unused fields from one scraped record, mutating it in place."""
    _drop_keys(wine, ('prices',))

    # Vintage
    vintage = wine['vintage']
    _drop_keys(vintage, ('seo_name', 'image', 'grapes', 'has_valid_ratings'))

    # Wine
    wine_info = vintage['wine']
    _drop_keys(
        wine_info,
        ('seo_name', 'type_id', 'has_valid_ratings', 'style', 'vintage_type'),
    )

    # Region (missing or empty on some records, hence the guard)
    region = wine_info.get('region')
    if region:
        # Per the original author's note, a region may have no `class` key.
        _drop_keys(region, ('seo_name', 'name_en', 'class', 'background_image'))
        # Country
        country = region['country']
        _drop_keys(country, ('native_name', 'seo_name', 'currency'))
        for grape in country['most_used_grapes']:
            _drop_keys(grape, ('seo_name', 'has_detailed_info'))

    # Winery
    _drop_keys(wine_info['winery'], ('seo_name', 'background_image', 'status'))

    # Price
    _clean_price(wine)

    # Top list rankings (missing or empty on some records)
    for rank in vintage.get('top_list_rankings') or ():
        _drop_keys(rank, ('description',))
        # `year` is empty most of the time, so it is dropped as well.
        _drop_keys(rank['top_list'], ('seo_name', 'type', 'year'))


# Clean every record in place.
for wine in wines:
    _clean_wine(wine)

# Persist the cleaned records to disk.
with open("../../../data_cleaned.json", "w") as cleaned_file:
    json.dump(wines, cleaned_file)
print("Cleaned data saved!")

# Compare on-disk sizes, in whole units of 1024 * 1024 bytes.
# Dropping the unused fields shrinks the file by roughly a factor of two.
bytes_per_mb = 1024 * 1024
raw_size_mb = os.path.getsize('../../../data.json') // bytes_per_mb
print(f"Before cleaning: {raw_size_mb} Mb")
cleaned_size_mb = os.path.getsize('../../../data_cleaned.json') // bytes_per_mb
print(f"After cleaning {cleaned_size_mb} Mb")

Cleaned data saved!
Before cleaning: 924 Mb
After cleaning 415 Mb


## Remove duplicates
During the scraping process, I try to minimize the number of duplicates as much as I can by dividing the price ranges. There is a hard limit on your requests that makes their API loop when you are iterating over too many pages.
*(See comments in price_range_extractor.py)*

Still, a few duplicates can end up in the dataset. The following code removes them. To make sure they really are duplicates, I check that they have the same `vintage` **and** `wine` ID.

In [5]:
# Keep the first occurrence of every (wine, vintage) pair and drop the rest.
unique_ids = []               # keys in first-seen order (same content as before)
seen_ids = set()              # same keys, for O(1) membership tests
wine_without_duplicates = []

for wine in wines:
    wine_id = wine['vintage']['wine']['id']
    vintage_id = wine['vintage']['id']
    # Create a unique ID for each combo wine+vintage
    unique_id = f"{wine_id}-{vintage_id}"

    # Testing against the set instead of the list avoids a linear scan per
    # record, which made the original loop quadratic in the number of wines.
    if unique_id in seen_ids:
        continue
    seen_ids.add(unique_id)
    unique_ids.append(unique_id)
    wine_without_duplicates.append(wine)

# Every record was either kept or skipped, so the counts follow from the lengths.
non_duplicate = len(wine_without_duplicates)
duplicate = len(wines) - non_duplicate

print(f"Found {non_duplicate} non duplicates")
print(f"Found {duplicate} duplicates")

# Overwrite the cleaned file with the de-duplicated records.
print("Saving without duplicates...")
with open("../../../data_cleaned.json", "w") as f:
    json.dump(wine_without_duplicates, f)
print("without duplicate saved!")

Found 74969 non duplicates
Found 2042 duplicates
Saving without duplicates...
without duplicate saved!


## Creating sample data
The final JSON file can be quite heavy, which makes it hard to inspect the data manually, even though doing so is useful, for example to analyse the JSON schema.

To allow you to quickly inspect the data, I create a sample file that contains only the first elements of the list.

In [6]:
# Write the first 10 records of the final dataset as a small sample file.
# The sample is sliced from `wine_without_duplicates`, the list that was saved
# to data_cleaned.json, so it matches the head of that file. Slicing `wines`
# could include a record that the de-duplication step removed.
print("Saving sample...")
with open("../../../data_cleaned_sample.json", "w") as f:
    json.dump(wine_without_duplicates[:10], f)
print("Sample saved!")

Saving sample...
Sample saved!
