In [23]:
import json
import os

## Load the data

In [24]:
# Load every wine record into memory. JSON text is UTF-8 by specification, so
# the encoding is stated explicitly instead of relying on the platform default
# (which is locale-dependent and not UTF-8 on every system).
with open("../../../data.json", encoding="utf-8") as f:
    wines = json.load(f)

print(f"Found {len(wines)} wines.")

Found 78900 wines.


In [25]:
# Write the records exactly as loaded to a separate file before any field is
# removed from them.
raw_copy_path = "../../../data_raw.json"

print("Save Raw data...")
with open(raw_copy_path, "w") as raw_file:
    json.dump(wines, raw_file)
print("Raw data saved!")

Save Raw data...
Raw data saved!


## Remove useless information

* Removed `prices` column because it is the same as `price`.
* Removed `seo_name` as we won't use it.
* Removed `background_image` as we won't use it.
* Removed `status` as we won't use it.
* Removed `image` as we won't use it.
* Removed `grapes` as it is consistently empty.
* Removed `has_valid_ratings` as we can deduce it from `statistics`.
* Removed `style` as we won't use it.
* Removed `vintage_type` as I suspect it to be a duplicate of `vintage`.

In [26]:
# Display the first record to see which fields a wine entry carries.
wines[0]

{'vintage': {'id': 8519046,
  'seo_name': 'franco-biondi-santi-brunello-di-montalcino-riserva-1955',
  'name': 'Biondi-Santi Brunello di Montalcino Riserva 1955',
  'statistics': {'status': 'Normal',
   'ratings_count': 35,
   'ratings_average': 4.9,
   'labels_count': 129,
   'wine_ratings_count': 3375,
   'wine_ratings_average': 4.6,
   'wine_status': 'Normal'},
  'image': {'location': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pl_480x640.png',
   'variations': {'bottle_large': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pb_x960.png',
    'bottle_medium': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pb_x600.png',
    'bottle_medium_square': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pb_600x600.png',
    'bottle_small': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pb_x300.png',
    'bottle_small_square': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pb_300x300.png',
    'label': '//images.vivino.com/thumbs/C62n7YRySmOweMGqGhHUwA_pl_480x640.png',

In [27]:
def _drop_keys(mapping, required=(), optional=()):
    """Remove keys from ``mapping`` in place.

    Args:
        mapping: Dictionary to prune.
        required: Keys that must be present; a missing one raises ``KeyError``
            so that an unexpected record layout is noticed.
        optional: Keys that are removed only when present.

    Raises:
        KeyError: If a key listed in ``required`` is absent.
    """
    for key in required:
        del mapping[key]
    for key in optional:
        mapping.pop(key, None)


def clean_wine(wine):
    """Strip the unused fields from a single wine record, in place.

    Besides deleting fields, the price URL is copied onto the nested wine
    entry, the bottle volume is flattened to ``bottle_volume_ml`` and
    ``amount`` is renamed to ``amount_in_euros``. A record that has already
    been processed is left untouched, so running the cell a second time does
    not fail on keys that are already gone.

    Args:
        wine: One element of ``wines`` (a nested dictionary).

    Raises:
        KeyError: If a field expected on every record is missing.
    """
    price = wine['price']
    # Renaming `amount` is the very last step below, so this pair of checks
    # identifies a record that was fully cleaned by an earlier run.
    if 'amount_in_euros' in price and 'amount' not in price:
        return

    # Removed only if present.
    wine.pop('prices', None)

    # Vintage
    vintage = wine['vintage']
    _drop_keys(vintage, ('seo_name', 'image', 'grapes', 'has_valid_ratings'))

    # Wine
    wine_info = vintage['wine']
    _drop_keys(
        wine_info,
        ('seo_name', 'type_id', 'has_valid_ratings', 'style', 'vintage_type'),
    )

    # Region: a region does not always have a `class` entry.
    region = wine_info['region']
    _drop_keys(
        region,
        ('seo_name', 'name_en', 'background_image'),
        optional=('class',),
    )

    # Country
    country = region['country']
    _drop_keys(country, ('native_name', 'seo_name', 'currency'))
    for grape in country['most_used_grapes']:
        _drop_keys(grape, ('seo_name', 'has_detailed_info'))

    # Winery
    _drop_keys(wine_info['winery'], ('seo_name', 'background_image', 'status'))

    # Top list rankings: the key may be missing or empty.
    for rank in vintage.get('top_list_rankings') or ():
        del rank['description']
        # `year` is empty most of the time.
        _drop_keys(rank['top_list'], ('seo_name', 'type', 'year'))

    # Price: keep the URL on the wine entry and flatten the bottle volume
    # before the fields they come from are deleted.
    wine_info['url'] = price['url']
    price['bottle_volume_ml'] = price['bottle_type']['volume_ml']
    _drop_keys(
        price,
        ('bottle_type', 'currency', 'id', 'type', 'sku', 'url',
         'visibility', 'bottle_type_id'),
    )
    # Must stay last: it is what marks the record as cleaned (see the check
    # at the top of this function).
    price['amount_in_euros'] = price.pop('amount')


# `wines` is modified in place; the cells that follow read the cleaned list.
for wine in wines:
    clean_wine(wine)


In [28]:
# Serialise the current content of `wines` to its own JSON file.
cleaned_path = "../../../data_cleaned.json"

print("all done! Saving results...")
with open(cleaned_path, "w") as cleaned_file:
    json.dump(wines, cleaned_file)
print("results saved!")

all done! Saving results...
results saved!


In [29]:
def _size_in_mib(path):
    """Return the size of the file at ``path`` in whole mebibytes, rounded down."""
    return os.stat(path).st_size // (1024 * 1024)


# The byte count is divided by 1024 * 1024, so the figures are mebibytes and
# are labelled "MiB" ("Mb" would denote megabits).
print(f"Before cleaning: {_size_in_mib('../../../data.json')} MiB")
print(f"After cleaning: {_size_in_mib('../../../data_cleaned.json')} MiB")

Before cleaning: 1098 Mb
After cleaning 572 Mb


In [30]:
# Write only the leading records of `wines` to a separate, much smaller file.
sample_size = 100
sample_path = "../../../data_sample_cleaned.json"

print("Saving sample...")
with open(sample_path, "w") as sample_file:
    json.dump(wines[:sample_size], sample_file)
print("Sample saved!")

Saving sample...
Sample saved!
