In [1]:
#Importar las librerias que en principio utilizaria para analizar mi dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid", palette="GnBu_d")

**Carga de datos:** Lectura de CSV en un DataFrame (dado que es un conjunto único de datos).


In [2]:
#Importo el dataset
wine_reviews = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
country_continent = {'Argentina': 4,
'Armenia': 6,
'Australia': 7,
'Austria': 6,
'Bosnia and Herzegovina': 1,
'Brazil': 4,
'Bulgaria': 6,
'Canada': 3,
'Chile': 4,
'China': 1,
'Croatia': 6,
'Cyprus': 6,
'Czech Republic': 6,
'Egypt': 2,
'England': 6,
'France': 6,
'Georgia': 6,
'Germany': 6,
'Greece': 6,
'Hungary': 6,
'India': 1,
'Israel': 6,
'Italy': 6,
'Lebanon': 1,
'Luxembourg': 6,
'Macedonia': 6,
'Mexico': 3,
'Moldova': 6,
'Morocco': 2,
'New Zealand': 7,
'Peru': 4,
'Portugal': 6,
'Romania': 6,
'Serbia': 6,
'Slovakia': 6,
'Slovenia': 6,
'South Africa': 2,
'Spain': 6,
'Switzerland': 6,
'Turkey': 1,
'US': 3,
'Ukraine': 6,
'Uruguay': 4,
}

In [4]:
#tamaño del dataset
wine_reviews.shape

(129971, 14)

In [5]:
#primeras instancias de mi dataset
wine_reviews.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [6]:
q1 = wine_reviews['price'].quantile(0.25)
q3 = wine_reviews['price'].quantile(0.75)
iqr = q3 -q1

print(q1, q3)
wine_reviews = wine_reviews[(wine_reviews['price'] > (1.5* iqr - q1)) & (wine_reviews['price'] < (1.5*iqr + q3))]

17.0 42.0


In [7]:
wine_reviews.price.describe()

count    67393.000000
mean        38.769323
std         14.282054
min         21.000000
25%         27.000000
50%         35.000000
75%         48.000000
max         79.000000
Name: price, dtype: float64

In [8]:
wine_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67393 entries, 4 to 129970
Data columns (total 14 columns):
Unnamed: 0               67393 non-null int64
country                  67353 non-null object
description              67393 non-null object
designation              51030 non-null object
points                   67393 non-null int64
price                    67393 non-null float64
province                 67353 non-null object
region_1                 59656 non-null object
region_2                 34454 non-null object
taster_name              52512 non-null object
taster_twitter_handle    49990 non-null object
title                    67393 non-null object
variety                  67393 non-null object
winery                   67393 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 7.7+ MB


- **continent:** Indicar con categorías numéricas el continente de dónde proviene el vino (usando la convención mencionada [aquí](https://en.wikipedia.org/wiki/Continent)).

In [9]:
wine_reviews['continent'] = wine_reviews['country'].map(country_continent)
wine_reviews.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,continent
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,3.0
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,6.0


- **is_sweet:** Columna booleana indicando si la descripción del vino contiene o no la palabra *sweet* (quizás con la ayuda de una [función útil en Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html))

In [10]:
is_sweet = wine_reviews['description'].str.contains('sweet', regex=False)
wine_reviews['is_sweet'] = is_sweet

In [11]:
wine_reviews.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,continent,is_sweet
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,3.0,False
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,6.0,False


- **price_4levels:** Columna categórica indicando el precio en 4 tipos: "LOW" (precio menor o igual a \\$15), "INTERMEDIATE" (precio entre \\$15 y \\$40), "HIGH" (precio entre \\$40 y \\$70) y "VERY HIGH" (precio mayor a \\$70).

In [12]:
def price_4levels(x):
    if x > 70:
        res = "VERY HIGH"
    elif (x < 70) & (x >= 40):
        res = "HIGH"
    elif (x < 40) & (x >= 15):
        res = "INTERMEDIATE"
    elif (x < 15) & (x >= 0):
        res = "LOW"
    else:
        res = "NOT DEFINED"
    return res    
        
wine_reviews['price_4levels'] = wine_reviews.apply(lambda x: price_4levels(x['price']), axis = 1)

In [13]:
wine_reviews.head(4)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,continent,is_sweet,price_4levels
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,3.0,False,HIGH
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,6.0,False,INTERMEDIATE
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,6.0,False,INTERMEDIATE
11,11,France,"This is a dry wine, very spicy, with a tight, ...",,87,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Leon Beyer 2012 Gewurztraminer (Alsace),Gewürztraminer,Leon Beyer,6.0,False,INTERMEDIATE


- **was_master_taster**: Queremos una columna booleana para saber si la revisión del vino analizado fue provista por uno de los "4 maestros del vino": "Roger Voss", "Michael Schachner", "Paul Gregutt", "Virginie Boone".

In [14]:
def was_master_taster(x):
    if x == "Roger Voss":
        res = True
    elif x == "Michael Schachner":
        res = True
    elif x == "Paul Gregutt":
        res = True
    elif x == "Virginie Boone":
        res = True
    else:
        res = False
    return res    
        
wine_reviews['was_master_taster'] = wine_reviews.apply(lambda x: was_master_taster(x['taster_name']), axis = 1)

In [15]:
wine_reviews.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,continent,is_sweet,price_4levels,was_master_taster
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,3.0,False,HIGH,True
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,6.0,False,INTERMEDIATE,True
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,6.0,False,INTERMEDIATE,True


- **century_title:** Asumiendo que el título del vino frecuentemente contiene el año del mismo, se debe proveer una columna booleana indicando si fue producido el siglo pasado (1900-1999) o el actual (2000-2019). Quizás puede servir la [siguiente función de Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html) mediante esta regex: `r'.*(\d{4}).*'`.

In [None]:
def century_title(x):
    if (x >) :
        res = True
    elif x == "Michael Schachner":
        res = True
    elif x == "Paul Gregutt":
        res = True
    elif x == "Virginie Boone":
        res = True
    else:
        res = False
    return res    
        
wine_reviews['was_master_taster'] = wine_reviews.apply(lambda x: was_master_taster(x['taster_name']), axis = 1)