# Data cleaning and preparation

## Import libraries and load data

In [133]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

import re

%matplotlib inline

wine = pd.read_csv('data/winemag-data-130k-v2.csv')

## Functions

In [140]:
def headers_to_lowercase (df):
    df.columns = df.columns.str.lower()
    return df

def text_to_lowercase (df):
    df = df.applymap(lambda s: s.lower() if type(s) == str else s)
    return df   

def drop_columns (df, list_of_columns_to_drop):    
    df.drop(columns = list_of_columns_to_drop, inplace=True)
    return df

#def drop_rows_with_nans (df, country_drop_list):
 #   df = df[~df['country'].isin(country_drop_list)]
  #  return df

def change_col_names (df, column_name_dict):
    df.rename(columns=column_name_dict, inplace= True)
    return df

def replace_nans_in_price_with_winery_median (df):
    df['price_in_usd'] = df['price_in_usd'].fillna(df.groupby('winery')['price_in_usd'].transform('median'))
    return df

def drop_na(df):
    df.dropna()
    return df

def change_float_to_int(df):
    df['price_in_usd']=df['price_in_usd'].astype(int)
    return df

## Variables for data cleaning

In [141]:
list_of_columns_to_drop = ['unnamed: 0','region_2','taster_name','taster_twitter_handle']

column_name_dict = {'designation':'vineyard','price':'price_in_usd','region_1':'region','title':'wine_name'}

In [143]:
# add function dropping rows with NaNs


wine = (wine
.pipe(headers_to_lowercase)
.pipe(text_to_lowercase)
.pipe(drop_columns, list_of_columns_to_drop)
.pipe(change_col_names, column_name_dict)
.pipe(replace_nans_in_price_with_winery_median)
.pipe(drop_na)
)

#.pipe(change_float_to_int)

In [145]:
wine.isna().sum()

country            63
description         0
vineyard        37465
points              0
price_in_usd     1686
province           63
region          21247
wine_name           0
variety             1
winery              0
dtype: int64

In [74]:
wine.drop(columns=['Unnamed: 0','region_2'], inplace=True)

In [75]:
wine

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [77]:
wine.drop(columns=['taster_name','taster_twitter_handle'], inplace=True)

In [79]:
wine.to_csv('data/wine_cleaned_dataframe.csv', index=False)

In [85]:
wine.drop(wine[wine['designation'].isna()].index, inplace=True)

In [90]:
wine.drop(wine[wine['region_1'].isna()].index, inplace=True)

In [93]:
wine

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
...,...,...,...,...,...,...,...,...,...,...
129962,Italy,"Blackberry, cassis, grilled herb and toasted a...",Sàgana Tenuta San Giacomo,90,40.0,Sicily & Sardinia,Sicilia,Cusumano 2012 Sàgana Tenuta San Giacomo Nero d...,Nero d'Avola,Cusumano
129964,France,"Initially quite muted, this wine slowly develo...",Domaine Saint-Rémy Herrenweg,90,,Alsace,Alsace,Domaine Ehrhart 2013 Domaine Saint-Rémy Herren...,Gewürztraminer,Domaine Ehrhart
129965,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,90,28.0,Alsace,Alsace,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,Pinot Gris,Domaine Rieflé-Landmann
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser


In [92]:
wine.isna().sum()

country           0
description       0
designation       0
points            0
price          4861
province          0
region_1          0
title             0
variety           0
winery            0
dtype: int64

In [107]:
wine[wine['price'].isna()]['variety'].value_counts()

Bordeaux-style Red Blend     730
Chardonnay                   446
Nebbiolo                     394
Red Blend                    347
Pinot Noir                   312
                            ... 
Prié Blanc                     1
Picolit                        1
Colombard-Sauvignon Blanc      1
Roviello                       1
Gamay Noir                     1
Name: variety, Length: 172, dtype: int64

In [110]:
wine['price'] = wine['price'].fillna(wine.groupby('winery')['price'].transform('median'))

In [111]:
wine.isna().sum()

country           0
description       0
designation       0
points            0
price          1298
province          0
region_1          0
title             0
variety           0
winery            0
dtype: int64

In [112]:
wine

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,17.0,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
...,...,...,...,...,...,...,...,...,...,...
129962,Italy,"Blackberry, cassis, grilled herb and toasted a...",Sàgana Tenuta San Giacomo,90,40.0,Sicily & Sardinia,Sicilia,Cusumano 2012 Sàgana Tenuta San Giacomo Nero d...,Nero d'Avola,Cusumano
129964,France,"Initially quite muted, this wine slowly develo...",Domaine Saint-Rémy Herrenweg,90,24.0,Alsace,Alsace,Domaine Ehrhart 2013 Domaine Saint-Rémy Herren...,Gewürztraminer,Domaine Ehrhart
129965,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,90,28.0,Alsace,Alsace,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,Pinot Gris,Domaine Rieflé-Landmann
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser


In [113]:
wine.drop(wine[wine['price'].isna()].index, inplace=True)

In [115]:
wine.isna().sum()

country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
title          0
variety        0
winery         0
dtype: int64

In [129]:
wine = headers_to_lowercase(wine)

In [128]:
wine = text_to_lowercase(wine)

In [121]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73738 entries, 0 to 129970
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      73738 non-null  object 
 1   description  73738 non-null  object 
 2   designation  73738 non-null  object 
 3   points       73738 non-null  int64  
 4   price        73738 non-null  float64
 5   province     73738 non-null  object 
 6   region_1     73738 non-null  object 
 7   title        73738 non-null  object 
 8   variety      73738 non-null  object 
 9   winery       73738 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 6.2+ MB


In [122]:
wine['price']=wine['price'].astype(int)

In [123]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73738 entries, 0 to 129970
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      73738 non-null  object
 1   description  73738 non-null  object
 2   designation  73738 non-null  object
 3   points       73738 non-null  int64 
 4   price        73738 non-null  int64 
 5   province     73738 non-null  object
 6   region_1     73738 non-null  object
 7   title        73738 non-null  object
 8   variety      73738 non-null  object
 9   winery       73738 non-null  object
dtypes: int64(2), object(8)
memory usage: 6.2+ MB


In [125]:
column_name_dict = {'designation':'vineyard','price':'price_in_usd','region_1':'region','title':'wine_name'}

def change_col_names (df, column_name_dict):
    df.rename(columns=column_name_dict, inplace= True)
    return df

In [127]:
wine = change_col_names(wine,column_name_dict)

In [131]:
wine.columns

Index(['country', 'description', 'vinyard', 'points', 'price', 'province',
       'region', 'wine_name', 'variety', 'winery'],
      dtype='object')