# Data cleaning and preparation

## Import libraries and load data

In [195]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

import re

%matplotlib inline

# Load the raw wine review data from the CSV file (path is relative to the notebook).
wine = pd.read_csv('data/winemag-data-130k-v2.csv')

In [196]:
# Count the missing values in each column of the raw data.
wine.isna().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [197]:
# Summarise the raw frame: column names, non-null counts and dtypes.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


## Functions

In [198]:
def headers_to_lowercase(df):
    """Return a copy of ``df`` whose column labels are lower-cased.

    The input frame is left untouched, so the raw data stays usable if a
    later cleaning step fails or the cell is re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        New frame with the same data and lower-cased column labels.
    """
    relabelled = df.copy()
    relabelled.columns = relabelled.columns.str.lower()
    return relabelled

def text_to_lowercase(df):
    """Return a new frame in which every string cell is lower-cased.

    Non-string cells (numbers, NaN/None, ...) are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may mix text and non-text columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels; ``df`` is not modified.
    """
    def _lower_if_str(value):
        # isinstance also covers str subclasses, unlike ``type(value) == str``.
        return value.lower() if isinstance(value, str) else value

    # Map column by column with Series.map instead of DataFrame.applymap,
    # which is deprecated since pandas 2.1.
    return df.apply(lambda column: column.map(_lower_if_str))

def drop_columns(df, list_of_columns_to_drop):
    """Return a copy of ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    list_of_columns_to_drop : list of str
        Column labels to remove. A label that is not present raises
        ``KeyError`` (pandas' default for ``DataFrame.drop``).

    Returns
    -------
    pandas.DataFrame
        New frame containing only the remaining columns.
    """
    # No inplace=True: returning a new frame keeps the input reusable.
    return df.drop(columns=list_of_columns_to_drop)

def change_col_names(df, column_name_dict):
    """Return a copy of ``df`` with columns renamed per ``column_name_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column_name_dict : dict
        Mapping of existing column label -> new column label. Labels not
        listed in the mapping are kept as they are.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    # No inplace=True: returning a new frame keeps the input reusable.
    return df.rename(columns=column_name_dict)

def replace_nans_in_price_with_winery_median(df):
    """Fill missing ``price_in_usd`` values with the median price of the same winery.

    Rows whose winery has no known price at all keep a missing price,
    because the median of an all-NaN group is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``winery`` and ``price_in_usd`` columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the filled ``price_in_usd`` column.
    """
    # transform('median') yields one value per row, aligned to df's index,
    # so it can be used directly as the fill values.
    winery_median = df.groupby('winery')['price_in_usd'].transform('median')
    return df.assign(price_in_usd=df['price_in_usd'].fillna(winery_median))

def drop_rows_with_nans(df, col_list_for_row_drop):
    """Return ``df`` without the rows that have a missing value in any listed column.

    Rows are selected by position rather than by index label, so a frame
    with duplicate index labels only loses the rows that actually contain
    a missing value. The original index labels of the kept rows are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col_list_for_row_drop : list of str
        Columns that must be non-missing for a row to be kept. A label that
        is not a column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the complete rows.
    """
    required_columns = list(col_list_for_row_drop)
    if not required_columns:
        # Nothing to check: keep every row, but still hand back a new frame.
        return df.copy()
    return df.dropna(subset=required_columns)

def change_float_to_int(df):
    """Return a copy of ``df`` with ``price_in_usd`` converted to integers.

    The conversion uses ``astype(int)``, which truncates the fractional part
    (e.g. 13.9 -> 13) and raises if the column still contains missing values,
    so missing prices must be filled or dropped first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``price_in_usd`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with an integer ``price_in_usd`` column.
    """
    return df.assign(price_in_usd=df['price_in_usd'].astype(int))

## Variables for data cleaning

In [199]:
# Rows with a missing value in any of these columns are dropped.
# The names are the ones the columns carry after renaming.
col_list_for_row_drop = [
    'vineyard',
    'region',
    'price_in_usd',
]

# Columns removed from the frame (names as they are after lower-casing the headers).
list_of_columns_to_drop = [
    'unnamed: 0',
    'region_2',
    'taster_name',
    'taster_twitter_handle',
]

# Old column name -> new column name.
column_name_dict = {
    'designation': 'vineyard',
    'price': 'price_in_usd',
    'region_1': 'region',
    'title': 'wine_name',
}

## Cleaning pipeline

In [200]:
# Run the cleaning steps in order. The order matters: the drop and rename
# lists use lower-cased header names, and the price / NaN / int steps use the
# renamed columns ('price_in_usd', 'vineyard', 'region').
# NOTE: this rebinds `wine` to the cleaned frame, so the cell expects the raw
# frame as input; reload the CSV before re-running it, otherwise the column
# drop step no longer finds its columns.
wine = (wine
.pipe(headers_to_lowercase)
.pipe(text_to_lowercase)
.pipe(drop_columns, list_of_columns_to_drop)
.pipe(change_col_names, column_name_dict)
.pipe(replace_nans_in_price_with_winery_median)
.pipe(drop_rows_with_nans, col_list_for_row_drop)
.pipe(change_float_to_int)
)

In [201]:
# Re-count missing values per column after cleaning (every count should be 0).
wine.isna().sum()

country         0
description     0
vineyard        0
points          0
price_in_usd    0
province        0
region          0
wine_name       0
variety         0
winery          0
dtype: int64

In [202]:
# Display the cleaned frame (the notebook renders a truncated preview).
wine

Unnamed: 0,country,description,vineyard,points,price_in_usd,province,region,wine_name,variety,winery
0,italy,"aromas include tropical fruit, broom, brimston...",vulkà bianco,87,17,sicily & sardinia,etna,nicosia 2013 vulkà bianco (etna),white blend,nicosia
3,us,"pineapple rind, lemon pith and orange blossom ...",reserve late harvest,87,13,michigan,lake michigan shore,st. julian 2013 reserve late harvest riesling ...,riesling,st. julian
4,us,"much like the regular bottling from 2012, this...",vintner's reserve wild child block,87,65,oregon,willamette valley,sweet cheeks 2012 vintner's reserve wild child...,pinot noir,sweet cheeks
5,spain,blackberry and raspberry aromas show a typical...,ars in vitro,87,15,northern spain,navarra,tandem 2011 ars in vitro tempranillo-merlot (n...,tempranillo-merlot,tandem
6,italy,"here's a bright, informal red that opens with ...",belsito,87,16,sicily & sardinia,vittoria,terre di giurfo 2013 belsito frappato (vittoria),frappato,terre di giurfo
...,...,...,...,...,...,...,...,...,...,...
129962,italy,"blackberry, cassis, grilled herb and toasted a...",sàgana tenuta san giacomo,90,40,sicily & sardinia,sicilia,cusumano 2012 sàgana tenuta san giacomo nero d...,nero d'avola,cusumano
129964,france,"initially quite muted, this wine slowly develo...",domaine saint-rémy herrenweg,90,24,alsace,alsace,domaine ehrhart 2013 domaine saint-rémy herren...,gewürztraminer,domaine ehrhart
129965,france,"while it's rich, this beautiful dry wine also ...",seppi landmann vallée noble,90,28,alsace,alsace,domaine rieflé-landmann 2013 seppi landmann va...,pinot gris,domaine rieflé-landmann
129968,france,well-drained gravel soil gives this wine its c...,kritt,90,30,alsace,alsace,domaine gresser 2013 kritt gewurztraminer (als...,gewürztraminer,domaine gresser


In [203]:
# Save the cleaned frame to CSV; index=False leaves the row index out of the file.
wine.to_csv('data/wine_cleaned_dataframe.csv', index=False)