In [1]:
# Importing dependencies
import pandas as pd
from sqlalchemy import create_engine
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
import numpy as np

In [2]:
# Creating dataframe from csv file
csv_file = "Resources/wine_data.csv"
wine_df=pd.read_csv(csv_file, encoding="utf-8")
wine_df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
#Count of non-null records within each column
wine_df.count()

Unnamed: 0               129971
country                  129908
description              129971
designation               92506
points                   129971
price                    120975
province                 129908
region_1                 108724
region_2                  50511
taster_name              103727
taster_twitter_handle     98758
title                    129971
variety                  129970
winery                   129971
dtype: int64

In [4]:
# Resetting index
wine_df.reset_index()
# Removed unnecessary columns (Description and Designation) and created a copy
new_wine_data = wine_df[["country", "points", "price", "province", "region_1", "region_2", "taster_name",\
                        "taster_twitter_handle", "title", "variety", "winery"]].copy()
new_wine_data.head()

Unnamed: 0,country,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
# Checking data types for each column
new_wine_data.dtypes

country                   object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

In [6]:
# Finding the number of unique values in each columns
new_wine_data.nunique()

country                      43
points                       21
price                       390
province                    425
region_1                   1229
region_2                     17
taster_name                  19
taster_twitter_handle        15
title                    118840
variety                     707
winery                    16757
dtype: int64

In [7]:
# Check for NaN in country column
new_wine_data[new_wine_data["country"].isna()==True]

Unnamed: 0,country,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
913,,87,30.0,,,,Mike DeSimone,@worldwineguys,Gotsa Family Wines 2014 Asureti Valley Chinuri,Chinuri,Gotsa Family Wines
3131,,83,,,,,Roger Voss,@vossroger,Barton & Guestier NV Partager Red,Red Blend,Barton & Guestier
4243,,88,18.0,,,,Mike DeSimone,@worldwineguys,Kakhetia Traditional Winemaking 2012 Red Natur...,Ojaleshi,Kakhetia Traditional Winemaking
9509,,92,28.0,,,,Susan Kostrzewa,@suskostrzewa,Tsililis 2015 Theopetra Malagouzia-Assyrtiko W...,White Blend,Tsililis
9750,,89,28.0,,,,Jeff Jenssen,@worldwineguys,Ross-idi 2015 Orange Nikolaevo Vineyard Chardo...,Chardonnay,Ross-idi
...,...,...,...,...,...,...,...,...,...,...,...
124176,,90,30.0,,,,Jeff Jenssen,@worldwineguys,Les Frères Dutruy 2014 Les Romaines Red,Red Blend,Les Frères Dutruy
129407,,89,22.0,,,,Michael Schachner,@wineschach,El Capricho 2015 Reserve Cabernet Sauvignon,Cabernet Sauvignon,El Capricho
129408,,89,22.0,,,,Michael Schachner,@wineschach,El Capricho 2015 Reserve Tempranillo,Tempranillo,El Capricho
129590,,90,30.0,,,,Mike DeSimone,@worldwineguys,Büyülübağ 2012 Shah Red,Red Blend,Büyülübağ


In [8]:
# Dropped null records in country column
new_wine_data["country"].dropna(inplace=True)
new_wine_data.head()

Unnamed: 0,country,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [9]:
#Checking if null values in country column has been dropped
new_wine_data["country"].isna().sum()

0

In [10]:
#Creating a year column from title column
new_wine_data["year"]=new_wine_data["title"].str.extract('(\d+)')
new_wine_data.head()

Unnamed: 0,country,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,year
0,Italy,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013
1,Portugal,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,US,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [11]:
#Creating a location column by combining country, province, region_1, region_2
new_wine_data["location"]=new_wine_data["region_1"].fillna('_').map(str)\
+ ", "+ new_wine_data["region_2"].fillna('_')\
+ ", "+ new_wine_data["province"].fillna('_')\
+ ", "+ new_wine_data["country"]

new_wine_data.head()

Unnamed: 0,country,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,year,location
0,Italy,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,"Etna, _, Sicily & Sardinia, Italy"
1,Portugal,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,"_, _, Douro, Portugal"
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,"Willamette Valley, Willamette Valley, Oregon, US"
3,US,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,"Lake Michigan Shore, _, Michigan, US"
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,"Willamette Valley, Willamette Valley, Oregon, US"


In [12]:
#URL to fetch secondary data- grape variety by color
url="https://en.wikipedia.org/wiki/List_of_grape_varieties#Red_table_grapes"

In [13]:
#Reading the data from html into a list of tables
tables=pd.read_html(url)

In [14]:
#Creating data frames for each grape color
#Dataframe for red grapes
grapes_red_df=tables[0]
#Dataframe for white grapes
grapes_white_df=tables[1]
#Dataframe for rose' grapes
grapes_rose_df=tables[2]

In [15]:
#Assigning the grape_color column for red grapes
grapes_red_df["grape_color"]="red"

#Assigning the grape_color column for white grapes
grapes_white_df["grape_color"]="white"

#Assigning the grape_color column for rose
grapes_rose_df["grape_color"]="rose"

In [16]:
#Combining the three different grape color dataframes into one 
grape_color_df=pd.concat([grapes_red_df,grapes_white_df,grapes_rose_df])
grape_color_df["grape_color"].value_counts()

red      426
white    394
rose       2
Name: grape_color, dtype: int64

In [17]:
# Merging wine dataframe with grape color dataframe
wine_review_df=new_wine_data.merge(grape_color_df, left_on="variety",right_on="Common Name(s)",how="left")

In [18]:
#Creating a wine review dataframe with the grape color column added for red color grapes
wine_review_df=wine_review_df[["country", "points", "price", "province", "region_1", "region_2","location", "taster_name",\
                        "taster_twitter_handle", "title","year","variety", "winery","grape_color"]]


In [19]:
#Checking for null values in the grape_color column
wine_review_df[wine_review_df["grape_color"].isna()==True]

Unnamed: 0,country,points,price,province,region_1,region_2,location,taster_name,taster_twitter_handle,title,year,variety,winery,grape_color
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129968,Germany,90,28.0,Mosel,,,"_, _, Mosel, Germany",Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,2013,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),
129969,US,90,75.0,Oregon,Oregon,Oregon Other,"Oregon, Oregon Other, Oregon, US",Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),2004,Pinot Noir,Citation,
129970,France,90,30.0,Alsace,Alsace,,"Alsace, _, Alsace, France",Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,2013,Gewürztraminer,Domaine Gresser,
129971,France,90,32.0,Alsace,Alsace,,"Alsace, _, Alsace, France",Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),2012,Pinot Gris,Domaine Marcel Deiss,


In [20]:
#Checking the count of values for each grape color
wine_review_df["grape_color"].value_counts()

red      22892
white    16053
Name: grape_color, dtype: int64

In [21]:
#Rename columns in the data set to match column names in the table
wine_review_df.columns=["country_name","wine_score","wine_price","province","region1","region2",\
                        "combined_region","taster_name","taster_twitter","wine_name","wine_year","grape_name",\
                        "winery_name", "grape_color"]

In [22]:
wine_review_df.head()

Unnamed: 0,country_name,wine_score,wine_price,province,region1,region2,combined_region,taster_name,taster_twitter,wine_name,wine_year,grape_name,winery_name,grape_color
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,


In [23]:
#Creating country_df that will be used to load country table into the database
country_df=pd.DataFrame(wine_review_df["country_name"].unique())

#Set index to start at 1
country_df.index = country_df.index + 1

In [24]:
#Resetting the index to create primary key for table
country_df=country_df.reset_index()

In [25]:
#Renaming columns to match column names in country table
country_df.columns=["country_id","country_name"]
country_df.head()

Unnamed: 0,country_id,country_name
0,1,Italy
1,2,Portugal
2,3,US
3,4,Spain
4,5,France


In [26]:
#Creating taster_df that will be used to load taster table into the database
taster_df=wine_review_df[["taster_name", "taster_twitter"]].drop_duplicates().reset_index().drop(columns=["index"])

#Set index to start at 1
taster_df.index = taster_df.index + 1

In [27]:
#Resetting the index to create primary key for table
taster_df=taster_df.reset_index()

In [28]:
#Renaming columns to match column names in taster table
taster_df.columns=["taster_id","taster_name","taster_twitter"]
taster_df.head()

Unnamed: 0,taster_id,taster_name,taster_twitter
0,1,Kerin O’Keefe,@kerinokeefe
1,2,Roger Voss,@vossroger
2,3,Paul Gregutt,@paulgwine
3,4,Alexander Peartree,
4,5,Michael Schachner,@wineschach


In [29]:
#Creating winery_df that will be used to load winery table into the database
winery_df=wine_review_df[["winery_name"]].drop_duplicates().reset_index().drop(columns=["index"])

#Set index to start at 1
winery_df.index = winery_df.index + 1

In [30]:
#Resetting the index to create primary key for table
winery_df=winery_df.reset_index()

In [31]:
#Renaming columns to match column names in winery table
winery_df.columns=["winery_id","winery_name"]
winery_df.head()

Unnamed: 0,winery_id,winery_name
0,1,Nicosia
1,2,Quinta dos Avidagos
2,3,Rainstorm
3,4,St. Julian
4,5,Sweet Cheeks


In [32]:
#Creating grape_variety_df that will be used to load grape_variety table into the database
grape_variety_df=wine_review_df[["grape_name","grape_color"]].drop_duplicates().reset_index().drop(columns=["index"])

#Set index to start at 1
grape_variety_df.index = grape_variety_df.index + 1

In [33]:
#Resetting the index to create primary key for table
grape_variety_df=grape_variety_df.reset_index()


In [34]:
#Renaming columns to match column names in winery table
grape_variety_df.columns=["grape_id","grape_name","grape_color"]
grape_variety_df.head()

Unnamed: 0,grape_id,grape_name,grape_color
0,1,White Blend,
1,2,Portuguese Red,
2,3,Pinot Gris,
3,4,Riesling,
4,5,Pinot Noir,


In [35]:
#Creating region_df that will be used to load region table into the database
region_df=wine_review_df[["country_name","province","region1","region2","combined_region"]].drop_duplicates()\
.reset_index().drop(columns=["index"])

#Set index to start at 1
region_df.index = region_df.index + 1

In [36]:
#Resetting the index to create primary key for table
region_df=region_df.reset_index()

In [37]:
#Renaming columns to match column names in region table
region_df.columns=["region_id","country_name","province","region1","region2","combined_region"]
region_df.head()

Unnamed: 0,region_id,country_name,province,region1,region2,combined_region
0,1,Italy,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy"
1,2,Portugal,Douro,,,"_, _, Douro, Portugal"
2,3,US,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US"
3,4,US,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US"
4,5,Spain,Northern Spain,Navarra,,"Navarra, _, Northern Spain, Spain"


In [38]:
#Creating a province dataframe of unique provinces and will be used to find latitude and longitudes for the provinces
province_df=pd.DataFrame(region_df["province"].unique())
province_df.columns=["province"]
province_df.head()

Unnamed: 0,province
0,Sicily & Sardinia
1,Douro
2,Oregon
3,Michigan
4,Northern Spain


In [None]:
# Calculating latitude and longtitude for provinces in province dataframe and assigning to latitude and longitude list

longitude = [] 
latitude = [] 
   
# function to find the coordinate 
# of a given city  
def findGeocode(province): 
       
    # try and catch is used to overcome 
    # the exception thrown by geolocator 
    # using geocodertimedout   
    #try: 
          
    # Specify the user_agent as your 
    # app name it should not be none 
    geolocator = Nominatim(user_agent="wine_review") 

    return geolocator.geocode(province,timeout=None) 
      
    #except GeocoderTimedOut: 
          
        #return findGeocode(province)     
  
    # each value from city column 
    # will be fetched and sent to 
    # function find_geocode    
for i in (province_df["province"]): 
      
    if findGeocode(i) != None: 
           
        loc = findGeocode(i) 
          
        # coordinates returned from  
        # function is stored into 
        # two separate list 
        latitude.append(loc.latitude) 
        longitude.append(loc.longitude) 
       
    # if coordinate for a city not 
    # found, insert "NaN" indicating  
    # missing value  
    else: 
        latitude.append(np.nan) 
        longitude.append(np.nan) 

In [None]:
#Assigning latitudes and longitudes created for province to respective columns in province dataframe
province_df["latitude"]=latitude
province_df["longitude"]=latitude

In [39]:
#Merging region and province dataframe to add latitude and longitude columns to region dataframe
region_df=region_df.merge(region_df.merge(province_df, how='inner', on='province', sort=False))
region_df.head()

Unnamed: 0,region_id,country_name,province,region1,region2,combined_region
0,1,Italy,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy"
1,2,Portugal,Douro,,,"_, _, Douro, Portugal"
2,3,US,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US"
3,4,US,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US"
4,5,Spain,Northern Spain,Navarra,,"Navarra, _, Northern Spain, Spain"


In [40]:
#Merging region and country dataframe to add country_id column to region dataframe and dropping country_name column
region_df=region_df.merge(region_df.merge(country_df, how='inner', on='country_name', sort=False))\
.drop(columns=["country_name"])
region_df.head()

Unnamed: 0,region_id,province,region1,region2,combined_region,country_id
0,1,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",1
1,2,Douro,,,"_, _, Douro, Portugal",2
2,3,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",3
3,4,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",3
4,5,Northern Spain,Navarra,,"Navarra, _, Northern Spain, Spain",4


In [41]:
#Creating wine_review_data_df dataframe from wine_review_df dataframe and adding references to other dataframe

#Adding grape_id column in wine_review_data_df which is referenced to grape dataframe
wine_review_data_df=wine_review_df.merge(wine_review_df.merge(grape_variety_df,on=["grape_name","grape_color"],sort=False))
wine_review_data_df.head()

Unnamed: 0,country_name,wine_score,wine_price,province,region1,region2,combined_region,taster_name,taster_twitter,wine_name,wine_year,grape_name,winery_name,grape_color,grape_id
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,,1
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,,2
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,,3
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,,4
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,,5


In [42]:
#Adding region_id column in wine_review_data_df which is referenced to region dataframe
wine_review_data_df=wine_review_data_df.merge(wine_review_data_df.merge(region_df[["region_id","combined_region"]]\
                                                              ,on="combined_region",sort=False))
wine_review_data_df.head()

Unnamed: 0,country_name,wine_score,wine_price,province,region1,region2,combined_region,taster_name,taster_twitter,wine_name,wine_year,grape_name,winery_name,grape_color,grape_id,region_id
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,,1,1
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,,2,2
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,,3,3
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,,4,4
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,,5,3


In [43]:
#Adding taster_id column in wine_review_data_df which is referenced to taster dataframe
wine_review_data_df=wine_review_data_df.merge(wine_review_data_df.merge(taster_df,\
                                                                        on=["taster_name","taster_twitter"],sort=False))
wine_review_data_df.head()

Unnamed: 0,country_name,wine_score,wine_price,province,region1,region2,combined_region,taster_name,taster_twitter,wine_name,wine_year,grape_name,winery_name,grape_color,grape_id,region_id,taster_id
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,,1,1,1
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,,2,2,2
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,,3,3,3
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,,4,4,4
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,,5,3,3


In [None]:
# #Adding winery_id column in wine_review_data_df which is referenced to winery dataframe
# wine_review_data_df=wine_review_data_df.merge(wine_review_data_df.merge(winery_df,on="winery_name",sort=False))
# wine_review_data_df.head()

In [55]:
#Adding winery_id column in wine_review_data_df which is referenced to winery dataframe
final_wine_data=pd.merge(wine_review_data_df, winery_df, on="winery_name", how="left", sort=False)
final_wine_data.head()

Unnamed: 0,country_name,wine_score,wine_price,province,region1,region2,combined_region,taster_name,taster_twitter,wine_name,wine_year,grape_name,winery_name,grape_color,grape_id,region_id,taster_id,winery_id
0,Italy,87,,Sicily & Sardinia,Etna,,"Etna, _, Sicily & Sardinia, Italy",Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),2013,White Blend,Nicosia,,1,1,1,1
1,Portugal,87,15.0,Douro,,,"_, _, Douro, Portugal",Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),2011,Portuguese Red,Quinta dos Avidagos,,2,2,2,2
2,US,87,14.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,Pinot Gris,Rainstorm,,3,3,3,3
3,US,87,13.0,Michigan,Lake Michigan Shore,,"Lake Michigan Shore, _, Michigan, US",Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,Riesling,St. Julian,,4,4,4,4
4,US,87,65.0,Oregon,Willamette Valley,Willamette Valley,"Willamette Valley, Willamette Valley, Oregon, US",Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,Pinot Noir,Sweet Cheeks,,5,3,3,5
