# Task 3 - Importing different datasets and visualization using plotly

## 1. Import Dataset and investigate contents

In [1]:
import pandas as pd

# Load the wine-review CSV archive; the file's first column is used as the index
csv_path = 'data/winemag-data-130k-v2.csv.zip'
reviews = pd.read_csv(csv_path, index_col=0)

# Preview the first rows of the frame
reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [2]:
from plotly.offline import init_notebook_mode, iplot
# Switch plotly's offline notebook mode on for this session
init_notebook_mode(connected=True)

Out of the columns above, there are a few that can be used for meaningful analysis and some are mostly irrelevant for plotting. So let's go ahead and extract country, variety, price, and points for our visualizations.

In [3]:
# Keep only the four columns that the visualizations below rely on
plot_columns = ['country', 'variety', 'points', 'price']
reviews_split = reviews.loc[:, plot_columns]

## 2. Scatter Plot

Let's start by getting a scatter plot of country VS the mean number of points it has scored to see which countries have the highest score.

In [4]:
import numpy as np

# Pivot the frame to get the mean of points per country.
# aggfunc is passed as the string 'mean' (the same spelling the surface-plot
# pivot in this notebook uses) instead of np.mean: recent pandas versions emit
# a FutureWarning when the NumPy callable is passed here.
df_scatter1 = pd.pivot_table(reviews_split, values='points',
                             index='country',
                             aggfunc='mean').reset_index()

# Order the countries by mean points (ascending); the sorted frame is assigned
# back rather than mutated with inplace=True.
df_scatter1 = df_scatter1.sort_values(by='points')

In [5]:
import plotly
import plotly.graph_objs as go

# Both figures share the same series: country on x, mean points on y
countries = df_scatter1['country']
mean_points = df_scatter1['points']

# Markers-only version, written to its own HTML file
data = [go.Scatter(x=countries, y=mean_points, mode='markers')]
plotly.offline.plot(data, filename='scatter_country_VS_mean_points.html')

# Same series without an explicit mode (intended as the connected-line version)
data = [go.Scatter(x=countries, y=mean_points)]

# Write the second figure to an HTML file
plotly.offline.plot(data, filename='line_country_VS_mean_points.html')

'line_country_VS_mean_points.html'

Another meaningful graph is the average price versus points, which may indicate whether wines with a higher score also have a higher price.

In [None]:

# Pivot the dataframe so that each score (total points) has a mean price.
# aggfunc is passed as the string 'mean' instead of np.mean: recent pandas
# versions emit a FutureWarning when the NumPy callable is passed here.
df_scatter2 = pd.pivot_table(reviews_split, values='price',
                             index='points',
                             aggfunc='mean').reset_index()

# Plot mean price against points. No `mode` is passed, so plotly chooses its
# default trace mode; this is the line version named in the output file, not a
# markers-only scatter.
data = [go.Scatter(
          x=df_scatter2['points'],
          y=df_scatter2['price'])]

# Save the plot to an HTML file
plotly.offline.plot(data, filename='line_price_VS_points.html')

## 3. Choropleth Mean Points VS Country

A choropleth graph requires geographical data. From the data provided, a choropleth can be used to plot points or price VS country. To plot the mean points for each country, we can reuse the frame we made for the first scatter plot.

In [None]:
# Choropleth of mean points per country, reusing the frame built for the
# first scatter plot
points_map = go.Choropleth(
    locations=df_scatter1['country'],
    locationmode='country names',
    z=df_scatter1['points'],
    text=df_scatter1['country'],
    colorscale='Earth',
    # Nested form of the marker_line_* and colorbar_* keyword arguments
    marker=dict(line=dict(color='darkgray', width=0.5)),
    colorbar=dict(tickprefix='#', title='Mean<br>Points'),
)
data = [points_map]

# Write the figure to an HTML file
plotly.offline.plot(data, filename='choropleth_country_VS_mean_points.html')

A dataframe has to be made for the mean price per country before we can plot.

In [None]:
# Pivot the dataframe to get the mean price per country.
# aggfunc is passed as the string 'mean' instead of np.mean: recent pandas
# versions emit a FutureWarning when the NumPy callable is passed here.
df_choropleth = pd.pivot_table(reviews_split, values='price',
                               index='country',
                               aggfunc='mean').reset_index()

# Order the countries by mean price (ascending); the sorted frame is assigned
# back rather than mutated with inplace=True.
df_choropleth = df_choropleth.sort_values(by='price')

In [None]:
# Choropleth of mean price per country
# NOTE(review): the '#' tick prefix is the same one used on the points map —
# confirm whether a currency symbol was intended for prices.
price_map = go.Choropleth(
    locations=df_choropleth['country'],
    locationmode='country names',
    z=df_choropleth['price'],
    text=df_choropleth['country'],
    colorscale='Greens',
    autocolorscale=False,
    # Nested form of the marker_line_* and colorbar_* keyword arguments
    marker=dict(line=dict(color='darkgray', width=0.5)),
    colorbar=dict(tickprefix='#', title='Mean<br>Price'),
)
data = [price_map]

# Write the figure to an HTML file
plotly.offline.plot(data, filename='choropleth_country_VS_mean_price.html')

## 4. Surface Plot

Further investigation of the data is required to extract data for the surface plot.

In [66]:
# Five most frequent countries in the data
reviews['country'].value_counts().head(n=5)

US          54504
France      22093
Italy       19540
Spain        6645
Portugal     5691
Name: country, dtype: int64

In [67]:
# Five most frequent varieties in the data
reviews['variety'].value_counts().head(n=5)

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Name: variety, dtype: int64

Now we can extract more meaningful data by splitting the data and choosing only the top 5 countries ('US', 'France', 'Italy', 'Spain', 'Portugal') and the top 5 varieties ('Pinot Noir', 'Chardonnay', 'Cabernet Sauvignon', 'Red Blend', 'Bordeaux-style Red Blend').

In [81]:
# Restrict the data to the top 5 varieties and the top 5 countries found above
top_varieties = ['Pinot Noir', 'Chardonnay', 'Cabernet Sauvignon',
                 'Red Blend', 'Bordeaux-style Red Blend']
top_countries = ['US', 'France', 'Italy', 'Spain', 'Portugal']

# isin() states "value is one of these" directly, replacing the long chain of
# == / | comparisons; rows with a missing variety or country do not match.
keep_rows = (reviews_split['variety'].isin(top_varieties)
             & reviews_split['country'].isin(top_countries))

# .copy() keeps reviews_decreased independent of reviews_split
reviews_decreased = reviews_split[keep_rows].copy()

# Count the missing values remaining in each column
reviews_decreased.isnull().sum()

country       0
variety       0
points        0
price      3095
dtype: int64

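Note here that there are a lot of null values in the price column (3,095 rows). We could drop those rows; the `dropna` call in the next cell is currently commented out, so they stay in the frame and the mean prices below are computed from the prices that are present.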

In [82]:
# Start the surface-plot data from a copy of the filtered reviews.
reviews_surface = reviews_decreased.copy()
# NOTE(review): the drop of rows with missing values is currently disabled (the
# line below is commented out), so those rows are still in reviews_surface at
# this point — confirm whether that is intended.
# reviews_surface.dropna(axis = 0, inplace = True)

In [83]:
# Mean price for every (points, country) pair: one row per score, one column
# per country.
# The pivot reads from reviews_decreased instead of from reviews_surface
# itself: overwriting reviews_surface with a pivot of reviews_surface made this
# cell fail when run a second time, because the pivoted frame no longer has
# the 'price' and 'country' columns.
reviews_surface = pd.pivot_table(reviews_decreased, values='price',
                                 index=['points'],
                                 columns='country',
                                 aggfunc='mean')

# Display the pivoted table
reviews_surface

country,France,Italy,Portugal,Spain,US
points,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
80,16.8,,,14.666667,21.322581
81,13.473684,9.0,,24.611111,20.368421
82,16.076923,15.0,,22.185185,22.751579
83,18.037037,19.481481,,15.897436,22.877442
84,19.708122,17.181818,18.0,21.571429,25.390575
85,19.680672,21.427632,13.4,20.887324,26.998226
86,22.052542,23.169753,16.5,21.163265,30.539066
87,24.567059,25.854135,21.375,24.317308,32.290558
88,33.891529,30.275693,17.125,27.863309,36.629179
89,37.762737,35.816495,25.75,32.478723,38.114862


In [87]:
# Take the 84-94 points slice once and reuse it for the z values and the y axis
surface_rows = reviews_surface.loc[84:94]

# Surface of mean price (z) over country (x) and points (y)
data = [go.Surface(z=surface_rows.values,
                   x=reviews_surface.columns.tolist(),
                   y=surface_rows.index.tolist(),
                   colorscale='Viridis')]

# Write the figure to an HTML file
plotly.offline.plot(data, filename='surface_country_price_points.html')

'surface_country_price_points.html'

In [86]:
# Show the rows of the pivot that feed the surface plot (points 84 through 94)
reviews_surface.loc[84:94, :]

country,France,Italy,Portugal,Spain,US
points,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
84,19.708122,17.181818,18.0,21.571429,25.390575
85,19.680672,21.427632,13.4,20.887324,26.998226
86,22.052542,23.169753,16.5,21.163265,30.539066
87,24.567059,25.854135,21.375,24.317308,32.290558
88,33.891529,30.275693,17.125,27.863309,36.629179
89,37.762737,35.816495,25.75,32.478723,38.114862
90,40.818182,43.220648,32.444444,35.474227,44.840708
91,52.196018,50.41115,39.333333,48.202703,49.73101
92,69.499037,60.415771,19.0,59.77027,55.900281
93,95.164733,70.284024,30.333333,79.4,61.745463


## 5. Heatmap

In [9]:
# Pivot to the mean points for every (variety, country) pair; no aggfunc is
# given, so pivot_table's default aggregation (the mean) applies
reviews_heatmap = reviews_decreased.pivot_table(values='points',
                                                index=['variety'],
                                                columns='country')

In [15]:
# Heatmap of mean points with country on the x axis and variety on the y axis
heat_x = reviews_heatmap.columns.tolist()
heat_y = reviews_heatmap.index.tolist()
data = [go.Heatmap(z=reviews_heatmap.values,
                   x=heat_x,
                   y=heat_y,
                   colorscale='Viridis')]

# Write the figure to an HTML file
plotly.offline.plot(data, filename='heatmap_country_variety_points.html')

'heatmap_country_variety_points.html'