## Import libraries and load the DataFrame

In [27]:
import pandas as pd

In [28]:
# Load the scraped style table from a CSV file in the working directory.
df = pd.read_csv('scraping_style.csv')
# Trailing expression: the cell output is the (rows, columns) tuple.
df.shape

(35910, 11)

In [29]:
# Preview the raw frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,wine_id,varietal_name,style_id,body,acidity_1,acidity_description,acidity_2,fizziness,intensity,sweetness,tannin
0,1420799,Tinto de Ribera del Duero,180,5,3,Alta,3.700298e+00,,3.837711e+00,1.782577e+00,3.415475e+00
1,1199830,Tinto de Ribera del Duero,180,5,3,Alta,3.700298e+00,,3.837711e+00,1.782577e+00,3.415475e+00
2,1494622,Tinto de Ribera del Duero,180,5,3,Alta,3.700298e+00,,3.837711e+00,1.782577e+00,3.415475e+00
3,1229417,Tinto de Ribera del Duero,180,5,3,Alta,3.700298e+00,,3.837711e+00,1.782577e+00,3.415475e+00
4,87412043,Tinto de Ribera del Duero,180,5,3,Alta,3.673210e+00,,3.852942e+00,1.838485e+00,3.483987e+00
...,...,...,...,...,...,...,...,...,...,...,...
35905,164784307,164784307,164784307,164784307,164784307,164784307,1.647843e+08,164784307.0,1.647843e+08,1.647843e+08,1.647843e+08
35906,14979215,14979215,14979215,14979215,14979215,14979215,1.497922e+07,14979215.0,1.497922e+07,1.497922e+07,1.497922e+07
35907,169911041,169911041,169911041,169911041,169911041,169911041,1.699110e+08,169911041.0,1.699110e+08,1.699110e+08,1.699110e+08
35908,14979215,14979215,14979215,14979215,14979215,14979215,1.497922e+07,14979215.0,1.497922e+07,1.497922e+07,1.497922e+07


## Drop rows that have no information (they came back empty in the scraping phase)

In [30]:
# Index labels of the placeholder rows: there `body` carries the same value
# as `wine_id` instead of a real measurement.
rows_drop = df.loc[df['wine_id'].eq(df['body'])].index

In [31]:
# Remove the placeholder rows, print the row count we expect to remain, and
# show the resulting shape so the two numbers can be compared.
cleaned = df.drop(index=rows_drop)
print(len(df.index) - len(rows_drop))
cleaned.shape

31091


(31091, 11)

## Check NaNs

In [32]:
# Number of missing values in each column.
cleaned.isnull().sum()

wine_id                    0
varietal_name              0
style_id                   0
body                       0
acidity_1                  0
acidity_description        0
acidity_2                  0
fizziness              30965
intensity                  0
sweetness                  1
tannin                 10993
dtype: int64

In [33]:
# Look for rows that already use 0 as a fizziness value.
cleaned.loc[cleaned['fizziness'].eq(0)]

Unnamed: 0,wine_id,varietal_name,style_id,body,acidity_1,acidity_description,acidity_2,fizziness,intensity,sweetness,tannin


### Fill NaNs with 0: no row uses 0 as a fizziness value, so NaN is assumed to stand for zero

In [34]:
# Replace every NaN in the frame with 0; `cleaned` itself is left as it was.
update = cleaned.fillna(value=0)

## Round values 

In [35]:
def rounded(df, cols, decimals=2):
    """Return a copy of ``df`` with the given columns rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to round. It is not modified.
    cols : iterable of str
        Labels of the columns to round. A label that is not a column of
        ``df`` raises ``KeyError``.
    decimals : int, default 2
        Number of decimal places to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame in which only the columns listed in ``cols`` differ
        from ``df``.
    """
    # Work on a copy so the caller's frame (and any cell that already
    # displayed it) is not silently changed by this call.
    result = df.copy()
    for col in cols:
        result[col] = result[col].round(decimals)
    return result

In [36]:
# Float-valued columns to round before display and export.
cols = [
    'acidity_2',
    'fizziness',
    'intensity',
    'sweetness',
    'tannin',
]
update = rounded(update, cols)
update

Unnamed: 0,wine_id,varietal_name,style_id,body,acidity_1,acidity_description,acidity_2,fizziness,intensity,sweetness,tannin
0,1420799,Tinto de Ribera del Duero,180,5,3,Alta,3.70,0.0,3.84,1.78,3.42
1,1199830,Tinto de Ribera del Duero,180,5,3,Alta,3.70,0.0,3.84,1.78,3.42
2,1494622,Tinto de Ribera del Duero,180,5,3,Alta,3.70,0.0,3.84,1.78,3.42
3,1229417,Tinto de Ribera del Duero,180,5,3,Alta,3.70,0.0,3.84,1.78,3.42
4,87412043,Tinto de Ribera del Duero,180,5,3,Alta,3.67,0.0,3.85,1.84,3.48
...,...,...,...,...,...,...,...,...,...,...,...
35867,61243167,Riesling,186,2,3,Alta,4.19,0.0,2.64,1.89,0.00
35868,160177301,Grauburgunder,148,3,3,Alta,3.02,0.0,3.03,1.83,0.00
35869,61745791,Riesling,186,2,3,Alta,4.63,0.0,2.90,1.58,0.00
35870,169349840,Pinot Blanc,141,3,3,Alta,4.07,0.0,3.00,1.00,0.00


## Check and drop duplicates

In [37]:
# Count rows that repeat an earlier row across every column.
update.duplicated(keep='first').sum()

12793

In [38]:
# Keep the first occurrence of each fully identical row, then show the new shape.
update = update.drop_duplicates(keep='first')
update.shape

(18298, 11)

In [39]:
# Count wine_id values that repeat an earlier one; 0 means every wine_id is unique.
update['wine_id'].duplicated().sum()

0

## Check rows with missing information (I'll deal with them when importing from MySQL)

In [44]:
# Rows where `body` holds the same value as `style_id`.
update.loc[update['style_id'].eq(update['body'])]

Unnamed: 0,wine_id,varietal_name,style_id,body,acidity_1,acidity_description,acidity_2,fizziness,intensity,sweetness,tannin
21200,166990245,Pedro Ximenez,136,136,136,136,136.0,136.0,136.0,136.0,136.0
21809,156881629,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
21841,160211882,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
22008,169027400,Pedro Ximenez,136,136,136,136,136.0,136.0,136.0,136.0,136.0
22980,166331167,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
28331,3007729,Blanco,131,131,131,131,131.0,131.0,131.0,131.0,131.0
33549,164943039,Chardonnay,55,55,55,55,55.0,55.0,55.0,55.0,55.0


In [45]:
# wine_id values of the rows where `body` equals `style_id`, as an array.
values = update.loc[update['style_id'] == update['body'], 'wine_id'].values

In [46]:
# Every row whose wine_id is one of the collected values.
update.loc[update['wine_id'].isin(values)]

Unnamed: 0,wine_id,varietal_name,style_id,body,acidity_1,acidity_description,acidity_2,fizziness,intensity,sweetness,tannin
21200,166990245,Pedro Ximenez,136,136,136,136,136.0,136.0,136.0,136.0,136.0
21809,156881629,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
21841,160211882,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
22008,169027400,Pedro Ximenez,136,136,136,136,136.0,136.0,136.0,136.0,136.0
22980,166331167,Vino espumoso,216,216,216,216,216.0,216.0,216.0,216.0,216.0
28331,3007729,Blanco,131,131,131,131,131.0,131.0,131.0,131.0,131.0
33549,164943039,Chardonnay,55,55,55,55,55.0,55.0,55.0,55.0,55.0


## Save to CSV

In [24]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
update.to_csv('update_style.csv',index=False)

## Save to MySQL

In [14]:
import pymysql  # driver named by the 'mysql+pymysql://' scheme of the connection URL
from sqlalchemy import create_engine
import getpass  # To get the password without showing the input
# Prompt for the MySQL password at run time so it is not written in the notebook source.
password = getpass.getpass()

········


In [15]:
# Percent-encode the password before embedding it in the URL: characters such
# as '@', ':', '/' or '%' would otherwise be read as URL delimiters or escapes
# and the connection would fail or target the wrong host.
from urllib.parse import quote

connection_string = 'mysql+pymysql://root:' + quote(password, safe='') + '@localhost/clean_winery'
engine = create_engine(connection_string)

In [25]:
# Write the table to MySQL as `style`, replacing any existing table of that
# name and leaving the DataFrame index out.
update.to_sql(
    name='style',
    con=engine,
    if_exists='replace',
    index=False,
)

18298