## Import libraries and dataframe

In [1]:
import pandas as pd

In [2]:
# Read the scraped purchase listings, then report the frame's (rows, columns).
raw_csv_path = 'scraping_purchase.csv'
purchase = pd.read_csv(raw_csv_path)
purchase.shape

(35777, 6)

In [3]:
# Preview the raw frame; the rendered output shows only its first and last rows.
purchase

Unnamed: 0,wine_id,price,currency,bottle_type_id,bottle_type,vol_ml
0,1420799,487.000000,EUR,1,botella,750
1,1199830,692.960000,EUR,1,botella,750
2,1494622,381.150000,EUR,1,botella,750
3,1229417,778.060000,EUR,1,botella,750
4,87412043,514.850000,EUR,1,botella,750
...,...,...,...,...,...,...
35772,164784307,20.390723,EUR,1,botella,750
35773,14979215,21.000000,EUR,1,botella,750
35774,169911041,14.010000,EUR,1,botella,750
35775,14979215,21.000000,EUR,1,botella,750


In [8]:
def cleaning_wine(df, id_col='wine_id'):
    """Drop exact duplicate rows and rows with missing values, reporting each step.

    Progress is shown along the way: the shape on entry, the shape after
    de-duplication, any remaining rows whose ``id_col`` value repeats an
    earlier row, the per-column count of missing values, and the final shape.
    The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Must contain the column named by ``id_col``.
    id_col : str, default 'wine_id'
        Column checked for repeated identifiers once exact duplicate rows
        have been removed. The check is informational only: such rows are
        displayed but kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without fully duplicated rows and without rows that
        contain any missing value.

    Notes
    -----
    Uses the ``display`` function that IPython/Jupyter makes available in
    notebook cells.
    """
    print(df.shape)
    deduped = df.drop_duplicates()
    print(deduped.shape)
    # duplicated() already yields a boolean mask, so it can index directly.
    display(deduped[deduped[id_col].duplicated()])
    display(deduped.isna().sum())
    result = deduped.dropna()
    print(result.shape)
    return result

In [9]:
# Run the cleaning pass on the raw purchases; its step-by-step report is shown below.
update_purchase = cleaning_wine(df=purchase)

(35777, 6)
(20365, 6)


Unnamed: 0,wine_id,price,currency,bottle_type_id,bottle_type,vol_ml


wine_id           0
price             0
currency          0
bottle_type_id    0
bottle_type       0
vol_ml            0
dtype: int64

(20365, 6)


## Check duplicates

In [11]:
# Count rows whose wine_id already appeared in an earlier row (0 means every id is unique).
update_purchase['wine_id'].duplicated().sum()

0

## Handle the currency column: drop it and rename price to price_eur

In [12]:
# Tally the distinct currency codes to see whether the column carries any information.
currency_counts = update_purchase['currency'].value_counts()
currency_counts

EUR    20365
Name: currency, dtype: int64

In [13]:
# Drop the currency column and carry the unit in the price column's name instead.
# Renaming by label (rather than assigning a full positional list to .columns)
# cannot mislabel columns if their order or number ever differs.
cleaned = (
    update_purchase
    .drop(columns=['currency'])
    .rename(columns={'price': 'price_eur'})
)
cleaned

Unnamed: 0,wine_id,price_eur,bottle_type_id,bottle_type,vol_ml
0,1420799,487.000000,1,botella,750
1,1199830,692.960000,1,botella,750
2,1494622,381.150000,1,botella,750
3,1229417,778.060000,1,botella,750
4,87412043,514.850000,1,botella,750
...,...,...,...,...,...
35755,168653611,21.070000,1,botella,750
35756,160624545,14.730923,1,botella,750
35757,164784307,20.390723,1,botella,750
35773,14979215,21.000000,1,botella,750


## Check columns with the same information

In [14]:
# Number of listings under each bottle_type label.
bottle_type_counts = cleaned['bottle_type'].value_counts()
bottle_type_counts

botella             19905
mágnum                388
botella de ½           45
botella de litro       16
½ litro                11
Name: bottle_type, dtype: int64

In [15]:
# Number of listings for each vol_ml value.
vol_ml_counts = cleaned['vol_ml'].value_counts()
vol_ml_counts

750     19905
1500      388
375        45
1000       16
500        11
Name: vol_ml, dtype: int64

In [16]:
# Number of listings for each bottle_type_id value.
bottle_type_id_counts = cleaned['bottle_type_id'].value_counts()
bottle_type_id_counts

1     19905
4       388
3        45
2        16
12       11
Name: bottle_type_id, dtype: int64

## Dropping columns with duplicated information

In [17]:
# bottle_type and bottle_type_id appear to encode the same information as vol_ml
# (their value counts match), so only vol_ml is kept.
redundant_cols = ['bottle_type', 'bottle_type_id']
cleaned = cleaned.drop(columns=redundant_cols)
cleaned

Unnamed: 0,wine_id,price_eur,vol_ml
0,1420799,487.000000,750
1,1199830,692.960000,750
2,1494622,381.150000,750
3,1229417,778.060000,750
4,87412043,514.850000,750
...,...,...,...
35755,168653611,21.070000,750
35756,160624545,14.730923,750
35757,164784307,20.390723,750
35773,14979215,21.000000,750


## Round price

In [18]:
# Keep prices at two decimal places.
cleaned['price_eur'] = cleaned['price_eur'].round(2)
cleaned

Unnamed: 0,wine_id,price_eur,vol_ml
0,1420799,487.00,750
1,1199830,692.96,750
2,1494622,381.15,750
3,1229417,778.06,750
4,87412043,514.85,750
...,...,...,...
35755,168653611,21.07,750
35756,160624545,14.73,750
35757,164784307,20.39,750
35773,14979215,21.00,750


## Save to CSV

In [19]:
# Write the cleaned table to disk, leaving the index out of the file.
cleaned.to_csv(path_or_buf='update_purchase.csv', index=False)

## Save to MySQL

In [20]:
import pymysql  # not referenced by name in the visible cells; the 'mysql+pymysql' URL built below names this driver
from sqlalchemy import create_engine
import getpass  # To get the password without showing the input
# Prompt for the database password interactively rather than hard-coding it in the notebook.
password = getpass.getpass()

········


In [21]:
# Percent-encode the password before embedding it in the URL: a raw password
# containing characters such as '@', ':', '/' or '%' would otherwise be
# misread as part of the URL structure and the connection would fail.
from urllib.parse import quote_plus

connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/clean_winery'
engine = create_engine(connection_string)

In [22]:
# Load the cleaned table into the database as `purchase`, replacing any existing
# table of that name; the index is not written as a column.
cleaned.to_sql(name='purchase', con=engine, if_exists='replace', index=False)

20365