## Import libraries and load the dataframe

In [1]:
import pandas as pd

In [2]:
# Load the scraped purchase listings; the trailing expression reports
# (rows, columns) as a quick sanity check on what was read.
purchase_csv = 'scraping_purchase.csv'
df = pd.read_csv(purchase_csv)
df.shape

(35935, 7)

In [3]:
# Preview the raw frame. Note the leftover 'Unnamed: 0' column next to the
# real fields; it is removed in the next section.
df

Unnamed: 0.1,Unnamed: 0,wine_id,price,currency,bottle_type_id,bottle_type,vol_ml
0,0,1420799,487.000000,EUR,1,botella,750
1,1,1199830,692.960000,EUR,1,botella,750
2,2,1494622,381.150000,EUR,1,botella,750
3,3,1229417,778.060000,EUR,1,botella,750
4,4,87412043,514.850000,EUR,1,botella,750
...,...,...,...,...,...,...,...
35930,14,164784307,20.390723,EUR,1,botella,750
35931,0,14979215,21.000000,EUR,1,botella,750
35932,1,169911041,14.010000,EUR,1,botella,750
35933,0,14979215,21.000000,EUR,1,botella,750


## Drop unnamed column

In [4]:
def drop(df, columns=('Unnamed: 0',)):
    """Return a new frame equal to ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip columns from. It is not modified.
    columns : str or iterable of str, default ('Unnamed: 0',)
        Column label(s) to remove. The default is the 'Unnamed: 0' column
        that this notebook removes right after loading the CSV.

    Returns
    -------
    pandas.DataFrame
        New frame with the requested columns removed.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]
    return df.drop(columns=list(columns))

In [5]:
# Strip the leftover 'Unnamed: 0' column with the drop() helper defined above,
# then display the resulting frame.
purchase = df.pipe(drop)
purchase

Unnamed: 0,wine_id,price,currency,bottle_type_id,bottle_type,vol_ml
0,1420799,487.000000,EUR,1,botella,750
1,1199830,692.960000,EUR,1,botella,750
2,1494622,381.150000,EUR,1,botella,750
3,1229417,778.060000,EUR,1,botella,750
4,87412043,514.850000,EUR,1,botella,750
...,...,...,...,...,...,...
35930,164784307,20.390723,EUR,1,botella,750
35931,14979215,21.000000,EUR,1,botella,750
35932,169911041,14.010000,EUR,1,botella,750
35933,14979215,21.000000,EUR,1,botella,750


## Check NaN values

In [6]:
# Count missing values per column; a non-zero count would mean rows need
# imputing or dropping before further cleaning.
purchase.isna().sum()

wine_id           0
price             0
currency          0
bottle_type_id    0
bottle_type       0
vol_ml            0
dtype: int64

## Check duplicates

In [7]:
# Number of rows whose wine_id has already appeared in an earlier row
# (the first occurrence of each id is not counted).
purchase['wine_id'].duplicated().sum()

15487

## Drop duplicates

In [8]:
# Keep only the first row seen for each wine_id and report the new shape.
# NOTE(review): only wine_id is compared, so later rows for the same wine are
# discarded even if their price or bottle size differs. Confirm this is intended.
cleaned = purchase.drop_duplicates(subset='wine_id', keep='first')
cleaned.shape

(20448, 6)

## Handle the currency column: drop it and rename price to price_eur

In [9]:
# List the distinct currencies and how often each occurs, to decide whether
# the column carries any information worth keeping.
cleaned['currency'].value_counts()

EUR    20448
Name: currency, dtype: int64

In [10]:
# Every remaining row is priced in EUR (see the value_counts output above), so
# the currency column is dropped and the unit is folded into the price column
# name instead.
# Renaming by key, rather than assigning a full positional list to `.columns`,
# cannot mislabel columns if their order or number ever changes upstream.
cleaned = (
    cleaned
    .drop(columns=['currency'])
    .rename(columns={'price': 'price_eur'})
)
cleaned

Unnamed: 0,wine_id,price_eur,bottle_type_id,bottle_type,vol_ml
0,1420799,487.000000,1,botella,750
1,1199830,692.960000,1,botella,750
2,1494622,381.150000,1,botella,750
3,1229417,778.060000,1,botella,750
4,87412043,514.850000,1,botella,750
...,...,...,...,...,...
35913,165754960,20.620000,1,botella,750
35914,160624545,14.730923,1,botella,750
35915,164784307,20.390723,1,botella,750
35931,14979215,21.000000,1,botella,750


## Check columns with same information

In [11]:
# Frequency of each bottle_type label, for comparison with the vol_ml and
# bottle_type_id distributions in the following cells.
cleaned['bottle_type'].value_counts()

botella             19996
mágnum                381
botella de ½           45
botella de litro       15
½ litro                11
Name: bottle_type, dtype: int64

In [12]:
# Frequency of each bottle volume in millilitres, for comparison with the
# bottle_type and bottle_type_id distributions.
cleaned['vol_ml'].value_counts()

750     19996
1500      381
375        45
1000       15
500        11
Name: vol_ml, dtype: int64

In [13]:
# Frequency of each bottle_type_id, for comparison with the bottle_type and
# vol_ml distributions.
cleaned['bottle_type_id'].value_counts()

1     19996
4       381
3        45
2        15
12       11
Name: bottle_type_id, dtype: int64

## Dropping columns with duplicated information

In [14]:
# bottle_type, bottle_type_id and vol_ml show the same frequency distribution
# in the value_counts outputs above, so only the numeric vol_ml is kept.
# NOTE(review): matching counts suggest, but do not strictly prove, a row-wise
# one-to-one mapping between the three columns. Confirm if it matters.
redundant_columns = ['bottle_type', 'bottle_type_id']
cleaned = cleaned.drop(columns=redundant_columns)
cleaned

Unnamed: 0,wine_id,price_eur,vol_ml
0,1420799,487.000000,750
1,1199830,692.960000,750
2,1494622,381.150000,750
3,1229417,778.060000,750
4,87412043,514.850000,750
...,...,...,...
35913,165754960,20.620000,750
35914,160624545,14.730923,750
35915,164784307,20.390723,750
35931,14979215,21.000000,750


## Round price

In [16]:
# Round prices to two decimals.
# assign() builds a new frame instead of writing into the existing one, so the
# object shown by earlier cells is not mutated and the cell can be re-run safely.
# Series.round is the vectorised pandas method behind builtin round() on a Series.
cleaned = cleaned.assign(price_eur=cleaned['price_eur'].round(2))
cleaned

Unnamed: 0,wine_id,price_eur,vol_ml
0,1420799,487.00,750
1,1199830,692.96,750
2,1494622,381.15,750
3,1229417,778.06,750
4,87412043,514.85,750
...,...,...,...
35913,165754960,20.62,750
35914,160624545,14.73,750
35915,164784307,20.39,750
35931,14979215,21.00,750
