# Preanálisis de los datos y pruebas para la limpieza de datos

In [1]:
# Importaciones
import pandas as pd
import numpy as np
import ast


#Seteo de opciones
pd.set_option("display.max_columns", None)

In [2]:
# Leemos el archivo
df_raw = pd.read_parquet("../data/data_raw/iowa_liquor_sales_2019.parquet")
df_raw.head()

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Gallons)
102,S08792800017,11/06/2019,2459,Reinhart Foods,200 STATE PO BOX 98\nGUTHRIE CENTER 50115\n,39.0,Guthrie,1062300.0,FLAVORED RUM,65.0,Jim Beam Brands,44499,Cruzan Mango Rum,12,750,$6.82,$10.24,3,$30.72,0.59
103,S04854100048,04/03/2019,2487,Anamosa Family Foods,"402 EAST MAIN\nANAMOSA 52205\n(42.108289, -91....",53.0,Jones,1081300.0,PEPPERMINT SCHNAPPS,434.0,Luxco-St Louis,80578,Arrow Peppermint Schnapps,6,1750,$7.04,$10.56,3,$31.68,1.39
106,S08900500035,11/13/2019,4509,A J'S LIQUOR II,"2515 CHAMBERLAIN\nAMES 50010\n(42.02146, -93.6...",85.0,Story,1081900.0,MISC. AMERICAN CORDIALS & LIQUEURS,322.0,Prestige Wine and Spirits Group,75210,Kinky Liqueur,6,750,$10.00,$15.00,12,$180.00,2.38
110,S06855800032,07/31/2019,2522,Hy-Vee Wine and Spirits / Spirit Lak,HWY 9 &amp; 71\nSPIRIT LAKE 51360\n,30.0,Dickinson,1062200.0,PUERTO RICO & VIRGIN ISLANDS RUM,35.0,"Bacardi U.S.A., Inc.",43125,Bacardi Superior Rum Pet,12,750,$7.53,$11.30,12,$135.60,2.38
113,S08372100023,10/16/2019,3816,Swils,"200 E OAK ST\nRED OAK 51566\n(41.016691, -95.2...",69.0,Montgomery,1031200.0,VODKA FLAVORED,380.0,Phillips Beverage Company,41693,Uv Blue (raspberry) Vodka,12,750,$6.25,$9.49,12,$113.88,2.38


In [3]:
# Primero una copia del df por si acaso
df = df_raw
print(f"El dataset esta formado por {df.shape[0]} filas y {df.shape[1]} columnas")

El dataset esta formado por 2073616 filas y 20 columnas


In [4]:
# Convertimos el nombre de las col en snake_case
df.columns = df.columns.str.replace(" ","_").str.lower().str.replace("_(", "(")
df.sample(3)


Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons)
6194703,S04702800075,03/22/2019,2633,Hy-Vee #3 / BDI / Des Moines,"3221 SE 14TH ST\nDES MOINES 50320\n(41.554101,...",77.0,Polk,1031080.0,VODKA 80 PROOF,461.0,Campari(skyy),37987,Skyy Vodka,12,1000,$12.34,$18.52,12,$222.24,3.17
2137096,S07110200093,08/14/2019,2959,Dahl's / Merle Hay,4343 MERLE HAY ROAD\nDES MOINES 50310\n(41.637...,77.0,Polk,1081330.0,PEACH SCHNAPPS,65.0,Jim Beam Brands,82846,Dekuyper Luscious Peachtree Schnapps,12,750,$6.30,$9.45,3,$28.35,0.59
2151061,S08836600001,11/08/2019,4176,Todds On The Go,235 EDGEWOOD RD NE\nCEDAR RAPIDS 52405\n,57.0,Linn,1031080.0,VODKA 80 PROOF,297.0,Laird And Company,35918,Five O'clock Vodka,6,1750,$7.20,$10.79,12,$129.48,5.55


In [5]:
# Cambiamos el nombre a la columna category 
df.rename(columns={"category":"category_number"}, inplace=True)

In [6]:
df.columns

Index(['invoice/item_number', 'date', 'store_number', 'store_name',
       'store_location', 'county_number', 'county', 'category_number',
       'category_name', 'vendor_number', 'vendor_name', 'item_number',
       'item_description', 'pack', 'bottle_volume(ml)', 'state_bottle_cost',
       'state_bottle_retail', 'bottles_sold', 'sale(dollars)',
       'volume_sold(gallons)'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2073616 entries, 102 to 8197504
Data columns (total 20 columns):
 #   Column                Dtype  
---  ------                -----  
 0   invoice/item_number   object 
 1   date                  object 
 2   store_number          int64  
 3   store_name            object 
 4   store_location        object 
 5   county_number         float64
 6   county                object 
 7   category_number       float64
 8   category_name         object 
 9   vendor_number         float64
 10  vendor_name           object 
 11  item_number           int64  
 12  item_description      object 
 13  pack                  int64  
 14  bottle_volume(ml)     int64  
 15  state_bottle_cost     object 
 16  state_bottle_retail   object 
 17  bottles_sold          int64  
 18  sale(dollars)         object 
 19  volume_sold(gallons)  float64
dtypes: float64(4), int64(5), object(11)
memory usage: 332.2+ MB


## Transformación de datos

### Columna de fecha

In [8]:
# Convertimos a formato datetime
df["date"] = pd.to_datetime(df["date"],format = "%m/%d/%Y" )                                   

### Columnas float a integers: las relacionadas con el numero

In [9]:
df["county_number"].unique()

array([39., 53., 85., 30., 69., 57., 82., 96., 77., 31., 78., 52.,  9.,
       64., 49., 29., 34., 48., 63., 75., 97., 22., 46., 91.,  7., 70.,
       94., 10., 11., 56., 17., 71., 23., 50., 14., 67., 37., 86., 88.,
       40., 59., 51., 33., nan, 38., 72., 16., 21., 42., 35.,  3., 47.,
       90., 24.,  4., 95.,  6., 55., 20., 99., 32.,  5., 81., 45., 62.,
       25., 83., 60., 54., 41., 18., 84., 92., 28., 93.,  1., 44., 13.,
       58., 66.,  8., 15., 76., 79., 74., 19., 43., 12.,  2., 61., 73.,
       80., 65., 68., 87., 27., 36., 98., 89., 26.])

In [10]:
df["category_number"].unique()

array([1062300., 1081300., 1081900., 1062200., 1031200., 1012100.,
       1042100., 1701100., 1011100., 1041100., 1062310., 1082900.,
       1032200., 1011200., 1032080., 1071100., 1031080., 1031100.,
       1011300., 1052010., 1081600., 1012200., 1101100., 1081200.,
       1051110., 1051100., 1011500., 1022100., 1062050., 1051010.,
       1081370., 1051150., 1081400., 1081365., 1081340., 1081390.,
       1081330., 1081240., 1051120., 1081315., 1012210., 1081015.,
       1081305., 1081317., 1081312., 1081350., 1081335., 1062100.,
       1041150., 1081030., 1081220., 1012300., 1081380., 1081500.,
       1081010., 1081355., 1081700., 1031110., 1011250., 1031090.,
       1081230., 1041200., 1081250., 1051140., 1011400., 1022200.,
       1081210.,      nan, 1081020., 1082010., 1062250., 1032100.,
       1032230., 1082015., 1501100., 1082390.])

In [11]:
df["vendor_number"].unique()

array([ 65., 434., 322.,  35., 380., 260., 370., 192.,  55.,  85., 297.,
       389., 115., 451., 305., 395., 255., 461., 330., 420., 259., 410.,
       421., 885., 300., 205., 308., 240., 306., 357., 971., 294., 130.,
       390., 969., 301., 125., 277., 375., 267., 384., 418., 394., 365.,
       190., 285., 184., 338., 350., 492., 391., 198., 469.,  86., 160.,
       460., 962., 377., 978., 483., 239., 977., 250., 497., 495., 346.,
       399.,  89.,  10., 459., 295., 293., 154., 298., 158.,  91., 465.,
       415., 503.,  79., 369., 363., 987., 143., 307., 185., 224., 109.,
       211., 310., 379., 477.,  90., 108., 313.,  61., 278., 501., 251.,
        80., 311., 482., 456., 137., 361., 475., 287., 488.,  27., 485.,
       803.,  68., 432., 263., 155.,  69., 343., 161., 486.,  51.])

### Despues de comprobar los valores unicos, vemos que no tiene sentido que los valores sean float, deberían ser int. Pasamos a transformarlos en la columna que podemos porque no tiene nulos

In [12]:
# df["county_number"] = df["county_number"].astype(int) ---no funciona porque hay valores Nan
#df["county_number"] = df["county_number"].apply(lambda x: int(x) if pd.notnull(x) else np.nan)--- tampoco funciona

In [13]:
df["vendor_number"] = df["vendor_number"].astype(int)

In [14]:
df["vendor_number"].unique()

array([ 65, 434, 322,  35, 380, 260, 370, 192,  55,  85, 297, 389, 115,
       451, 305, 395, 255, 461, 330, 420, 259, 410, 421, 885, 300, 205,
       308, 240, 306, 357, 971, 294, 130, 390, 969, 301, 125, 277, 375,
       267, 384, 418, 394, 365, 190, 285, 184, 338, 350, 492, 391, 198,
       469,  86, 160, 460, 962, 377, 978, 483, 239, 977, 250, 497, 495,
       346, 399,  89,  10, 459, 295, 293, 154, 298, 158,  91, 465, 415,
       503,  79, 369, 363, 987, 143, 307, 185, 224, 109, 211, 310, 379,
       477,  90, 108, 313,  61, 278, 501, 251,  80, 311, 482, 456, 137,
       361, 475, 287, 488,  27, 485, 803,  68, 432, 263, 155,  69, 343,
       161, 486,  51])

### Transformar los valores de las columnas de precio :
- `state_bottle_cost`
- `state_bottle_retail`
- `sale(dollars)`

In [15]:
precio_limpiar = df["state_bottle_cost"].iloc[0]
precio_limpiar

'$6.82'

In [16]:
#df["state_bottle_cost"]=df["state_bottle_cost"].str.replace("$", "").astype(float)

In [17]:
def dollars_float(dataframe,col):
     dataframe[col] = dataframe[col].str.replace("$", "").astype(float)
     return dataframe

In [18]:
lista_col = ["state_bottle_cost", "state_bottle_retail", "sale(dollars)"]
for col in lista_col:
     dollars_float(df,col)

### Descomposición de la columna `store_location`

In [19]:
valor_store_location = df["store_location"].iloc[1]
valor_store_location

'402 EAST MAIN\nANAMOSA 52205\n(42.108289, -91.281881)'

In [20]:
valor = df["store_location"].iloc[1]
valor

'402 EAST MAIN\nANAMOSA 52205\n(42.108289, -91.281881)'

In [21]:
# Creamos una lista con cada valor
lista_valores = valor.split("\n")
lista_valores

['402 EAST MAIN', 'ANAMOSA 52205', '(42.108289, -91.281881)']

In [22]:
addres = lista_valores[0]
addres

'402 EAST MAIN'

In [23]:
city = " ".join(lista_valores[1].split()[:-1]) 
city

'ANAMOSA'

In [24]:
zip_code =lista_valores[1].split()[-1]
zip_code

'52205'

In [25]:
latitud_longitud =lista_valores[-1]
latitud_longitud

'(42.108289, -91.281881)'

In [26]:
latitude, longitude = ast.literal_eval(lista_valores[2])

In [27]:
latitude

42.108289

In [28]:
longitude

-91.281881

In [29]:
# lista_valores = valor.split("\n")
# lista_valores = valor.split("\n")
# addres = lista_valores[0]
# city = " ".join(lista_valores[1].split()[:-1]) 
# zip_code =lista_valores[1].split()[-1]
# latitud_longitud =lista_valores[-1]
# latitude, longitude = ast.literal_eval(lista_valores[2])


In [30]:
# Ahora creamos la funcion:

def extract_location(valor): # valor es el valor de una columna del dataframe
     try:
          split_data = valor.split("\n")
          address = split_data[0] if len(split_data) > 0 else np.nan
          city = " ".join(split_data[1].split()[:-1]) if len(split_data) > 1 else np.nan
          zip_code =split_data[1].split()[-1] if len(split_data) > 1 else np.nan
          if len (split_data) > 2:
               try:
                    latitude, longitude = ast.literal_eval(split_data[2])
               except(SyntaxError, ValueError, TypeError):
                    latitude, longitude = np.nan, np.nan
          else:
               latitude, longitude = np.nan, np.nan
          return pd.Series([address,
                         city,
                         zip_code,
                         latitude,
                         longitude])
     except(IndexError, ValueError):
          return pd.Series([np.nan,
                           np.nan,
                           np.nan,
                           np.nan,
                           np.nan])
          
          

In [31]:
df[["address",
     "city",
     "zip_code",
     "latitude",
     "longitude"]] = df["store_location"].apply(extract_location)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2073616 entries, 102 to 8197504
Data columns (total 26 columns):
 #   Column                Dtype         
---  ------                -----         
 0   invoice/item_number   object        
 1   date                  datetime64[ns]
 2   store_number          int64         
 3   store_name            object        
 4   store_location        object        
 5   county_number         float64       
 6   county                object        
 7   category_number       float64       
 8   category_name         object        
 9   vendor_number         int64         
 10  vendor_name           object        
 11  item_number           int64         
 12  item_description      object        
 13  pack                  int64         
 14  bottle_volume(ml)     int64         
 15  state_bottle_cost     float64       
 16  state_bottle_retail   float64       
 17  bottles_sold          int64         
 18  sale(dollars)         float64       
 19  vol

## Limpieza de store_name

In [42]:
nombre_tienda = df["store_name"].iloc[3]
nombre_tienda

'Hy-Vee Wine and Spirits'

In [43]:
nombre_limpio = nombre_tienda.split("/")[0].strip()
nombre_limpio

'Hy-Vee Wine and Spirits'

In [44]:
def clean_store_name(valor):
     if "/" in valor:
          return valor.split("/")[0].strip()
     return valor

In [45]:
df["store_name"] = df["store_name"].apply(clean_store_name)
df.head()

Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category_number,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons),address,city,zip_code,latitude,longitude,volume_sold(ml)
102,S08792800017,2019-11-06,2459,Reinhart Foods,200 STATE PO BOX 98\nGUTHRIE CENTER 50115\n,39.0,Guthrie,1062300.0,FLAVORED RUM,65,Jim Beam Brands,44499,Cruzan Mango Rum,12,750,6.82,10.24,3,30.72,0.59,200 STATE PO BOX 98,GUTHRIE CENTER,50115,,,2233.3919
103,S04854100048,2019-04-03,2487,Anamosa Family Foods,"402 EAST MAIN\nANAMOSA 52205\n(42.108289, -91....",53.0,Jones,1081300.0,PEPPERMINT SCHNAPPS,434,Luxco-St Louis,80578,Arrow Peppermint Schnapps,6,1750,7.04,10.56,3,31.68,1.39,402 EAST MAIN,ANAMOSA,52205,42.108289,-91.281881,5261.7199
106,S08900500035,2019-11-13,4509,A J'S LIQUOR II,"2515 CHAMBERLAIN\nAMES 50010\n(42.02146, -93.6...",85.0,Story,1081900.0,MISC. AMERICAN CORDIALS & LIQUEURS,322,Prestige Wine and Spirits Group,75210,Kinky Liqueur,6,750,10.0,15.0,12,180.0,2.38,2515 CHAMBERLAIN,AMES,50010,42.02146,-93.650965,9009.2758
110,S06855800032,2019-07-31,2522,Hy-Vee Wine and Spirits,HWY 9 &amp; 71\nSPIRIT LAKE 51360\n,30.0,Dickinson,1062200.0,PUERTO RICO & VIRGIN ISLANDS RUM,35,"Bacardi U.S.A., Inc.",43125,Bacardi Superior Rum Pet,12,750,7.53,11.3,12,135.6,2.38,HWY 9 &amp; 71,SPIRIT LAKE,51360,,,9009.2758
113,S08372100023,2019-10-16,3816,Swils,"200 E OAK ST\nRED OAK 51566\n(41.016691, -95.2...",69.0,Montgomery,1031200.0,VODKA FLAVORED,380,Phillips Beverage Company,41693,Uv Blue (raspberry) Vodka,12,750,6.25,9.49,12,113.88,2.38,200 E OAK ST,RED OAK,51566,41.016691,-95.230032,9009.2758


## Normalizar las columnas de volumen a mililitros


In [46]:
df["volume_sold(ml)"] = df["volume_sold(gallons)"] * 3785.41

In [47]:
df.head()

Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category_number,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons),address,city,zip_code,latitude,longitude,volume_sold(ml)
102,S08792800017,2019-11-06,2459,Reinhart Foods,200 STATE PO BOX 98\nGUTHRIE CENTER 50115\n,39.0,Guthrie,1062300.0,FLAVORED RUM,65,Jim Beam Brands,44499,Cruzan Mango Rum,12,750,6.82,10.24,3,30.72,0.59,200 STATE PO BOX 98,GUTHRIE CENTER,50115,,,2233.3919
103,S04854100048,2019-04-03,2487,Anamosa Family Foods,"402 EAST MAIN\nANAMOSA 52205\n(42.108289, -91....",53.0,Jones,1081300.0,PEPPERMINT SCHNAPPS,434,Luxco-St Louis,80578,Arrow Peppermint Schnapps,6,1750,7.04,10.56,3,31.68,1.39,402 EAST MAIN,ANAMOSA,52205,42.108289,-91.281881,5261.7199
106,S08900500035,2019-11-13,4509,A J'S LIQUOR II,"2515 CHAMBERLAIN\nAMES 50010\n(42.02146, -93.6...",85.0,Story,1081900.0,MISC. AMERICAN CORDIALS & LIQUEURS,322,Prestige Wine and Spirits Group,75210,Kinky Liqueur,6,750,10.0,15.0,12,180.0,2.38,2515 CHAMBERLAIN,AMES,50010,42.02146,-93.650965,9009.2758
110,S06855800032,2019-07-31,2522,Hy-Vee Wine and Spirits,HWY 9 &amp; 71\nSPIRIT LAKE 51360\n,30.0,Dickinson,1062200.0,PUERTO RICO & VIRGIN ISLANDS RUM,35,"Bacardi U.S.A., Inc.",43125,Bacardi Superior Rum Pet,12,750,7.53,11.3,12,135.6,2.38,HWY 9 &amp; 71,SPIRIT LAKE,51360,,,9009.2758
113,S08372100023,2019-10-16,3816,Swils,"200 E OAK ST\nRED OAK 51566\n(41.016691, -95.2...",69.0,Montgomery,1031200.0,VODKA FLAVORED,380,Phillips Beverage Company,41693,Uv Blue (raspberry) Vodka,12,750,6.25,9.49,12,113.88,2.38,200 E OAK ST,RED OAK,51566,41.016691,-95.230032,9009.2758


## Análisis de nulos

In [48]:
def null_values(dataframe):
     count_null = dataframe.isnull().sum()
     percen_null = (count_null / dataframe.shape[0])
     df_null = pd.DataFrame({
          "Null Values": count_null,
          "Null percen": percen_null  
     })
     return df_null

In [49]:
df_null = null_values(df)
df_null

Unnamed: 0,Null Values,Null percen
invoice/item_number,0,0.0
date,0,0.0
store_number,0,0.0
store_name,0,0.0
store_location,0,0.0
county_number,3599,0.001736
county,3599,0.001736
category_number,284,0.000137
category_name,818,0.000394
vendor_number,0,0.0


## ¿ Cómo tratamos los nulos ?


## `county_number` y `county` 

In [54]:
df_nulos_county = df[df["county"].isna()].copy()
df_nulos_county


Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category_number,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons),address,city,zip_code,latitude,longitude,volume_sold(ml)
692,S09377500017,2019-12-10,3782,Bender Foods,"619 S HWY 52\nGUTTENBERG 52052\n(42.77964, -91...",,,1081240.0,GREEN CREME DE MENTHE,434,Luxco-St Louis,79026,Arrow Creme De Menthe Green,12,750,4.50,6.75,3,20.25,0.59,619 S HWY 52,GUTTENBERG,52052,42.779640,-91.100489,2233.3919
7072,S07061900003,2019-08-13,3782,Bender Foods,"619 S HWY 52\nGUTTENBERG 52052\n(42.77964, -91...",,,1011200.0,STRAIGHT BOURBON WHISKIES,65,Jim Beam Brands,20248,Old Crow,6,1750,8.92,13.38,6,80.28,2.77,619 S HWY 52,GUTTENBERG,52052,42.779640,-91.100489,10485.5857
7129,S03833700020,2019-01-31,4247,Fareway Stores #879,"115 SECOND AVE SE\nBELMOND 50421\n(42.845719, ...",,,1082900.0,MISC. IMPORTED CORDIALS & LIQUEURS,192,Sidney Frank Importing Co.,65258,Jagermeister Liqueur,6,1750,25.69,38.53,1,38.53,0.46,115 SECOND AVE SE,BELMOND,50421,42.845719,-93.614324,1741.2886
7622,S05836300021,2019-06-04,4247,Fareway Stores #879,"115 SECOND AVE SE\nBELMOND 50421\n(42.845719, ...",,,1012100.0,CANADIAN WHISKIES,971,"Hood River Distillers, Inc.",14192,Pendleton Canadian Whisky,12,750,13.10,19.65,1,19.65,0.20,115 SECOND AVE SE,BELMOND,50421,42.845719,-93.614324,757.0820
13146,S09257900003,2019-12-03,3782,Bender Foods,"619 S HWY 52\nGUTTENBERG 52052\n(42.77964, -91...",,,1012100.0,CANADIAN WHISKIES,115,"Constellation Wine Company, Inc.",11776,Black Velvet,12,750,5.23,7.84,12,94.08,2.38,619 S HWY 52,GUTTENBERG,52052,42.779640,-91.100489,9009.2758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8190238,S09369700018,2019-12-10,4247,Fareway Stores #879,"115 SECOND AVE SE\nBELMOND 50421\n(42.845719, ...",,,1012200.0,SCOTCH WHISKIES,35,"Bacardi U.S.A., Inc.",4866,Dewars White Label Scotch,12,750,12.73,19.59,2,39.18,0.40,115 SECOND AVE SE,BELMOND,50421,42.845719,-93.614324,1514.1640
8190243,S06692900022,2019-07-23,4247,Fareway Stores #879,"115 SECOND AVE SE\nBELMOND 50421\n(42.845719, ...",,,1031200.0,VODKA FLAVORED,380,Phillips Beverage Company,41693,Uv Blue (raspberry) Vodka,12,750,6.50,9.74,3,29.22,0.59,115 SECOND AVE SE,BELMOND,50421,42.845719,-93.614324,2233.3919
8191172,S03712400018,2019-01-24,4247,Fareway Stores #879,"115 SECOND AVE SE\nBELMOND 50421\n(42.845719, ...",,,1071100.0,AMERICAN COCKTAILS,260,Diageo Americas,63755,Tgi Fridays Mudslide,6,1750,7.94,11.91,3,35.73,1.39,115 SECOND AVE SE,BELMOND,50421,42.845719,-93.614324,5261.7199
8193883,S08014400026,2019-10-01,3782,Bender Foods,"619 S HWY 52\nGUTTENBERG 52052\n(42.77964, -91...",,,1062200.0,PUERTO RICO & VIRGIN ISLANDS RUM,35,"Bacardi U.S.A., Inc.",43126,Bacardi Superior Rum,12,750,7.53,11.30,3,33.90,0.59,619 S HWY 52,GUTTENBERG,52052,42.779640,-91.100489,2233.3919


In [55]:
df_nulos_county["city"].unique()

array(['GUTTENBERG', 'BELMOND'], dtype=object)

Vemos que en los valores nulos de county y county number solo nos faltan los valores de dos ciudades concretas:
- 'Guttenberg'
- 'Belmond'
Haciendo una búsqueda rápida en internet vemos que pertenecen a los estados de Clyton y Wright respectivamente

In [57]:
df["county"].unique()

array(['Guthrie', 'Jones', 'Story', 'Dickinson', 'Montgomery', 'Linn',
       'Scott', 'Winneshiek', 'Polk', 'Dubuque', 'Pottawattamie',
       'Johnson', 'Bremer', 'Marshall', 'Jackson', 'Des Moines', 'Floyd',
       'Iowa', 'Marion', 'Plymouth', 'Woodbury', 'Clayton', 'Humboldt',
       'Warren', 'Black Hawk', 'Muscatine', 'Webster', 'Buchanan',
       'Buena Vista', 'Lee', 'Cerro Gordo', "O'Brien", 'Clinton',
       'Jasper', 'Carroll', 'Monona', 'Greene', 'Tama', 'Union',
       'Hamilton', 'Lucas', 'Jefferson', 'Fayette', None, 'Grundy',
       'Osceola', 'Cedar', 'Clay', 'Hardin', 'Franklin', 'Allamakee',
       'Ida', 'Wapello', 'Crawford', 'Appanoose', 'Winnebago', 'Benton',
       'Kossuth', 'Clarke', 'Wright', 'Emmet', 'Audubon', 'Sac', 'Howard',
       'Mahaska', 'Dallas', 'Shelby', 'Lyon', 'Keokuk', 'Hancock',
       'Cherokee', 'Sioux', 'Washington', 'Delaware', 'Wayne', 'Adair',
       'Henry', 'Calhoun', 'Louisa', 'Mitchell', 'Boone', 'Cass',
       'Pocahontas', 'Powesh

Vemos que tenemos los datos de esos dos condados para otras ciudades, es decir , conocemos el nombre y el número, rellenaremos los nulos con esos valores

In [60]:
df[df["county"] == "Clayton"]

Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category_number,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons),address,city,zip_code,latitude,longitude,volume_sold(ml)
221,S03688600022,2019-01-23,3679,FRANKLIN STREET FLORAL & GIFT,"103 FRANKLIN ST\nMONONA 52159\n(43.055133, -91...",22.0,Clayton,1012200.0,SCOTCH WHISKIES,260,Diageo Americas,5347,Johnnie Walker Red Label Scotch,12,1000,16.99,25.49,1,25.49,0.26,103 FRANKLIN ST,MONONA,52159,43.055133,-91.390767,984.2066
3887,S07362800038,2019-08-28,2656,Hy-Vee Food Store,"300 10TH ST\nCORNING 51632\n(40.991951, -94.74...",22.0,Clayton,1031200.0,VODKA FLAVORED,380,Phillips Beverage Company,42079,UV Cake Vodka,12,1000,7.50,11.25,1,11.25,0.26,300 10TH ST,CORNING,51632,40.991951,-94.743093,984.2066
5496,S05623500009,2019-05-21,4672,The Pit Stop,"402 S HWY 52\nGUTTENBERG 52052\n(42.782424, -9...",22.0,Clayton,1011300.0,TENNESSEE WHISKIES,85,Brown-Forman Corporation,26827,Jack Daniels Old #7 Black Lbl,12,1000,16.92,25.37,6,152.22,1.59,402 S HWY 52,GUTTENBERG,52052,42.782424,-91.100471,6018.8019
5658,S05734600038,2019-05-25,3679,FRANKLIN STREET FLORAL & GIFT,"103 FRANKLIN ST\nMONONA 52159\n(43.055133, -91...",22.0,Clayton,1042100.0,IMPORTED DRY GINS,260,Diageo Americas,28867,Tanqueray Gin,12,1000,14.99,22.48,3,67.44,0.79,103 FRANKLIN ST,MONONA,52159,43.055133,-91.390767,2990.4739
5747,S04760700026,2019-03-27,3957,Karl's Grocery Store,"101 WEST MADISON\nEDGEWOOD 52042\n(42.64539, -...",22.0,Clayton,1031080.0,VODKA 80 PROOF,259,Heaven Hill Brands,35416,Burnett's Vodka 80 Prf,12,750,4.55,6.82,2,13.64,0.40,101 WEST MADISON,EDGEWOOD,52042,42.645390,-91.401173,1514.1640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195664,S09407600038,2019-12-11,2656,Hy-Vee Food Store,"300 10TH ST\nCORNING 51632\n(40.991951, -94.74...",22.0,Clayton,1012100.0,CANADIAN WHISKIES,65,Jim Beam Brands,10627,Canadian Club Whisky,12,1000,9.71,14.56,1,14.56,0.26,300 10TH ST,CORNING,51632,40.991951,-94.743093,984.2066
8196741,S08468300031,2019-10-22,4033,"Mcgregor's Top Shelf, Llc","221 MAIN ST\nMCGREGOR 52157\n(43.023979, -91.1...",22.0,Clayton,1051010.0,AMERICAN GRAPE BRANDIES,205,E AND J GALLO WINERY,52591,E & J Vs Brandy Mini,12,500,5.91,8.87,2,17.74,0.26,221 MAIN ST,MCGREGOR,52157,43.023979,-91.176172,984.2066
8196763,S06711500030,2019-07-23,3679,FRANKLIN STREET FLORAL & GIFT,"103 FRANKLIN ST\nMONONA 52159\n(43.055133, -91...",22.0,Clayton,1011200.0,STRAIGHT BOURBON WHISKIES,461,Campari(skyy),22158,Wild Turkey 101,6,1750,23.96,35.95,1,35.95,0.46,103 FRANKLIN ST,MONONA,52159,43.055133,-91.390767,1741.2886
8196961,S08013900013,2019-10-01,4033,"Mcgregor's Top Shelf, Llc","221 MAIN ST\nMCGREGOR 52157\n(43.023979, -91.1...",22.0,Clayton,1031080.0,VODKA 80 PROOF,260,Diageo Americas,37998,Smirnoff Vodka 80 Prf,6,1750,13.50,20.74,2,41.48,0.92,221 MAIN ST,MCGREGOR,52157,43.023979,-91.176172,3482.5772


## `latitude` y `longitude`

In [61]:
df_nulos_latitud = df[df["latitude"].isna()].copy()
df_nulos_latitud

Unnamed: 0,invoice/item_number,date,store_number,store_name,store_location,county_number,county,category_number,category_name,vendor_number,vendor_name,item_number,item_description,pack,bottle_volume(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale(dollars),volume_sold(gallons),address,city,zip_code,latitude,longitude,volume_sold(ml)
102,S08792800017,2019-11-06,2459,Reinhart Foods,200 STATE PO BOX 98\nGUTHRIE CENTER 50115\n,39.0,Guthrie,1062300.0,FLAVORED RUM,65,Jim Beam Brands,44499,Cruzan Mango Rum,12,750,6.82,10.24,3,30.72,0.59,200 STATE PO BOX 98,GUTHRIE CENTER,50115,,,2233.3919
110,S06855800032,2019-07-31,2522,Hy-Vee Wine and Spirits,HWY 9 &amp; 71\nSPIRIT LAKE 51360\n,30.0,Dickinson,1062200.0,PUERTO RICO & VIRGIN ISLANDS RUM,35,"Bacardi U.S.A., Inc.",43125,Bacardi Superior Rum Pet,12,750,7.53,11.30,12,135.60,2.38,HWY 9 &amp; 71,SPIRIT LAKE,51360,,,9009.2758
185,S05233900074,2019-04-25,3417,Big G Food Store,PO BOX 261 310 W DILLON\nMARENGO 52301\n,48.0,Iowa,1041100.0,AMERICAN DRY GINS,434,Luxco-St Louis,31666,Paramount Gin Traveler,12,750,3.39,5.09,2,10.18,0.40,PO BOX 261 310 W DILLON,MARENGO,52301,,,1514.1640
199,S05796100019,2019-05-30,3660,Wal-Mart 2935,814 W BELL AVE\nKNOXVILLE 50138\n,63.0,Marion,1101100.0,AMERICAN ALCOHOL,434,Luxco-St Louis,41846,Everclear Alcohol,12,750,8.25,12.38,6,74.28,1.19,814 W BELL AVE,KNOXVILLE,50138,,,4504.6379
251,S06245900192,2019-06-26,2524,Hy-Vee Food Store,3500 DODGE ST\nDUBUQUE 52001\n,31.0,Dubuque,1032080.0,IMPORTED VODKA,395,Proximo,34935,Three Olives Vodka,12,750,10.16,15.74,3,47.22,0.59,3500 DODGE ST,DUBUQUE,52001,,,2233.3919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8197287,S05926100025,2019-06-06,3615,Emmetsburg Food Pride,1307 S BROADWAY\nEMMETSBURG 50536\n,74.0,Palo Alto,1062300.0,FLAVORED RUM,370,Pernod Ricard USA/Austin Nichols,42676,Malibu Passion Fruit Rum,12,750,6.90,10.35,3,31.05,0.59,1307 S BROADWAY,EMMETSBURG,50536,,,2233.3919
8197325,S06781800074,2019-07-25,2536,HY-VEE,4815 MAPLE DRIVE\nPLEASANT HILL 50317\n,77.0,Polk,1052010.0,IMPORTED GRAPE BRANDIES,420,"Moet Hennessy USA, Inc.",48105,Hennessy VS,12,375,8.75,13.13,1,13.13,0.10,4815 MAPLE DRIVE,PLEASANT HILL,50317,,,378.5410
8197335,S08660200023,2019-10-30,2607,Hy-Vee Wine and Spirits,520 SO FREMONT\nSHENANDOAH 51601\n,73.0,Page,1011100.0,BLENDED WHISKIES,65,Jim Beam Brands,24458,Kessler Blend Whiskey,6,1750,10.01,15.52,6,93.12,2.77,520 SO FREMONT,SHENANDOAH,51601,,,10485.5857
8197347,S07048400007,2019-08-13,4520,Casey's General Store #2842,902 N HWY 69\nHUXLEY 50124\n,85.0,Story,1011100.0,BLENDED WHISKIES,260,Diageo Americas,25606,Seagrams 7 Crown Bl Whiskey,12,750,6.53,9.79,12,117.48,2.38,902 N HWY 69,HUXLEY,50124,,,9009.2758



- Vemos que no llegamos a ningún acercamiento plausible para rellenar esos valores de latitud y longitud de una  manera rápida y eficiente
- La forma que se nos ocurre requeriría un esfuerzo y  una inversión de tiempo adicional

In [40]:
df.to_parquet("../data/data_transform/dataset_cleaned.parquet")