# Tecnicas de Limpieza y Transformacion de datos

## Polars

In [1]:
import polars as pl
import os

path_data = os.path.join(os.getcwd(), 'data')
path_file = lambda x: os.path.join(path_data, x)

In [2]:
sales_data = pl.read_csv(path_file('sales.csv'))
sales_data.head(3)

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
i64,str,f64,i64,f64,f64,f64,f64
1,"""05-02-2010""",1643690.9,0,42.31,2.572,211.096358,8.106
1,"""12-02-2010""",1642000.0,1,38.51,2.548,211.24217,8.106
1,"""19-02-2010""",1612000.0,0,39.93,2.514,211.289143,8.106


In [3]:
sales_data.schema

Schema([('Store', Int64),
        ('Date', String),
        ('Weekly_Sales', Float64),
        ('Holiday_Flag', Int64),
        ('Temperature', Float64),
        ('Fuel_Price', Float64),
        ('CPI', Float64),
        ('Unemployment', Float64)])

In [5]:
sales_data = pl.read_csv(path_file('sales.csv'), 
                        schema=pl.Schema({
                            'Store': pl.Int64(),
                            'Date': pl.Date(),
                            'Weekly_Sales': pl.Float64(),
                            'Holiday_Flag': pl.Int64(),
                            'Temperature': pl.Float64(),
                            'Fuel_Price': pl.Float64(),
                            'CPI': pl.Float64(),
                            'Unemployment': pl.Float64(),
                        }
                        ))
sales_data.schema

Schema([('Store', Int64),
        ('Date', Date),
        ('Weekly_Sales', Float64),
        ('Holiday_Flag', Int64),
        ('Temperature', Float64),
        ('Fuel_Price', Float64),
        ('CPI', Float64),
        ('Unemployment', Float64)])

In [6]:
sales_data.head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
i64,date,f64,i64,f64,f64,f64,f64
1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106
1,2010-02-12,1642000.0,1,38.51,2.548,211.24217,8.106
1,2010-02-19,1612000.0,0,39.93,2.514,211.289143,8.106
1,2010-02-26,1409700.0,0,46.63,2.561,211.319643,8.106
1,2010-03-05,1554800.0,0,46.5,2.625,211.350143,8.106


#### Manejo de Fechas

In [7]:
sales_data.filter(pl.col('Date') == '2010-02-05').head()

InvalidOperationError: cannot compare 'date/datetime/time' to a string value (create native python { 'date', 'datetime', 'time' } or compare to a temporal column)

In [8]:
from datetime import date
sales_data.filter(pl.col('Date') == date(2010,2,5)).head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
i64,date,f64,i64,f64,f64,f64,f64
1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106
2,2010-02-05,2137000.0,0,40.19,2.572,210.752605,8.324
3,2010-02-05,461622.22,0,45.71,2.572,214.424881,7.368
4,2010-02-05,2135100.0,0,43.76,2.598,126.442065,8.623
5,2010-02-05,317173.1,0,39.7,2.572,211.653972,6.566


In [9]:
sales_data = sales_data.with_columns(
    pl.col('Date').dt.strftime('%Y-%m').alias('Y-M')
)
sales_data.group_by('Y-M').agg(
    pl.sum('Weekly_Sales').alias('Total_Sales')
).head()

Y-M,Total_Sales
str,f64
"""2012-09""",180650000.0
"""2010-08""",187640000.0
"""2011-09""",220850000.0
"""2011-03""",179360000.0
"""2012-06""",240610000.0


In [11]:
pl.Config(set_fmt_float="full")
sales_data.group_by('Y-M').agg(
    pl.sum('Weekly_Sales').round(2).alias('Total_Sales')
).head()

Y-M,Total_Sales
str,f64
"""2010-04""",231412368.05
"""2011-05""",181648158.16
"""2010-08""",187640110.89
"""2012-02""",192063579.54
"""2010-11""",202853370.14


In [12]:
from datetime import timedelta
sales_data = sales_data.with_columns(
    (pl.col('Date') + timedelta(days=5)).alias('Date_p5')
)

In [13]:
sales_data.head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Y-M,Date_p5
i64,date,f64,i64,f64,f64,f64,f64,str,date
1,2010-02-05,1643690.9,0,42.31,2.572,211.0963582,8.106,"""2010-02""",2010-02-10
1,2010-02-12,1641957.44,1,38.51,2.548,211.2421698,8.106,"""2010-02""",2010-02-17
1,2010-02-19,1611968.17,0,39.93,2.514,211.2891429,8.106,"""2010-02""",2010-02-24
1,2010-02-26,1409727.59,0,46.63,2.561,211.3196429,8.106,"""2010-02""",2010-03-03
1,2010-03-05,1554806.68,0,46.5,2.625,211.3501429,8.106,"""2010-03""",2010-03-10


### Manejo de Strings

In [15]:
store = pl.read_csv(path_file('storedesc.csv'))
store.head(3)

store,city,country,address
i64,str,str,str
1,"""Honolulu""","""United States""","""09 Florence Way"""
2,"""Tulsa""","""United States""","""97 Hollow Ridge Hill"""
3,"""Tacoma""","""United States""","""1741 Center Court"""


In [47]:
store.with_columns(
    pl.col('address').str.splitn(' ', 2)
    .struct.rename_fields(['street_number', 'street_name'])
    .alias('address')
).unnest('address')

store,city,country,street_number,street_name
i64,str,str,str,str
1,"""Honolulu""","""United States""","""09""","""Florence Way"""
2,"""Tulsa""","""United States""","""97""","""Hollow Ridge Hill"""
3,"""Tacoma""","""United States""","""1741""","""Center Court"""
4,"""Madison""","""United States""","""3879""","""Valley Edge Way"""
5,"""Sacramento""","""United States""","""76""","""Ludington Way"""
…,…,…,…,…
46,"""Hampton""","""United States""","""78""","""Pine View Pass"""
47,"""Garden Grove""","""United States""","""1995""","""Elgar Crossing"""
48,"""Sacramento""","""United States""","""527""","""Bashford Terrace"""
49,"""Beaverton""","""United States""","""97205""","""Darwin Terrace"""


In [45]:
store.select(
    pl.col('address').str.replace(r'\d+', '').alias('street_name_only')
)

street_name_only
str
""" Florence Way"""
""" Hollow Ridge Hill"""
""" Center Court"""
""" Valley Edge Way"""
""" Ludington Way"""
…
""" Pine View Pass"""
""" Elgar Crossing"""
""" Bashford Terrace"""
""" Darwin Terrace"""
