# Comparando Dask vs Pandas en cuestión de tiempo de desempeño

### Dask DataFrame utiliza una parte de la API de pandas, por lo cual se compararán las siguientes herramientas de ambas librerías:

#### Librerías y lectura del dataset:

In [16]:
# Librerias

import pandas as pd
import dask as dk
from time import sleep
import urllib
import os
import dask
import dask.dataframe as dd
import dask.array as da
import dask.bag as dbs

In [35]:
# Reading our dataset with Dask

import tarfile
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded (it only works when some other library imported it first), so it is
# imported explicitly here.
import urllib.request

print("- Downloading NYC Flights dataset... ", end='', flush=True)
url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
filename, headers = urllib.request.urlretrieve(url, 'nycflights.tar.gz')
print("Done!", flush=True)

# Extract the .csv files from the tar file into ./data/.
# NOTE(review): the archive is downloaded content; on Python versions that
# support extraction filters, consider extractall('data/', filter='data').
with tarfile.open(filename, mode='r:gz') as flights:
    flights.extractall('data/')

# Describe every extracted CSV as one Dask DataFrame:
#   - columns 0, 1 and 2 are combined into a single parsed 'Date' column,
#   - 'TailNum' is forced to str,
#   - assume_missing=True is passed so columns without an explicit dtype are
#     read as floats (the recorded output shows e.g. DayOfWeek as 1.0).
dfd = dd.read_csv(os.path.join('data', 'nycflights', '*.csv'),
                  parse_dates={'Date': [0, 1, 2]},
                  dtype={'TailNum': str},
                  assume_missing=True)

# NOTE(review): dfd is lazy, so each later .compute() presumably re-reads and
# re-parses the CSV files; the Dask timings in this notebook would then include
# file I/O that the pandas timings (on the in-memory dfp) do not. Consider
# `dfd = dfd.persist()` for a like-for-like comparison — confirm.
dfd.compute()

- Downloading NYC Flights dataset... Done!


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
0,1990-01-01,1.0,1621.0,1540.0,1747.0,1701.0,US,33.0,,86.0,...,,46.0,41.0,EWR,PIT,319.0,,,0.0,0.0
1,1990-01-02,2.0,1547.0,1540.0,1700.0,1701.0,US,33.0,,73.0,...,,-1.0,7.0,EWR,PIT,319.0,,,0.0,0.0
2,1990-01-03,3.0,1546.0,1540.0,1710.0,1701.0,US,33.0,,84.0,...,,9.0,6.0,EWR,PIT,319.0,,,0.0,0.0
3,1990-01-04,4.0,1542.0,1540.0,1710.0,1701.0,US,33.0,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,0.0,0.0
4,1990-01-05,5.0,1549.0,1540.0,1706.0,1701.0,US,33.0,,77.0,...,,5.0,9.0,EWR,PIT,319.0,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269176,1999-12-27,1.0,1645.0,1645.0,1830.0,1901.0,UA,1753.0,N516UA,225.0,...,205.0,-31.0,0.0,LGA,DEN,1619.0,7.0,13.0,0.0,0.0
269177,1999-12-28,2.0,1726.0,1645.0,1928.0,1901.0,UA,1753.0,N504UA,242.0,...,214.0,27.0,41.0,LGA,DEN,1619.0,5.0,23.0,0.0,0.0
269178,1999-12-29,3.0,1646.0,1645.0,1846.0,1901.0,UA,1753.0,N592UA,240.0,...,220.0,-15.0,1.0,LGA,DEN,1619.0,5.0,15.0,0.0,0.0
269179,1999-12-30,4.0,1651.0,1645.0,1908.0,1901.0,UA,1753.0,N575UA,257.0,...,233.0,7.0,6.0,LGA,DEN,1619.0,5.0,19.0,0.0,0.0


In [25]:
# Dataset with Pandas

# compute() already materialises the Dask DataFrame as a pandas DataFrame, so
# wrapping the result in pd.DataFrame(...) again is unnecessary.
dfp = dfd.compute()


#### Operaciones trivialmente paralelizables (rápidas):

##### Operaciones elementales

In [26]:
%%time
# Pandas

# Distance covered per minute of elapsed time (element-wise division of two
# columns).
# NOTE(review): the original comment said kilometres; Distance in this dataset
# is presumably expressed in miles — confirm against the data source.
a = dfp.Distance / dfp.ActualElapsedTime
a

CPU times: total: 0 ns
Wall time: 13 ms


0         3.709302
1         4.369863
2         3.797619
3         3.625000
4         4.142857
            ...   
269176    7.195556
269177    6.690083
269178    6.745833
269179    6.299611
269180    6.502008
Length: 2611892, dtype: float64

In [27]:
%%time
# Dask

# Distance covered per minute of elapsed time (element-wise division of two
# columns); the expression is only evaluated by the .compute() call below.
# NOTE(review): the original comment said kilometres; Distance in this dataset
# is presumably expressed in miles — confirm against the data source.
a = dfd.Distance / dfd.ActualElapsedTime
a.compute()

CPU times: total: 2.84 s
Wall time: 3.92 s


0         3.709302
1         4.369863
2         3.797619
3         3.625000
4         4.142857
            ...   
269176    7.195556
269177    6.690083
269178    6.745833
269179    6.299611
269180    6.502008
Length: 2611892, dtype: float64

##### Selecciones por filas

In [29]:
%%time
# Pandas

# Boolean mask flagging the rows whose elapsed time is greater than 100,
# then keep only those rows.
is_long = dfp['ActualElapsedTime'] > 100
b = dfp[is_long]
b

CPU times: total: 250 ms
Wall time: 1.07 s


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
10,1990-01-11,4.0,1544.0,1540.0,1726.0,1701.0,US,33.0,,102.0,...,,25.0,4.0,EWR,PIT,319.0,,,0.0,0.0
11,1990-01-12,5.0,1556.0,1540.0,1816.0,1701.0,US,33.0,,140.0,...,,75.0,16.0,EWR,PIT,319.0,,,0.0,0.0
81,1990-01-25,4.0,1954.0,1850.0,2219.0,1948.0,US,42.0,,145.0,...,,151.0,64.0,EWR,SYR,194.0,,,0.0,0.0
91,1990-01-06,6.0,1126.0,1110.0,1322.0,1243.0,US,49.0,,116.0,...,,39.0,16.0,LGA,CLE,418.0,,,0.0,0.0
97,1990-01-12,5.0,1109.0,1110.0,1258.0,1243.0,US,49.0,,109.0,...,,15.0,-1.0,LGA,CLE,418.0,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269176,1999-12-27,1.0,1645.0,1645.0,1830.0,1901.0,UA,1753.0,N516UA,225.0,...,205.0,-31.0,0.0,LGA,DEN,1619.0,7.0,13.0,0.0,0.0
269177,1999-12-28,2.0,1726.0,1645.0,1928.0,1901.0,UA,1753.0,N504UA,242.0,...,214.0,27.0,41.0,LGA,DEN,1619.0,5.0,23.0,0.0,0.0
269178,1999-12-29,3.0,1646.0,1645.0,1846.0,1901.0,UA,1753.0,N592UA,240.0,...,220.0,-15.0,1.0,LGA,DEN,1619.0,5.0,15.0,0.0,0.0
269179,1999-12-30,4.0,1651.0,1645.0,1908.0,1901.0,UA,1753.0,N575UA,257.0,...,233.0,7.0,6.0,LGA,DEN,1619.0,5.0,19.0,0.0,0.0


In [31]:
%%time
# Dask

# Boolean mask flagging the rows whose elapsed time is greater than 100,
# then keep only those rows; evaluation happens in .compute().
is_long = dfd['ActualElapsedTime'] > 100
b = dfd[is_long]
b.compute()

CPU times: total: 4.06 s
Wall time: 5.97 s


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
10,1990-01-11,4.0,1544.0,1540.0,1726.0,1701.0,US,33.0,,102.0,...,,25.0,4.0,EWR,PIT,319.0,,,0.0,0.0
11,1990-01-12,5.0,1556.0,1540.0,1816.0,1701.0,US,33.0,,140.0,...,,75.0,16.0,EWR,PIT,319.0,,,0.0,0.0
81,1990-01-25,4.0,1954.0,1850.0,2219.0,1948.0,US,42.0,,145.0,...,,151.0,64.0,EWR,SYR,194.0,,,0.0,0.0
91,1990-01-06,6.0,1126.0,1110.0,1322.0,1243.0,US,49.0,,116.0,...,,39.0,16.0,LGA,CLE,418.0,,,0.0,0.0
97,1990-01-12,5.0,1109.0,1110.0,1258.0,1243.0,US,49.0,,109.0,...,,15.0,-1.0,LGA,CLE,418.0,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269176,1999-12-27,1.0,1645.0,1645.0,1830.0,1901.0,UA,1753.0,N516UA,225.0,...,205.0,-31.0,0.0,LGA,DEN,1619.0,7.0,13.0,0.0,0.0
269177,1999-12-28,2.0,1726.0,1645.0,1928.0,1901.0,UA,1753.0,N504UA,242.0,...,214.0,27.0,41.0,LGA,DEN,1619.0,5.0,23.0,0.0,0.0
269178,1999-12-29,3.0,1646.0,1645.0,1846.0,1901.0,UA,1753.0,N592UA,240.0,...,220.0,-15.0,1.0,LGA,DEN,1619.0,5.0,15.0,0.0,0.0
269179,1999-12-30,4.0,1651.0,1645.0,1908.0,1901.0,UA,1753.0,N575UA,257.0,...,233.0,7.0,6.0,LGA,DEN,1619.0,5.0,19.0,0.0,0.0


##### Loc

In [39]:
%%time
# Pandas

# Label-based lookup of index value 3. The recorded output contains several
# rows because the label 3 occurs more than once in the index (presumably one
# per source CSV file, each read with its own 0-based index — confirm).
c = dfp.loc[3]
c

CPU times: total: 46.9 ms
Wall time: 65.4 ms


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
3,1990-01-04,4.0,1542.0,1540.0,1710.0,1701.0,US,33.0,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,0.0,0.0
3,1991-01-11,5.0,1303.0,1215.0,1439.0,1336.0,US,121.0,,96.0,...,,63.0,48.0,EWR,PIT,319.0,,,0.0,0.0
3,1992-01-11,6.0,640.0,640.0,834.0,853.0,US,53.0,,114.0,...,,-19.0,0.0,EWR,IND,644.0,,,0.0,0.0
3,1993-01-03,7.0,1736.0,1729.0,1838.0,1831.0,US,70.0,,62.0,...,,7.0,7.0,LGA,SYR,198.0,,,0.0,0.0
3,1994-01-04,2.0,,1400.0,,1507.0,US,241.0,,,...,,,,EWR,BOS,200.0,,,1.0,0.0
3,1995-01-04,3.0,1952.0,1950.0,12.0,2337.0,CO,401.0,N14346,380.0,...,334.0,35.0,2.0,EWR,PHX,2133.0,7.0,39.0,0.0,0.0
3,1996-01-04,4.0,937.0,930.0,1329.0,1307.0,CO,1645.0,N14320,352.0,...,324.0,22.0,7.0,EWR,PHX,2133.0,5.0,23.0,0.0,0.0
3,1997-01-04,6.0,939.0,935.0,1257.0,1311.0,CO,1915.0,N16339,318.0,...,298.0,-14.0,4.0,EWR,PHX,2133.0,7.0,13.0,0.0,0.0
3,1998-01-04,7.0,754.0,755.0,1112.0,1125.0,CO,1915.0,N19638,318.0,...,297.0,-13.0,-1.0,EWR,PHX,2133.0,7.0,14.0,0.0,0.0
3,1999-01-04,1.0,1605.0,1515.0,1923.0,1849.0,CO,1923.0,N14337,318.0,...,292.0,34.0,50.0,EWR,PHX,2133.0,5.0,21.0,0.0,0.0


In [41]:
%%time
# Dask

# Label-based lookup of index value 3, evaluated by .compute(). The recorded
# output contains several rows because the label 3 occurs more than once in
# the index (presumably one per source CSV file — confirm).
c = dfd.loc[3]
c.compute()

CPU times: total: 2.69 s
Wall time: 3.89 s


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
3,1990-01-04,4.0,1542.0,1540.0,1710.0,1701.0,US,33.0,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,0.0,0.0
3,1991-01-11,5.0,1303.0,1215.0,1439.0,1336.0,US,121.0,,96.0,...,,63.0,48.0,EWR,PIT,319.0,,,0.0,0.0
3,1992-01-11,6.0,640.0,640.0,834.0,853.0,US,53.0,,114.0,...,,-19.0,0.0,EWR,IND,644.0,,,0.0,0.0
3,1993-01-03,7.0,1736.0,1729.0,1838.0,1831.0,US,70.0,,62.0,...,,7.0,7.0,LGA,SYR,198.0,,,0.0,0.0
3,1994-01-04,2.0,,1400.0,,1507.0,US,241.0,,,...,,,,EWR,BOS,200.0,,,1.0,0.0
3,1995-01-04,3.0,1952.0,1950.0,12.0,2337.0,CO,401.0,N14346,380.0,...,334.0,35.0,2.0,EWR,PHX,2133.0,7.0,39.0,0.0,0.0
3,1996-01-04,4.0,937.0,930.0,1329.0,1307.0,CO,1645.0,N14320,352.0,...,324.0,22.0,7.0,EWR,PHX,2133.0,5.0,23.0,0.0,0.0
3,1997-01-04,6.0,939.0,935.0,1257.0,1311.0,CO,1915.0,N16339,318.0,...,298.0,-14.0,4.0,EWR,PHX,2133.0,7.0,13.0,0.0,0.0
3,1998-01-04,7.0,754.0,755.0,1112.0,1125.0,CO,1915.0,N19638,318.0,...,297.0,-13.0,-1.0,EWR,PHX,2133.0,7.0,14.0,0.0,0.0
3,1999-01-04,1.0,1605.0,1515.0,1923.0,1849.0,CO,1923.0,N14337,318.0,...,292.0,34.0,50.0,EWR,PHX,2133.0,5.0,21.0,0.0,0.0


##### Agregaciones comunes

In [32]:
%%time
# Pandas

# Largest value found in the ActualElapsedTime column.
elapsed = dfp['ActualElapsedTime']
d = elapsed.max()
d

CPU times: total: 0 ns
Wall time: 24 ms


795.0

In [34]:
%%time
# Dask

# Largest value found in the ActualElapsedTime column; the reduction is
# only executed by .compute().
elapsed = dfd['ActualElapsedTime']
d = elapsed.max()
d.compute()


CPU times: total: 2.58 s
Wall time: 3.81 s


795.0

##### Is in

In [37]:
%%time
# Pandas

# Keep only the rows whose Dest is one of the listed codes.
wanted_dests = ['PIT', 'DEN']
e = dfp[dfp['Dest'].isin(wanted_dests)]
e


CPU times: total: 31.2 ms
Wall time: 87 ms


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
0,1990-01-01,1.0,1621.0,1540.0,1747.0,1701.0,US,33.0,,86.0,...,,46.0,41.0,EWR,PIT,319.0,,,0.0,0.0
1,1990-01-02,2.0,1547.0,1540.0,1700.0,1701.0,US,33.0,,73.0,...,,-1.0,7.0,EWR,PIT,319.0,,,0.0,0.0
2,1990-01-03,3.0,1546.0,1540.0,1710.0,1701.0,US,33.0,,84.0,...,,9.0,6.0,EWR,PIT,319.0,,,0.0,0.0
3,1990-01-04,4.0,1542.0,1540.0,1710.0,1701.0,US,33.0,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,0.0,0.0
4,1990-01-05,5.0,1549.0,1540.0,1706.0,1701.0,US,33.0,,77.0,...,,5.0,9.0,EWR,PIT,319.0,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269176,1999-12-27,1.0,1645.0,1645.0,1830.0,1901.0,UA,1753.0,N516UA,225.0,...,205.0,-31.0,0.0,LGA,DEN,1619.0,7.0,13.0,0.0,0.0
269177,1999-12-28,2.0,1726.0,1645.0,1928.0,1901.0,UA,1753.0,N504UA,242.0,...,214.0,27.0,41.0,LGA,DEN,1619.0,5.0,23.0,0.0,0.0
269178,1999-12-29,3.0,1646.0,1645.0,1846.0,1901.0,UA,1753.0,N592UA,240.0,...,220.0,-15.0,1.0,LGA,DEN,1619.0,5.0,15.0,0.0,0.0
269179,1999-12-30,4.0,1651.0,1645.0,1908.0,1901.0,UA,1753.0,N575UA,257.0,...,233.0,7.0,6.0,LGA,DEN,1619.0,5.0,19.0,0.0,0.0


In [38]:
%%time
# Dask

# Keep only the rows whose Dest is one of the listed codes; the filter is
# evaluated by .compute().
wanted_dests = ['PIT', 'DEN']
e = dfd[dfd['Dest'].isin(wanted_dests)]
e.compute()

CPU times: total: 3.55 s
Wall time: 4.75 s


Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
0,1990-01-01,1.0,1621.0,1540.0,1747.0,1701.0,US,33.0,,86.0,...,,46.0,41.0,EWR,PIT,319.0,,,0.0,0.0
1,1990-01-02,2.0,1547.0,1540.0,1700.0,1701.0,US,33.0,,73.0,...,,-1.0,7.0,EWR,PIT,319.0,,,0.0,0.0
2,1990-01-03,3.0,1546.0,1540.0,1710.0,1701.0,US,33.0,,84.0,...,,9.0,6.0,EWR,PIT,319.0,,,0.0,0.0
3,1990-01-04,4.0,1542.0,1540.0,1710.0,1701.0,US,33.0,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,0.0,0.0
4,1990-01-05,5.0,1549.0,1540.0,1706.0,1701.0,US,33.0,,77.0,...,,5.0,9.0,EWR,PIT,319.0,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269176,1999-12-27,1.0,1645.0,1645.0,1830.0,1901.0,UA,1753.0,N516UA,225.0,...,205.0,-31.0,0.0,LGA,DEN,1619.0,7.0,13.0,0.0,0.0
269177,1999-12-28,2.0,1726.0,1645.0,1928.0,1901.0,UA,1753.0,N504UA,242.0,...,214.0,27.0,41.0,LGA,DEN,1619.0,5.0,23.0,0.0,0.0
269178,1999-12-29,3.0,1646.0,1645.0,1846.0,1901.0,UA,1753.0,N592UA,240.0,...,220.0,-15.0,1.0,LGA,DEN,1619.0,5.0,15.0,0.0,0.0
269179,1999-12-30,4.0,1651.0,1645.0,1908.0,1901.0,UA,1753.0,N575UA,257.0,...,233.0,7.0,6.0,LGA,DEN,1619.0,5.0,19.0,0.0,0.0


##### Accesores de cadena/fecha/hora

In [55]:
%%time
# Pandas

# Convert the Date column with pd.to_datetime, store it back on dfp, then
# read the month of every row through the .dt accessor.
# NOTE(review): the conversion mutates dfp and runs inside the timed cell, so
# its cost is part of the measurement; Date was already requested as a parsed
# date when the CSVs were read, so this step may be redundant — confirm.
dfp['Date'] = pd.to_datetime(dfp['Date'])
f = dfp['Date'].dt.month
f

CPU times: total: 93.8 ms
Wall time: 196 ms


0          1
1          1
2          1
3          1
4          1
          ..
269176    12
269177    12
269178    12
269179    12
269180    12
Name: Date, Length: 2611892, dtype: int64

In [50]:
%%time
# Dask

# Convert the Date column with dd.to_datetime, store it back on dfd, then
# read the month of every row through the .dt accessor; evaluation happens in
# .compute().
# NOTE(review): the assignment rebinds the Date column on dfd, so later cells
# that use dfd also carry this conversion step — confirm this is intended.
dfd['Date'] = dd.to_datetime(dfd['Date'])
f = dfd['Date'].dt.month
f.compute()

CPU times: total: 3.16 s
Wall time: 4.65 s


0          1
1          1
2          1
3          1
4          1
          ..
269176    12
269177    12
269178    12
269179    12
269180    12
Name: Date, Length: 2611892, dtype: int64

#### Operaciones inteligentemente paralelizables (rápidas):

##### Groupby-aggregate

In [52]:
%%time
# Pandas

# Group the flights by destination and take the largest elapsed time
# within each group.
by_dest = dfp.groupby('Dest')
g = by_dest['ActualElapsedTime'].max()
g

CPU times: total: 0 ns
Wall time: 219 ms


Dest
ABE     85.0
ABQ    290.0
ACK    221.0
ALB    134.0
ANC    430.0
       ...  
SWF     84.0
SYR    381.0
TPA    536.0
TUS    342.0
TYS    245.0
Name: ActualElapsedTime, Length: 99, dtype: float64

In [54]:
%%time
# Dask

# Group the flights by destination and take the largest elapsed time
# within each group; evaluated by .compute().
by_dest = dfd.groupby('Dest')
g = by_dest['ActualElapsedTime'].max()
g.compute()


CPU times: total: 2.8 s
Wall time: 4.2 s


Dest
ABE     85.0
ALB    134.0
ATL    512.0
BDL    267.0
BGR    179.0
       ...  
AUS    462.0
CRP    269.0
HNL    795.0
BHM    191.0
TUS    342.0
Name: ActualElapsedTime, Length: 99, dtype: float64

##### Value_counts

In [58]:
%%time
# Pandas

# Number of rows recorded for each distinct destination.
destinations = dfp['Dest']
h = destinations.value_counts()
h


CPU times: total: 46.9 ms
Wall time: 172 ms


ORD    229725
BOS    152221
ATL    132372
MIA    112541
LAX    111703
        ...  
JFK         6
CRP         2
TUS         2
ABQ         1
STX         1
Name: Dest, Length: 99, dtype: int64

In [60]:
%%time
# Dask

# Number of rows recorded for each distinct destination; evaluated by
# .compute().
destinations = dfd['Dest']
h = destinations.value_counts()
h.compute()

CPU times: total: 2.86 s
Wall time: 4.52 s


ORD    229725
BOS    152221
ATL    132372
MIA    112541
LAX    111703
        ...  
JFK         6
CRP         2
TUS         2
ABQ         1
STX         1
Name: Dest, Length: 99, dtype: int64

##### Quitar duplicados

In [61]:
%%time
# Pandas

# Destination column with repeated values dropped.
destinations = dfp['Dest']
i = destinations.drop_duplicates()
i

CPU times: total: 31.2 ms
Wall time: 80.1 ms


0         PIT
31        ORF
58        SYR
88        CLE
177       BUF
         ... 
118535    HNL
186343    ANC
212518    CRP
37941     BHM
251134    TUS
Name: Dest, Length: 99, dtype: object

In [62]:
%%time
# Dask

# Destination column with repeated values dropped; evaluated by .compute().
destinations = dfd['Dest']
i = destinations.drop_duplicates()
i.compute()

CPU times: total: 2.83 s
Wall time: 4.25 s


0         PIT
31        ORF
58        SYR
88        CLE
177       BUF
         ... 
118535    HNL
186343    ANC
212518    CRP
37941     BHM
251134    TUS
Name: Dest, Length: 99, dtype: object

##### Hacer join por índice

In [72]:
%%time
# Pandas

# Left table: keys K0..K5, each paired with a value in column A.
df_ex = pd.DataFrame({
    'key': [f'K{n}' for n in range(6)],
    'A': [f'A{n}' for n in range(6)],
})

# Right table: only keys K0..K2, each paired with a value in column B.
other_ex = pd.DataFrame({
    'key': [f'K{n}' for n in range(3)],
    'B': [f'B{n}' for n in range(3)],
})

# Index both tables by 'key' and join the right one onto the left one; keys
# missing on the right end up with a missing B.
left_indexed = df_ex.set_index('key')
right_indexed = other_ex.set_index('key')
j = left_indexed.join(right_indexed)
j

CPU times: total: 0 ns
Wall time: 3.01 ms


Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


In [73]:
%%time
# Dask

# Build the same two small pandas tables used in the pandas cell.
df_ex = pd.DataFrame({
    'key': [f'K{n}' for n in range(6)],
    'A': [f'A{n}' for n in range(6)],
})
other_ex = pd.DataFrame({
    'key': [f'K{n}' for n in range(3)],
    'B': [f'B{n}' for n in range(3)],
})

# Wrap each pandas table as a Dask DataFrame with two partitions.
df_ex_d = dd.from_pandas(df_ex, npartitions=2)
other_ex_d = dd.from_pandas(other_ex, npartitions=2)

# Index both by 'key', join the right table onto the left one, and evaluate.
left_indexed = df_ex_d.set_index('key')
right_indexed = other_ex_d.set_index('key')
j = left_indexed.join(right_indexed)
j.compute()

CPU times: total: 0 ns
Wall time: 64.6 ms


Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


# Conclusiones

<ul>
<li type="square"><em><strong>Dask DataFrame se usa en situaciones en las que comúnmente se necesita pandas, generalmente cuando pandas falla debido al tamaño de los datos o a la velocidad de cálculo:</strong></em></li>

<br>
    <li type="circle">Manipulación de grandes conjuntos de datos, incluso cuando esos conjuntos de datos no caben en la memoria</li>

<br>
    <li type="circle">Aceleración de cálculos largos mediante el uso de muchos núcleos</li>

<br>
<li type="circle">Cómputo distribuido en grandes conjuntos de datos con operaciones estándar de pandas como agrupar, unir y cómputos de series temporales</li>

</ul>

<ul>
<li type="square"><em><strong>Dask DataFrame puede no ser la mejor opción en las siguientes situaciones:</strong></em></li>

<br>
<li type="circle">Si su conjunto de datos cabe cómodamente en la memoria RAM de su computadora portátil, entonces es mejor que use pandas. Puede haber formas más sencillas de mejorar el rendimiento que a través del paralelismo</li>

<br>
<li type="circle">Si su conjunto de datos no encaja perfectamente en el modelo tabular de pandas, entonces podría encontrar más uso en dask.bag o dask.array</li>

<br>
<li type="circle">Si necesita funciones que no están implementadas en Dask DataFrame, es posible que desee ver dask.delayed que ofrece más flexibilidad</li>

<br>
<li type="circle">Si necesita una base de datos adecuada con todo lo que ofrecen las bases de datos, es posible que prefiera algo como Postgres</li>
</ul>