# Dask VS PAndas Dataframes

In [156]:
%pip install dask[dataframe]

Note: you may need to restart the kernel to use updated packages.


In [157]:
import pandas as pd
import dask.dataframe as dd

### 1. Load the Titanic dataset

- Pandas: Loads the entire dataset into memory. For smaller datasets like the Titanic dataset, this works efficiently. However, for larger datasets that exceed available memory, Pandas may cause memory issues or crashes.
- Dask: Loads the dataset in chunks or partitions. It doesn't load the entire dataset into memory at once, allowing it to handle datasets that are larger than your RAM.

In [158]:
# using Pandas
titanic_pandas = pd.read_csv('titanic.csv')

In [159]:
# using Dask
titanic_dask = dd.read_csv('titanic.csv')

### 2. Viewing and Info

- Pandas: Performs all viewing and information-gathering operations directly because the data is loaded fully into memory. This leads to fast, immediate access to properties like `shape`, `info()`, and `head()` for smaller datasets.
- Dask: Works with chunks and lazy evaluation. It doesn't load the entire dataset at once, so operations like `info()`, `head()`, and `shape` are slower or incomplete until you explicitly compute the results. This introduces some overhead, particularly for small datasets, but scales well for large ones.

In [160]:
titanic_pandas.head()
titanic_dask.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [161]:
titanic_pandas.shape

(891, 12)

In [162]:
titanic_dask.shape
# (delayed object, cols)

(<dask_expr.expr.Scalar: expr=ReadCSV(9cb1d1b).size() // 12, dtype=int64>, 12)

### Dask is a lazy operation, so you need to call `.compute()` to see the changes.

In [163]:
# The Actual Shape in Dask
rows = titanic_dask.shape[0].compute()  # Compute the number of rows
columns = titanic_dask.shape[1]  # Number of columns (already known)
print(f"({rows}, {columns})")

(891, 12)


In [164]:
titanic_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [165]:
titanic_dask.info()

<class 'dask_expr.DataFrame'>
Columns: 12 entries, PassengerId to Embarked
dtypes: float64(2), int64(5), string(5)

In [166]:
# full dataset details 
titanic_dask.compute().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 3. Filtering Data

In [167]:
survived_pandas = titanic_pandas[titanic_pandas['Survived'] == 1]

In [168]:
survived_dask = titanic_dask[titanic_dask['Survived'] == 1]
survived_dask.compute()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


### 4. Handling Missing Data

In [169]:
# check null values
print(titanic_pandas.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [170]:
titanic_pandas['Age'] = titanic_pandas['Age'].fillna(titanic_pandas['Age'].mean())

In [171]:
titanic_pandas.drop(columns=['Cabin'], inplace=True)

In [172]:
# Check null values 
print(titanic_dask.isnull().sum().compute())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [173]:
titanic_dask['Age'] = titanic_dask['Age'].fillna(titanic_dask['Age'].mean())

In [174]:
titanic_dask = titanic_dask.drop(columns=['Cabin'])

In [175]:
# Compute (if Needed)
titanic_dask = titanic_dask.compute()

In [176]:
print(titanic_dask.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64


### 5. Statistical Analysis

### The describe() method returns description of the data in the DataFrame.

**If the DataFrame contains numerical data, the description contains these information for each column:**

method|return
------|------
count|The number of not-empty values.
mean|The average (mean) value
std|The standard deviation
min|the minimum value
25%|The 25% percentile
50%|The 50% percentile
75%|The 75% percentile
max|the maximum value

**Percentile meaning: how many of the values are less than the given percentile.**

In [177]:
titanic_pandas.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [178]:
titanic_dask.describe().compute()

AttributeError: 'DataFrame' object has no attribute 'compute'