In [7]:
from sklearn.datasets import fetch_california_housing
from dask.distributed import Client
import dask.dataframe as dd

# Start a Dask distributed client for the computations in this notebook.
# NOTE(review): no scheduler address is passed, so this presumably starts a
# local cluster on this machine — confirm against the Dask docs. The client
# is not closed anywhere in the visible cells.
client = Client()

In [8]:
# Fetch the California housing dataset with its data exposed as a frame,
# then wrap that frame in a Dask DataFrame split into 4 partitions.
ds = fetch_california_housing(as_frame=True)
ddf = dd.from_pandas(ds.frame, npartitions=4)

# Report the dataset dimensions: the row count has to be computed,
# while the column count is read straight from the column index.
n_rows = ddf.shape[0].compute()
n_cols = len(ddf.columns)
print(f"Total number of samples is '{n_rows}' and there are '{n_cols}' columns.", end="\n\n")

# Preview the first rows of the dataset.
print(ddf.head(), end="\n\n")

# List the dtype of every column.
print(ddf.dtypes)

# (Optional) See dataset description.
# print(f"Dataset contextual information:\n {ds.DESCR}")

Total number of samples is '20640' and there are '9' columns.

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

MedInc         float64
HouseAge       float64
AveRooms       float64
AveBedrms      float64
Population     float64
AveOccup       float64
Latitude       float64
Longitude      float64
MedHouseVal    float64
dtype: object


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
summary_stats = ddf.describe().compute()
print(summary_stats)

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.750000     22.000000      4.655240      1.009249    833.000000   
50%        3.726950     32.000000      5.444261      1.055370   1261.000000   
75%        4.995950     41.000000      6.281276      1.108583   1883.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1.153956  
min        0.692308     32.54000

In [None]:
# Keep only the rows whose MedHouseVal is greater than 3 and preview them.
high_value_mask = ddf['MedHouseVal'] > 3
print(ddf[high_value_mask].head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [10]:
# Mean MedHouseVal for each distinct AveRooms value.
# NOTE: AveRooms is a continuous float column, so almost every row ends up in
# its own group (19392 groups for 20640 rows); this only illustrates the API.
mean_value_by_rooms = ddf.groupby("AveRooms")["MedHouseVal"].mean()
print(mean_value_by_rooms.compute())

AveRooms
1.000000     1.375
1.260870     1.125
1.378486     2.250
1.411290     1.375
1.465753     2.750
             ...  
22.823529    1.063
23.994152    0.991
24.669118    0.889
28.600000    4.200
37.063492    1.325
Name: MedHouseVal, Length: 19392, dtype: float64


In [None]:
# Apply a Python function element by element to a single column.
# `meta` declares the name and dtype of the resulting series up front.
# NOTE(review): the lambda is not vectorized, but Dask should still process
# the partitions in parallel — confirm against the Dask Series.apply docs.
halved_population = ddf["Population"].apply(lambda x: x/2, meta=('Population', 'float64'))
print(halved_population.head())

0     161.0
1    1200.5
2     248.0
3     279.0
4     282.5
Name: Population, dtype: float64


In [24]:
# Self-defined function.
def increment_prices(df):
    """Return a copy of ``df`` with the ``MedHouseVal`` column raised by 10%.

    The input frame is left untouched: functions handed to ``map_partitions``
    receive the partition objects themselves, so modifying them in place can
    leak changes into shared data and makes re-running the cell non-idempotent.

    Parameters
    ----------
    df : pandas.DataFrame
        A frame (or partition) containing a ``MedHouseVal`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where
        ``MedHouseVal`` has been multiplied by 1.1.
    """
    # assign() builds a new frame; an existing column keeps its position.
    return df.assign(MedHouseVal=df['MedHouseVal'] * 1.1)  # +10%

# Run increment_prices once on every partition and preview the result.
ddf_incremented = ddf.map_partitions(increment_prices)
print(ddf_incremented.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23       4.9786  
1    -122.22       3.9435  
2    -122.24       3.8731  
3    -122.25       3.7543  
4    -122.25       3.7642  
