# Import Modules
---

Import all modules used by the CPU vs. GPU comparison code.

To see how much of your GPU is being utilized, run the following in a terminal:

```cmd
watch -n 0.1 nvidia-smi
```

The Dask visualizer requires graphviz and ipycytoscape (jupyter-cytoscape) to be installed:

```cmd
conda install graphviz ipycytoscape -y
```

In [1]:
import time

import cupy as cp

import cudf
import dask
import dask_cudf
import dask.dataframe as dd
dask.config.set({"dataframe.backend": "cudf"})

from cuml.ensemble    import RandomForestClassifier as RF

In [2]:
class Timer:
    """Context manager that prints the wall-clock duration of its ``with`` body.

    Usage::

        with Timer(name="load") as t:
            ...
        t.execute_time  # elapsed seconds, also printed on exit

    Attributes:
        name: Label placed at the start of the printed report.
        start: ``time.perf_counter()`` reading taken on entry (``None`` before).
        end: ``time.perf_counter()`` reading taken on exit (``None`` before).
        execute_time: ``end - start`` in seconds (``None`` until the block exits).
    """

    def __init__(self, name="cpu"):
        """Store the report label; timing fields are filled in by the ``with`` block."""
        self.name = name
        self.start = None
        self.end = None
        self.execute_time = None

    def __enter__(self):
        """Start the clock and return this timer so ``as`` binds it."""
        self.start = time.perf_counter()
        # Without this return, ``with Timer(...) as t`` would bind ``t`` to None
        # and the measured duration could not be read back after the block.
        return self

    def __exit__(self, type, value, trackback):
        """Stop the clock, record the elapsed time and print the report.

        Nothing truthy is returned, so an exception raised inside the ``with``
        body still propagates after the timing line has been printed.
        """
        self.end = time.perf_counter()
        self.execute_time = self.end - self.start
        print(f"{self.name} execute time : {self.execute_time:.4f} seconds")

In [3]:
import glob
import os

# List every entry directly inside the relative "data" directory.
path = glob.glob(
    os.path.join("data", "*")
)
print(path)

['data/gender_submission.csv', 'data/train.csv', 'data/train.parquet', 'data/test.csv', 'data/test.parquet']


In [4]:
# Load both CSV files with cuDF, timing the two reads together.
with Timer(name="gpu read csv") as gpu_time:
    train_data_gpu, test_data_gpu = (
        cudf.read_csv(csv_path)
        for csv_path in ("./data/train.csv", "./data/test.csv")
    )

# Write Parquet copies next to the CSVs; the later dd.read_parquet cell
# reads these same two paths.
with Timer(name="gpu to parquet") as gpu_time:
    for frame, target in (
        (train_data_gpu, "./data/train.parquet"),
        (test_data_gpu, "./data/test.parquet"),
    ):
        frame.to_parquet(target)



gpu read csv execute time : 0.3269 seconds
gpu to parquet execute time : 0.0179 seconds


In [5]:
# Open the Parquet files as Dask DataFrames, timing both calls together.
with Timer(name="dask read parquet") as dask_time:
    train_dask, test_dask = (
        dd.read_parquet(parquet_path)
        for parquet_path in ("./data/train.parquet", "./data/test.parquet")
    )

dask read parquet execute time : 0.0228 seconds


In [6]:
# Display the Dask task graph behind train_dask (the recorded output is a
# Cytoscape widget).
train_dask.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [7]:
# Display the Dask task graph behind test_dask (the recorded output is a
# Cytoscape widget).
test_dask.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

# Compute vs Persist

In [8]:
# Survival ratios by sex, evaluated before train_dask has been persisted.
with Timer(name="gpu") as gpu_time:
    # "Survived" column for the rows where Sex == 'female'.
    is_female = train_dask['Sex'] == 'female'
    women = train_dask.loc[is_female]["Survived"]
    # len() has to return a concrete int, so the row count is evaluated here;
    # the resulting ratio is only evaluated by the .compute() call below.
    rate_women = women.sum() / len(women)
    print(f"% of women who survived: {rate_women.compute()}")

    # Same ratio for the rows where Sex == 'male'.
    is_male = train_dask['Sex'] == 'male'
    men = train_dask.loc[is_male]["Survived"]
    rate_men = men.sum() / len(men)
    print(f"% of men who survived: {rate_men.compute()}")

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924
gpu execute time : 0.2799 seconds


In [9]:
# Display the task graph for rate_men as it was built before the frames
# were persisted.
rate_men.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [10]:
# Persist both frames and rebind the names to the persisted collections,
# so the following cells work from them.
train_dask, test_dask = dask.persist(train_dask, test_dask)

In [11]:
# Same survival-ratio computation as the earlier cell, now run against the
# persisted train_dask so the two timings can be compared.
with Timer(name="gpu") as gpu_time:
    # "Survived" column for the rows where Sex == 'female'.
    is_female = train_dask['Sex'] == 'female'
    women = train_dask.loc[is_female]["Survived"]
    # len() has to return a concrete int, so the row count is evaluated here;
    # the resulting ratio is only evaluated by the .compute() call below.
    rate_women = women.sum() / len(women)
    print(f"% of women who survived: {rate_women.compute()}")

    # Same ratio for the rows where Sex == 'male'.
    is_male = train_dask['Sex'] == 'male'
    men = train_dask.loc[is_male]["Survived"]
    rate_men = men.sum() / len(men)
    print(f"% of men who survived: {rate_men.compute()}")

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924
gpu execute time : 0.0314 seconds


In [12]:
# Display the task graph for rate_men rebuilt on the persisted train_dask,
# for comparison with the graph shown before persisting.
rate_men.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [13]:
# Train a small random forest (100 trees, depth 5) and predict on the test
# set, timing the whole pipeline; the CSV write is timed separately below.
with Timer(name="gpu run") as gpu_time:
    # Target column, persisted on its own.
    y = train_dask["Survived"].persist()

    features = ["Pclass", "Sex", "SibSp", "Parch"]
    # Use the public dd.get_dummies entry point rather than reaching into the
    # dd.reshape submodule; the selected columns are one-hot encoded after
    # categorize() and cast to float32.
    X = dd.get_dummies(train_dask[features].categorize()).astype(cp.float32)
    X_test = dd.get_dummies(test_dask[features].categorize()).astype(cp.float32)

    model = RF(n_estimators=100, max_depth=5, random_state=1, n_streams=1)
    # RF is the single-GPU cuml.ensemble estimator, so hand it materialized
    # frames for fit() just as predict() already receives X_test.compute(),
    # instead of relying on implicit conversion of lazy Dask collections.
    model.fit(X.compute(), y.compute())
    predictions = model.predict(X_test.compute())

    # PassengerId comes from the cuDF frame read from test.csv earlier.
    output = cudf.DataFrame({'PassengerId': test_data_gpu.PassengerId, 'Survived': predictions})

with Timer(name="gpu save") as gpu_time:
    output.to_csv('submission_RF_dask.csv', index=False)
    print("Your submission was successfully saved!")

gpu run execute time : 0.3584 seconds
Your submission was successfully saved!
gpu save execute time : 0.0031 seconds


In [14]:
# Repeat the random-forest pipeline with a much larger configuration
# (10000 trees, max depth 100) - presumably to put a heavier load on the GPU
# for the timing comparison.
with Timer(name="gpu run") as gpu_time:
    # Target column, persisted on its own.
    y = train_dask["Survived"].persist()

    features = ["Pclass", "Sex", "SibSp", "Parch"]
    # Use the public dd.get_dummies entry point rather than reaching into the
    # dd.reshape submodule; the selected columns are one-hot encoded after
    # categorize() and cast to float32.
    X = dd.get_dummies(train_dask[features].categorize()).astype(cp.float32)
    X_test = dd.get_dummies(test_dask[features].categorize()).astype(cp.float32)

    model = RF(n_estimators=10000, max_depth=100, random_state=1, n_streams=1)
    # RF is the single-GPU cuml.ensemble estimator, so hand it materialized
    # frames for fit() just as predict() already receives X_test.compute(),
    # instead of relying on implicit conversion of lazy Dask collections.
    model.fit(X.compute(), y.compute())
    predictions = model.predict(X_test.compute())

    # PassengerId comes from the cuDF frame read from test.csv earlier.
    output = cudf.DataFrame({'PassengerId': test_data_gpu.PassengerId, 'Survived': predictions})

# NOTE: this writes to the same 'submission_RF_dask.csv' path as the
# n_estimators=100 run, so that earlier file is overwritten.
with Timer(name="gpu save") as gpu_time:
    output.to_csv('submission_RF_dask.csv', index=False)
    print("Your submission was successfully saved!")

gpu run execute time : 14.6617 seconds
Your submission was successfully saved!
gpu save execute time : 0.0013 seconds
