In [2]:
# List the local compute devices that TensorFlow reports for this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 2801404980097387619, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14674281152
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1211993217599460608
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"]

In [45]:
import pandas as pd
import numpy as np
import torch

import gc
from tqdm import tqdm
import time
from IPython.display import display

In [46]:
# Load the Online Retail transactions directly from the pandas_exercises repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/07_Visualization/Online_Retail/Online_Retail.csv",
    encoding="cp932",
)
# Report (rows, columns), then preview the first records.
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


In [None]:
# Keep only the four columns used by the benchmarks below.
# .copy() turns the selection into an independent frame, so the column
# assignment that follows does not write into a slice of the raw frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[['InvoiceNo', 'CustomerID', 'UnitPrice', 'Quantity']].copy()

# Remove the 'C'/'A' letters that surround some invoice numbers and store the
# result as int32. Casting to str first keeps the .str accessor usable even if
# some values were parsed as integers (assumption: the column may hold mixed
# int/str objects - confirm against the raw CSV), and lets this cell be re-run
# after InvoiceNo has already been converted to int32.
df["InvoiceNo"] = df["InvoiceNo"].astype(str).str.strip('CA').astype("int32")

In [48]:
# Cast every remaining column to float64 so the frame has a single numeric dtype.
df = df.astype("float64")

In [49]:
# Check the per-column dtypes after the float64 cast.
df.dtypes

InvoiceNo     float64
CustomerID    float64
UnitPrice     float64
Quantity      float64
dtype: object

In [63]:
# Sanity check of the trimmed frame: (rows, columns) and the first records.
print(df.shape)
df.head()

(541909, 4)


Unnamed: 0,InvoiceNo,CustomerID,UnitPrice,Quantity
0,536365.0,17850.0,2.55,6.0
1,536365.0,17850.0,3.39,6.0
2,536365.0,17850.0,2.75,8.0
3,536365.0,17850.0,3.39,6.0
4,536365.0,17850.0,3.39,6.0


## Pandas

In [50]:
# pandas benchmark: for every (customer, invoice) pair, store the per-row
# sales amount (Quantity * UnitPrice) under the key "<customer>_<invoice>_sale".
sale_dict = {}

start = time.time()

# Customers with a missing id (NaN) are excluded from the outer loop.
customer_ids = df["CustomerID"].unique()[~np.isnan(df["CustomerID"].unique())]
for customer_id in tqdm(customer_ids):
    invoice_nos = df[df["CustomerID"]==customer_id]["InvoiceNo"].unique()
    for invoice_no in invoice_nos:
        # The row filter is evaluated once per operand, exactly as many times
        # as in the other benchmark sections, so the timings stay comparable.
        quantity = df[(df["CustomerID"]==customer_id) & (df["InvoiceNo"]==invoice_no)]["Quantity"]
        unit_price = df[(df["CustomerID"]==customer_id) & (df["InvoiceNo"]==invoice_no)]["UnitPrice"]
        sale_dict[f"{customer_id}_{invoice_no}_sale"] = quantity * unit_price

elapsed_time = time.time() - start
print (f"elapsed_time:{elapsed_time}[sec]")

100%|██████████| 4372/4372 [01:58<00:00, 36.98it/s] 

elapsed_time:118.24762082099915[sec]





## NumPy

In [51]:
# Take a copy of the frame's values as a NumPy array for the NumPy benchmark.
df_np = df.values.copy()

In [52]:
# Preview the array; columns follow the frame order: InvoiceNo, CustomerID, UnitPrice, Quantity.
df_np

array([[5.36365e+05, 1.78500e+04, 2.55000e+00, 6.00000e+00],
       [5.36365e+05, 1.78500e+04, 3.39000e+00, 6.00000e+00],
       [5.36365e+05, 1.78500e+04, 2.75000e+00, 8.00000e+00],
       ...,
       [5.81587e+05, 1.26800e+04, 4.15000e+00, 4.00000e+00],
       [5.81587e+05, 1.26800e+04, 4.15000e+00, 4.00000e+00],
       [5.81587e+05, 1.26800e+04, 4.95000e+00, 3.00000e+00]])

In [53]:
# Check the element dtype of the NumPy copy.
df_np.dtype

dtype('float64')

In [54]:
# NumPy benchmark: same nested loop as the pandas section, using positional
# columns (0 = InvoiceNo, 1 = CustomerID, 2 = UnitPrice, 3 = Quantity).
sale_dict = {}

start = time.time()

# Unique customer ids with the NaN entries filtered out first.
customer_ids = np.unique(df_np[:, 1][~np.isnan(df_np[:, 1])])
for customer_id in tqdm(customer_ids):
    invoice_nos = np.unique(df_np[df_np[:, 1]==customer_id][:, 0])
    for invoice_no in invoice_nos:
        # The boolean mask is rebuilt for each operand, mirroring the other
        # benchmark sections so the amount of work stays the same.
        quantity = df_np[(df_np[:, 1]==customer_id) & (df_np[:, 0]==invoice_no)][:, 3]
        unit_price = df_np[(df_np[:, 1]==customer_id) & (df_np[:, 0]==invoice_no)][:, 2]
        sale_dict[f"{customer_id}_{invoice_no}_sale"] = quantity * unit_price

elapsed_time = time.time() - start
print (f"elapsed_time:{elapsed_time}[sec]")

100%|██████████| 4372/4372 [02:28<00:00, 29.35it/s]

elapsed_time:148.98866844177246[sec]





## Tensor GPUなし

In [95]:
# Wrap the NumPy array as a tensor and keep a clone of it for the PyTorch benchmarks.
df_tensor = torch.from_numpy(df_np).clone()

In [56]:
# Preview the tensor built from df_np.
df_tensor

tensor([[5.3636e+05, 1.7850e+04, 2.5500e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 3.3900e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 2.7500e+00, 8.0000e+00],
        ...,
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.9500e+00, 3.0000e+00]], dtype=torch.float64)

In [57]:
# Check the tensor's element dtype.
df_tensor.dtype

torch.float64

In [58]:
# PyTorch benchmark on the tensor as created above (before any device move):
# same nested loop as the pandas and NumPy sections, with columns
# 0 = InvoiceNo, 1 = CustomerID, 2 = UnitPrice, 3 = Quantity.
sale_dict = {}

start = time.time()

# Unique customer ids with the NaN entries filtered out first.
customer_ids = df_tensor[:, 1][~torch.isnan(df_tensor[:, 1])].unique()
for customer_id in tqdm(customer_ids):
    invoice_nos = df_tensor[df_tensor[:, 1]==customer_id][:, 0].unique()
    for invoice_no in invoice_nos:
        # The boolean mask is rebuilt for each operand, mirroring the other
        # benchmark sections so the amount of work stays the same.
        quantity = df_tensor[(df_tensor[:, 1]==customer_id) & (df_tensor[:, 0]==invoice_no)][:, 3]
        unit_price = df_tensor[(df_tensor[:, 1]==customer_id) & (df_tensor[:, 0]==invoice_no)][:, 2]
        sale_dict[f"{customer_id}_{invoice_no}_sale"] = quantity * unit_price

elapsed_time = time.time() - start
print (f"elapsed_time:{elapsed_time}[sec]")

100%|██████████| 4372/4372 [02:42<00:00, 26.93it/s]

elapsed_time:162.37371969223022[sec]





## Tensor GPUあり

In [59]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Rebind df_tensor to the tensor placed on the selected device.
df_tensor = df_tensor.to(device)

In [60]:
# Preview the tensor after the device move; the repr reports which device holds it.
df_tensor

tensor([[5.3636e+05, 1.7850e+04, 2.5500e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 3.3900e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 2.7500e+00, 8.0000e+00],
        ...,
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.9500e+00, 3.0000e+00]], device='cuda:0',
       dtype=torch.float64)

In [61]:
# Check the element dtype after the device move.
df_tensor.dtype

torch.float64

In [62]:
# PyTorch benchmark on the device selected above: same nested loop as the
# other sections, with columns 0 = InvoiceNo, 1 = CustomerID, 2 = UnitPrice,
# 3 = Quantity.
sale_dict = {}

# CUDA kernels are launched asynchronously. Drain any work that is already
# queued before starting the clock so it is not billed to this loop.
if df_tensor.is_cuda:
    torch.cuda.synchronize()
start = time.time()

# Unique customer ids with the NaN entries filtered out first.
customer_ids = df_tensor[:, 1][~torch.isnan(df_tensor[:, 1])].unique()
for customer_id in tqdm(customer_ids):
    invoice_nos = df_tensor[df_tensor[:, 1]==customer_id][:, 0].unique()
    for invoice_no in invoice_nos:
        # The boolean mask is rebuilt for each operand, mirroring the other
        # benchmark sections so the amount of work stays the same.
        quantity = df_tensor[(df_tensor[:, 1]==customer_id) & (df_tensor[:, 0]==invoice_no)][:, 3]
        unit_price = df_tensor[(df_tensor[:, 1]==customer_id) & (df_tensor[:, 0]==invoice_no)][:, 2]
        sale_dict[f"{customer_id}_{invoice_no}_sale"] = quantity * unit_price

# Wait for every kernel launched by the loop to finish before reading the
# clock; without this the measured time can exclude still-running GPU work.
if df_tensor.is_cuda:
    torch.cuda.synchronize()
elapsed_time = time.time() - start
print (f"elapsed_time:{elapsed_time}[sec]")

100%|██████████| 4372/4372 [00:13<00:00, 329.63it/s]

elapsed_time:13.271113634109497[sec]





In [64]:
# Display the frame for comparison with the array and tensor shown next.
df

Unnamed: 0,InvoiceNo,CustomerID,UnitPrice,Quantity
0,536365.0,17850.0,2.55,6.0
1,536365.0,17850.0,3.39,6.0
2,536365.0,17850.0,2.75,8.0
3,536365.0,17850.0,3.39,6.0
4,536365.0,17850.0,3.39,6.0
...,...,...,...,...
541904,581587.0,12680.0,0.85,12.0
541905,581587.0,12680.0,2.10,6.0
541906,581587.0,12680.0,4.15,4.0
541907,581587.0,12680.0,4.15,4.0


In [65]:
# Display the NumPy copy of the same data.
df_np

array([[5.36365e+05, 1.78500e+04, 2.55000e+00, 6.00000e+00],
       [5.36365e+05, 1.78500e+04, 3.39000e+00, 6.00000e+00],
       [5.36365e+05, 1.78500e+04, 2.75000e+00, 8.00000e+00],
       ...,
       [5.81587e+05, 1.26800e+04, 4.15000e+00, 4.00000e+00],
       [5.81587e+05, 1.26800e+04, 4.15000e+00, 4.00000e+00],
       [5.81587e+05, 1.26800e+04, 4.95000e+00, 3.00000e+00]])

In [66]:
# Display the tensor holding the same data.
df_tensor

tensor([[5.3636e+05, 1.7850e+04, 2.5500e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 3.3900e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 2.7500e+00, 8.0000e+00],
        ...,
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.9500e+00, 3.0000e+00]], device='cuda:0',
       dtype=torch.float64)

In [68]:
# pandas membership test: boolean Series marking rows whose InvoiceNo is one of the listed values.
df["InvoiceNo"].isin([536365.0, 581587.0])

0         True
1         True
2         True
3         True
4         True
          ... 
541904    True
541905    True
541906    True
541907    True
541908    True
Name: InvoiceNo, Length: 541909, dtype: bool

In [70]:
# NumPy equivalent of the membership test, applied to column 0 (InvoiceNo).
np.isin(df_np[:, 0], [536365.0, 581587.0])

array([ True,  True,  True, ...,  True,  True,  True])

In [97]:
# Tensor membership test via broadcasting: column 0 (InvoiceNo) is reshaped to
# (rows, 1), compared against the candidate values, and reduced with any()
# over the candidate axis.
# The candidates are created with df_tensor's dtype and on df_tensor's device:
# a plain torch.tensor([...]) lives on the CPU, and comparing it with a CUDA
# tensor raises a device-mismatch RuntimeError.
(
    df_tensor[:, 0, None]
    == torch.tensor([536365.0, 581587.0], dtype=df_tensor.dtype, device=df_tensor.device)
).any(-1)

tensor([True, True, True,  ..., True, True, True])

In [91]:
# Display the tensor again before the element-wise comparison experiments below.
df_tensor

tensor([[5.3636e+05, 1.7850e+04, 2.5500e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 3.3900e+00, 6.0000e+00],
        [5.3636e+05, 1.7850e+04, 2.7500e+00, 8.0000e+00],
        ...,
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.1500e+00, 4.0000e+00],
        [5.8159e+05, 1.2680e+04, 4.9500e+00, 3.0000e+00]], device='cuda:0',
       dtype=torch.float64)

In [93]:
# Compare column 0 (InvoiceNo), kept as a (rows, 1) column, against a single scalar tensor.
df_tensor[:, 0, None]==torch.tensor(536365.)

tensor([[ True],
        [ True],
        [ True],
        ...,
        [False],
        [False],
        [False]], device='cuda:0')

In [86]:
# Indexing with None keeps column 0 as a 2-D (rows, 1) tensor, the shape used for the broadcasted comparisons.
df_tensor[:, 0, None] 

tensor([[536365.],
        [536365.],
        [536365.],
        ...,
        [581587.],
        [581587.],
        [581587.]], device='cuda:0', dtype=torch.float64)