In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import torch

In [2]:
# Report the version of each library used by the benchmarks below.
for lib in (pd, np, torch):
    print(lib.__version__)

1.1.5
1.19.2
1.10.0


# 0. Pure Python

In [3]:
import random

# Pure-Python baseline: build one large list of random integers and derive
# five more lists from it element by element. Only the elapsed time matters.
time_start = datetime.now()

values = [random.randrange(100, 999) for _ in range(100000000)]
squared = [x**2 for x in values]
sqrt = [x**0.5 for x in values]
mul = [x * y for x, y in zip(squared, sqrt)]
div = [x / y for x, y in zip(squared, sqrt)]
int_div = [x // y for x, y in zip(squared, sqrt)]

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

# The six 100M-element lists are not used again; release them so they do not
# stay resident in the kernel while the later sections run.
del values, squared, sqrt, mul, div, int_div

TOTAL RUNTIME = 162 seconds


# 1. Pandas
- Required data: wiki text: https://www.kaggle.com/kenshoresearch/kensho-derived-wikimedia-data
- Download the `item.csv` file and put it in the `data` folder

## 1.1 Data Loading

In [4]:
# Time a single read of the CSV into a DataFrame.
time_start = datetime.now()

items = pd.read_csv('data/item.csv')

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 136 seconds


In [5]:
# Display the (rows, columns) tuple of the frame loaded above.
items.shape

(51450316, 3)

## 1.2 Data Manipulation

In [6]:
# Time column creation on a 100M-row frame: two element-wise .apply calls
# followed by three column-by-column arithmetic operations.
time_start = datetime.now()

df = pd.DataFrame()
df['X'] = np.random.randint(low=100, high=999, size=100000000)
# NOTE(review): the per-element .apply calls look like a deliberate part of
# the measured workload; a vectorized `df['X'] ** 2` would change what is
# being benchmarked -- confirm before "optimizing".
df['X_squared'] = df['X'].apply(lambda x: x**2)
df['X_sqrt'] = df['X'].apply(lambda x: x**0.5)
df['Mul'] = df['X_squared'] * df['X_sqrt']
df['Div'] = df['X_squared'] / df['X_sqrt']
df['Int_div'] = df['X_squared'] // df['X_sqrt']

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 100 seconds


## 1.3 Groupby

In [7]:
# Frame for the groupby and query sections: 100M rows, X drawn from [0, 999)
# and the grouping key Y drawn from [1, 10). X is generated before Y.
df = pd.DataFrame({
    'X': np.random.randint(low=0, high=999, size=100000000),
    'Y': np.random.randint(low=1, high=10, size=100000000),
})

In [8]:
# Time four grouped aggregations over Y. The results are discarded; only the
# elapsed time is reported.
time_start = datetime.now()

# String names select the same pandas aggregations that np.mean / np.std /
# builtin max are mapped to, match the existing 'count' call, and avoid the
# FutureWarning newer pandas emits for callables.
df.groupby('Y').agg('mean')
df.groupby('Y').agg('std')
df.groupby('Y').agg('max')
df.groupby('Y').agg('count')

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 10 seconds


## 1.4 Simple Query

In [9]:
from tqdm import tqdm

In [10]:
# Time 200 equality queries against column X, each with a fresh random value.
time_start = datetime.now()

for _ in tqdm(range(200)):
    num = np.random.randint(low=0, high=999)
    df.query('X == ' + str(num))

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

100%|██████████| 200/200 [02:57<00:00,  1.12it/s]

TOTAL RUNTIME = 177 seconds





# 2. Numpy
- Reference: https://gist.github.com/markus-beuckelmann/8bc25531b11158431a5b09a45abd6276

In [11]:
from time import time
# Let's take the randomness out of random numbers (for reproducibility)
np.random.seed(0)

# Operands for the benchmarks below. They are built before the total timer
# starts, so their allocation is not part of TOTAL RUNTIME.
size = 4096
A, B = np.random.random((size, size)), np.random.random((size, size))
C, D = np.random.random((size * 128,)), np.random.random((size * 128,))
E = np.random.random((size // 2, size // 4))
F = np.random.random((size // 2, size // 2))
# F @ F.T is symmetric (and positive-definite for a full-rank F), which is
# what np.linalg.cholesky requires.
F = np.dot(F, F.T)
G = np.random.random((size // 2, size // 2))

time_start = datetime.now()

# Matrix multiplication
N = 20
t = time()
for i in range(N):
    np.dot(A, B)
delta = time() - t
print('Dotted two %dx%d matrices in %0.2f s.' % (size, size, delta / N))
del A, B

# Vector multiplication
N = 5000
t = time()
for i in range(N):
    np.dot(C, D)
delta = time() - t
print('Dotted two vectors of length %d in %0.2f ms.' % (size * 128, 1e3 * delta / N))
del C, D

# Singular Value Decomposition (SVD)
N = 3
t = time()
for i in range(N):
    np.linalg.svd(E, full_matrices = False)
delta = time() - t
print("SVD of a %dx%d matrix in %0.2f s." % (size / 2, size / 4, delta / N))
del E

# Cholesky Decomposition
N = 3
t = time()
for i in range(N):
    np.linalg.cholesky(F)
delta = time() - t
print("Cholesky decomposition of a %dx%d matrix in %0.2f s." % (size / 2, size / 2, delta / N))
# Free the operand once its benchmark is done, as is done for A-E above.
del F

# Eigendecomposition
N = 3  # stated explicitly instead of silently reusing the Cholesky count
t = time()
for i in range(N):
    np.linalg.eig(G)
delta = time() - t
print("Eigendecomposition of a %dx%d matrix in %0.2f s." % (size / 2, size / 2, delta / N))
del G

print('')

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

Dotted two 4096x4096 matrices in 2.70 s.
Dotted two vectors of length 524288 in 0.33 ms.
SVD of a 2048x1024 matrix in 0.84 s.
Cholesky decomposition of a 2048x2048 matrix in 0.14 s.
Eigendecomposition of a 2048x2048 matrix in 6.13 s.

TOTAL RUNTIME = 77 seconds


# 3. Scikit

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

time_start = datetime.now()

# Dataset
iris = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
time_load = datetime.now()
# Both timings below are measured from time_start, i.e. they are cumulative.
# total_seconds() keeps the sub-second part; timedelta.seconds truncates it,
# which made these short steps print "0 seconds".
print(f'Dataset loaded, runtime = {(time_load - time_start).total_seconds():.2f} seconds')

# Train/Test split
X = iris.drop('species', axis=1)
y = iris['species']
# Fixed random_state so the later tree benchmarks see the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
time_split = datetime.now()
print(f'Train/test split, runtime = {(time_split - time_start).total_seconds():.2f} seconds')

Dataset loaded, runtime = 0 seconds
Train/test split, runtime = 0 seconds


## 3.1 Decision Tree & Grid Search

In [13]:
from tqdm import tqdm

In [14]:
# Time 10,000 fits of a default decision tree on the training split.
time_start = datetime.now()

for _ in tqdm(range(10000)):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

100%|██████████| 10000/10000 [00:14<00:00, 687.59it/s]

TOTAL RUNTIME = 14 seconds





In [15]:
time_start = datetime.now()

# Hyperparameter tuning
model = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 5, 10, 50, 100, 250, 500, 1000],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    # 'auto' (an alias of 'sqrt' for classifiers) was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes the search fail.
    # None (use all features) keeps the grid at three options and is valid
    # on old and new versions alike.
    'max_features': [None, 'sqrt', 'log2']
}
clf = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5
)
clf.fit(X_train, y_train)
time_optim = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'Hyperparameter optimization, runtime = {(time_optim - time_start).total_seconds():.2f} seconds')

# Refit a fresh tree with the winning parameters (part of the timed workload).
best_model = DecisionTreeClassifier(**clf.best_params_)
best_model.fit(X_train, y_train)

time_end = datetime.now()
print()
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

Hyperparameter optimization, runtime = 92 seconds

TOTAL RUNTIME = 92 seconds


## 3.2 SVM & Grid Search

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Load the credit-card data and separate the label column from the features.
target_col = 'default.payment.next.month'

data = pd.read_csv('data/credit card.csv')
y = data[target_col].values
X = data.drop(columns=[target_col])
print(data.shape)

(30000, 25)


In [17]:
# Time one RBF-kernel SVC fit on the full data set.
time_start = datetime.now()

svc = SVC(kernel = 'rbf')
svc.fit(X, y)

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 35 seconds


In [18]:
# Time a 5-fold grid search over C for an RBF-kernel SVC, followed by a refit
# of a fresh SVC with the best parameters.
time_start = datetime.now()

model = SVC(kernel = 'rbf')
params = {
    'C': [0.01, 1, 100]
}
clf = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5
)
clf.fit(X, y)

best_model = SVC(**clf.best_params_)
best_model.fit(X, y)

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 1089 seconds


## 3.3 Multi-thread SkLearn

In [23]:
from joblib import parallel_backend

# Same SVC grid search as in section 3.2, run inside joblib's threading
# backend with n_jobs = 8.
# NOTE(review): X and y are whatever the kernel currently holds. They are
# assigned from 'data/credit card.csv' in section 3.2, but the Torch section
# rebinds both names and this cell's execution count is higher than the Torch
# cells' -- confirm the run order before comparing timings.
time_start = datetime.now()
with parallel_backend('threading', n_jobs = 8):
    model = SVC(kernel = 'rbf')
    params = {
        'C': [0.01, 1, 100]
    }
    clf = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5
    )
    clf.fit(X, y)

    best_model = SVC(**clf.best_params_)
    best_model.fit(X, y)

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 73 seconds


In [24]:
from joblib import parallel_backend

# Same SVC grid search as in section 3.2, run inside joblib's threading
# backend with n_jobs = 4.
# NOTE(review): X and y are whatever the kernel currently holds. They are
# assigned from 'data/credit card.csv' in section 3.2, but the Torch section
# rebinds both names and this cell's execution count is higher than the Torch
# cells' -- confirm the run order before comparing timings.
time_start = datetime.now()
with parallel_backend('threading', n_jobs = 4):
    model = SVC(kernel = 'rbf')
    params = {
        'C': [0.01, 1, 100]
    }
    clf = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5
    )
    clf.fit(X, y)

    best_model = SVC(**clf.best_params_)
    best_model.fit(X, y)

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

TOTAL RUNTIME = 64 seconds


# 4. Torch

In [19]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm import tqdm

class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        output_size: number of output units.
    """

    def __init__(self, input_size = 64, hidden_size = 128, output_size = 10):
        super().__init__()
        # Keep the layer widths available on the instance.
        self.input_size, self.hidden_size, self.output_size = (
            input_size, hidden_size, output_size
        )

        # Sub-modules are created in the order fc1, relu, fc2.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [20]:
# Model plus a synthetic data set: 10,000 samples of 64 integer-valued
# features in [0, 1000) stored as float32, with labels in [0, 10).
# The model is created first, then X, then y.
mlp = MLP()
X = torch.randint(0, 1000, (10000, 64)).to(torch.float32)
y = torch.randint(0, 10, (10000,))

# Optimisation settings.
lr, n_epoch = 0.001, 20

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=lr)

In [21]:
# Time n_epoch passes over the data, one sample per optimizer step.
time_start = datetime.now()

mlp.train()
for epoch in tqdm(range(n_epoch)):
    for i in range(10000):
        # NOTE(review): the optimizer was built from mlp.parameters(), so
        # these two calls clear the same gradients; both are kept so the
        # measured workload is unchanged.
        mlp.zero_grad()
        optimizer.zero_grad()
        # List indexing keeps the leading batch dimension (batch of one).
        inp, target = X[[i]], y[[i]]
        output = mlp(inp)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

time_end = datetime.now()
# total_seconds() is the full elapsed time; timedelta.seconds would drop the
# days component and the sub-second part.
print(f'TOTAL RUNTIME = {(time_end - time_start).total_seconds():.2f} seconds')

100%|██████████| 20/20 [01:36<00:00,  4.81s/it]

TOTAL RUNTIME = 96 seconds



