In [108]:
import numpy as np
import pandas as pd
import time
import binascii
from Crypto import Random

### Generate long arrays of random strings

In [23]:
example = []
for i in range(10000):
    example.append(binascii.hexlify(Random.get_random_bytes(20)).decode())
example = np.array(example*100)

In [24]:
len(example)

1000000

#### iteration vs numpy string operation

In [129]:
def iter_encode(example):
    return np.array([i.encode("ascii") for i in example])

def numpy_encode(example):
    return np.char.encode("ascii")

def pandas_encode(example):
    f = lambda x: x.encode("ascii")
    return pd.DataFrame(example, columns=["A"])["A"].apply(f)

def iter_strip(example):
    return np.array([i.strip("\n") for i in example])

def numpy_strip(example):
    return np.char.strip(example, "\n")


def iter_upper(example):
    return np.array([i.upper() for i in example])

def numpy_upper(example):
    return np.char.upper(example)

In [82]:
print("list iterations")
t0 = time.time()
iter_strip(example)
print("strip", time.time()-t0)
t0 = time.time()
iter_encode(example)
print("encode", time.time()-t0)
t0 = time.time()
iter_upper(example)
print("upper", time.time()-t0)

list iterations
strip 0.745664119720459
encode 0.5191471576690674
upper 0.6654129028320312


In [131]:
print("numpy operations")
t0 = time.time()
numpy_strip(example)
print("strip", time.time()-t0)
t0 = time.time()
numpy_encode(example)
print("encode", time.time()-t0)
t0 = time.time()
numpy_upper(example)
print("upper", time.time()-t0)

numpy operations
strip 0.6855630874633789
encode 6.29425048828125e-05
upper 0.47653985023498535


In [132]:
print("pandas operations")
t0 = time.time()
pandas_encode(example)
print("encode", time.time()-t0)

pandas operations
encode 0.5210232734680176


### Handling ascii Encoding

In [139]:
varied_strings = np.array(
                          #["我们今天心情好"]*1 +
                          ["we are in good mood"]*1000000)

In [140]:
def pandas_apply(example):
    df = pd.DataFrame(example, columns=["A"])
    f = lambda x: x.encode("ascii", errors="ignore")
    return df["A"].apply(f).drop_duplicates()

def numpy_char(example):
    return np.unique(np.char.encode(example, "ascii", errors="ignore"))

In [144]:
print("pandas apply")
t0 = time.time()
pandas_apply(example)
print(time.time()-t0)

pandas apply
0.8134620189666748


In [145]:
print("numpy char")
t0 = time.time()
numpy_char(example)
print(time.time()-t0)

numpy char
1.2652158737182617
