In [1]:
import numpy as np
import pandas as pd
import time
import binascii
from Crypto import Random

### Generate long arrays of random strings

In [2]:
example = []
for i in range(10000):
    example.append(binascii.hexlify(Random.get_random_bytes(20)).decode())
example = np.array(example*100)

In [3]:
len(example)

1000000

#### iteration vs numpy string operation

In [29]:
def iter_encode(example):
    return np.array([i.encode("ascii") for i in example])

def numpy_encode(example):
    return np.char.encode(example, encoding="ascii")

def pandas_encode(example):
    f = lambda x: x.encode("ascii")
    return pd.DataFrame(example, columns=["A"])["A"].apply(f).values

def iter_strip(example):
    return np.array([i.strip("\n") for i in example])

def numpy_strip(example):
    return np.char.strip(example, "\n")

def pandas_strip(example):
    f = lambda x: x.strip("\n")
    return pd.DataFrame(example, columns=["A"])["A"].apply(f).values

def iter_upper(example):
    return np.array([i.upper() for i in example])

def numpy_upper(example):
    return np.char.upper(example)

def pandas_upper(example):
    return pd.DataFrame(example, columns=["A"])["A"].apply(lambda x: x.upper()).values

In [27]:
print("list iterations")
t0 = time.time()
iter_strip(example)
print("strip", time.time()-t0)
t0 = time.time()
iter_encode(example)
print("encode", time.time()-t0)
t0 = time.time()
iter_upper(example)
print("upper", time.time()-t0)

list iterations
strip 0.7352917194366455
encode 0.5209448337554932
upper 0.6873610019683838


In [24]:
print("pandas apply")
t0 = time.time()
pandas_strip(example)
print("strip", time.time()-t0)
t0 = time.time()
pandas_encode(example)
print("encode", time.time()-t0)
t0 = time.time()
pandas_upper(example)
print("upper", time.time()-t0)

pandas apply
strip 0.3976278305053711
encode 0.4859640598297119
upper 0.42212510108947754


In [30]:
print("numpy operations")
t0 = time.time()
numpy_strip(example)
print("strip", time.time()-t0)
t0 = time.time()
numpy_encode(example)
print("encode", time.time()-t0)
t0 = time.time()
numpy_upper(example)
print("upper", time.time()-t0)

numpy operations
strip 0.7155959606170654
encode 0.6981050968170166
upper 0.4615192413330078


### Handling ascii encoding error

In [44]:
varied_strings = np.array(
                          ["我们今天心情好"]*1000000 +
                          ["we are in good mood"]*1000000
                         )

In [41]:
def pandas_apply(example):
    df = pd.DataFrame(example, columns=["A"])
    f = lambda x: x.encode("ascii", errors="ignore")
    return df["A"].apply(f)

def numpy_char(example):
    return np.char.encode(example, encoding="ascii", errors="ignore")

In [42]:
print("pandas apply")
t0 = time.time()
pandas_apply(example)
print(time.time()-t0)

pandas apply
0.7092790603637695


In [43]:
print("numpy char")
t0 = time.time()
numpy_char(example)
print(time.time()-t0)

numpy char
0.9797930717468262
