# Numpy challenge

In [97]:
import numpy as np
import numpy_indexed as npi
from collections import defaultdict
import pandas as pd
from datetime import datetime

Challenge: for each element, return which occurrence of its value it is so far in the array (1-based), i.e. the running count of that value up to and including the element.

In [98]:
# Worked example: outp[k] is the 1-based number of times the value inp[k]
# has occurred in inp[:k + 1] (e.g. the fourth 7 sits at index 4 -> 4).
inp = np.array([7, 7, 7, 8, 7, 8, 8, 9, 9])
outp = np.array([1, 2, 3, 1, 4, 2, 3, 1, 2])

In [99]:
# for loop solution
def for_loop_list(inp):
    """Return, for every element, its 1-based running occurrence count.

    Walks ``inp`` once while keeping a per-value tally, and records the tally
    right after each element has been counted.

    Parameters
    ----------
    inp : iterable of hashable values

    Returns
    -------
    numpy.ndarray
        One count per input element, in input order.
    """
    seen = {}
    running = []
    for value in inp:
        occurrence = seen.get(value, 0) + 1
        seen[value] = occurrence
        running.append(occurrence)
    return np.array(running)

for_loop_list(inp)

array([1, 2, 3, 1, 4, 2, 3, 1, 2])

In [100]:
# crazy pandas solution
def pandas_rank(inp):
    """Running occurrence count per value, computed with a pandas group rank.

    The values serve both as the grouping key and as the ranked column. Inside
    a group every ranked value ties, and ``method="first"`` breaks ties by
    order of appearance, which yields the 1-based occurrence number.

    Returns
    -------
    numpy.ndarray
        Float array (pandas ranks are floats), one entry per input element.
    """
    frame = pd.DataFrame({'u': inp, 'i': inp})
    occurrence_rank = frame.groupby("u")['i'].rank(method="first")
    return occurrence_rank.values
    
pandas_rank(inp)

array([1., 2., 3., 1., 4., 2., 3., 1., 2.])

In [101]:
# your submission
def custom_oui(inp):
    """Return the 1-based running occurrence count of each element of ``inp``.

    Delegates to ``cumcount`` and shifts its result up by one.
    """
    zero_based = cumcount(inp)
    return zero_based + 1

In [102]:
from numba import jit
from numba import types
from numba.typed import Dict

@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
def for_loop_list_jit(inp):
    """Numba-compiled running occurrence count (1-based).

    Iterates over ``inp`` once, keeping a typed dictionary from value to the
    number of times it has been seen, and appends the updated tally for every
    element.

    Parameters
    ----------
    inp : 1-D integer array

    Returns
    -------
    list of int
        One count per input element, in input order.
    """
    # 64-bit keys/values: int32 keys force a lossy cast for int64 input, so two
    # distinct values that differ only above bit 31 would share one counter.
    # int32 input is still accepted because widening to int64 is a safe cast.
    counts_so_far = Dict.empty(
        key_type=types.int64,
        value_type=types.int64,
    )
    outp = []
    for i in inp:
        if i in counts_so_far:
            counts_so_far[i] += 1
        else:
            counts_so_far[i] = 1
        outp.append(counts_so_far[i])

    return outp

for_loop_list_jit(inp)

[1, 2, 3, 1, 4, 2, 3, 1, 2]

In [117]:
@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
def cumcount_jit(a):
    """Numba-compiled, sort-based running occurrence count (1-based).

    Stable-sorts the values so equal values become adjacent while keeping
    their input order, computes each element's offset from the start of its
    run in sorted order, and maps those offsets back to input order.

    Parameters
    ----------
    a : 1-D array

    Returns
    -------
    numpy.ndarray
        Integer array with one 1-based count per input element.
    """
    n = a.size
    if n == 0:
        # Nothing to count; the run-start lookup below would otherwise index
        # position 0 of an empty array.
        return np.empty(0, dtype=np.intp)

    s = np.argsort(a, kind='mergesort')

    # argunsort: i[s[k]] = k, i.e. the inverse permutation of the sort order.
    # int64 (not int32) so it matches the pure-NumPy helper and cannot overflow.
    i = np.empty(n, dtype=np.int64)
    i[s] = np.arange(n)

    b = a[s]

    # dfill: for every sorted position, the index at which its run of equal
    # values starts.
    bn = b.size
    where = np.where(b[:-1] != b[1:])[0] + 1
    c = np.array([0] + list(where) + [bn])
    c = np.arange(bn)[c[:-1]].repeat(np.diff(c))

    # sorted position minus run start = 0-based rank inside the run; reorder
    # back to the input order via the inverse permutation.
    outp = (np.arange(n) - c)[i]

    return outp + 1

a = cumcount_jit(inp)
a

array([1, 2, 3, 1, 4, 2, 3, 1, 2], dtype=int64)

# Sandbox

In [104]:
def dfill(a):
    """For an array whose equal values are adjacent, return each element's run start.

    Example: ``[7, 7, 8, 8, 8, 9]`` -> ``[0, 0, 2, 2, 2, 5]``.

    Parameters
    ----------
    a : 1-D numpy.ndarray
        Values grouped into runs (for instance a sorted array).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``a.size``; entry ``k`` is the index where the
        run containing ``a[k]`` begins. Empty input yields an empty array.
    """
    n = a.size
    # Run boundaries: 0, every position where the value changes, and n.
    b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
    # Repeat every run start by its run length. Using the starts directly
    # (instead of looking them up in np.arange(n)) also handles n == 0, where
    # the only "run" has length zero and nothing is emitted.
    return b[:-1].repeat(np.diff(b))

def argunsort(s):
    """Invert a permutation of indices.

    Given ``s`` (e.g. the result of ``argsort``), returns ``u`` such that
    ``u[s[k]] == k``; consequently ``x[s][u]`` restores ``x``.
    """
    positions = np.arange(s.size)
    inverse = np.empty(s.size, dtype=np.int64)
    inverse[s] = positions
    return inverse

def cumcount(a):
    """Zero-based running occurrence count of each element of ``a``.

    Stable-sorts the values so equal values are adjacent in input order,
    subtracts each run's start index from the sorted positions, and maps the
    result back to the input order.
    """
    order = a.argsort(kind='mergesort')
    grouped = a[order]
    # Sorted position minus the start of that value's run.
    rank_in_run = np.arange(a.size) - dfill(grouped)
    return rank_in_run[argunsort(order)]

def foo(l):
    """Loop-based zero-based running occurrence count.

    Parameters
    ----------
    l : sequence of hashable values

    Returns
    -------
    numpy.ndarray
        int64 array; entry ``k`` is how many times ``l[k]`` appeared before
        position ``k``.
    """
    result = np.empty(len(l), dtype=np.int64)
    seen = defaultdict(int)
    for position, item in enumerate(l):
        # Record the tally *before* counting this occurrence -> zero-based.
        result[position] = seen[item]
        seen[item] += 1
    return result

def div(l):
    """Zero-based running occurrence count built from group sizes.

    Builds the within-group ranks for the *sorted* layout
    (``0..c0-1, 0..c1-1, ...``) with a single cumulative sum, then scatters
    them back to the input order.

    Parameters
    ----------
    l : 1-D array-like

    Returns
    -------
    numpy.ndarray
        Integer array; entry ``k`` is how many times ``l[k]`` appeared before
        position ``k``. Empty input yields an empty array.
    """
    values = np.asarray(l)
    n = values.size
    if n == 0:
        # np.unique/cumsum would be empty and idx[-1] below would fail.
        return np.zeros(0, dtype=int)

    a = np.unique(values, return_counts=True)[1]
    idx = a.cumsum()
    # Steps of +1 everywhere, except at each group start where the step resets
    # the running sum to zero (1 - size of the previous group).
    id_arr = np.ones(n, dtype=int)
    id_arr[0] = 0
    id_arr[idx[:-1]] = 1 - a[:-1]
    rng = id_arr.cumsum()

    # The sort must be stable: otherwise equal values may be permuted and the
    # counts would no longer follow order of appearance.
    order = np.argsort(values, kind='mergesort')
    out = np.empty(n, dtype=rng.dtype)
    out[order] = rng
    return out

cumcount(inp)

array([0, 1, 2, 0, 3, 1, 2, 0, 1])

# testing

In [105]:
# Every implementation must reproduce the worked example exactly.
for implementation in (
    for_loop_list,
    pandas_rank,
    custom_oui,
    for_loop_list_jit,
    cumcount_jit,
):
    assert (implementation(inp) == outp).all()

# Timing

In [106]:
# Benchmark input: 5000 integers drawn from [1, 100). The generator is seeded
# so that every run, and every implementation, is timed on the same data.
time_inp = np.random.default_rng(0).integers(1, 100, size=5000)
time_inp

array([10, 77, 97, ..., 37, 27, 67])

In [112]:
# Wall-clock time for 5000 calls of the plain Python loop.
n_runs = 5000
started = datetime.now()
for run in range(n_runs):
    for_loop_list(time_inp)
elapsed = datetime.now() - started

print("For loop scored:", elapsed)

For loop scored: 0:00:07.918176


In [113]:
# Wall-clock time for 5000 calls of the pandas group-rank version.
n_runs = 5000
started = datetime.now()
for run in range(n_runs):
    pandas_rank(time_inp)
elapsed = datetime.now() - started

print("Pandas scored:", elapsed)

Pandas scored: 0:00:03.630545


In [114]:
# Wall-clock time for 5000 calls of the sort-based NumPy version.
n_runs = 5000
started = datetime.now()
for run in range(n_runs):
    custom_oui(time_inp)
elapsed = datetime.now() - started

print("Nerd cumcount scored:", elapsed)

Nerd cumcount scored: 0:00:01.284898


In [115]:
# Wall-clock time for 5000 calls of the numba-compiled loop.
n_runs = 5000
started = datetime.now()
for run in range(n_runs):
    for_loop_list_jit(time_inp)
elapsed = datetime.now() - started

print("For loop optimized jit scored:", elapsed)

For loop optimized jit scored: 0:00:01.267618


In [116]:
# Wall-clock time for 5000 calls of the numba-compiled sort-based version.
n_runs = 5000
started = datetime.now()
for run in range(n_runs):
    cumcount_jit(time_inp)
elapsed = datetime.now() - started

print("Cumcount optimized jit scored:", elapsed)

Cumcount optimized jit scored: 0:00:01.359427


# Implementation

In [119]:
def dfill(a):
    """Return, for each element of a run-grouped array, the index where its run starts.

    Example: ``[1, 1, 2, 3, 3]`` -> ``[0, 0, 2, 3, 3]``.

    Parameters
    ----------
    a : 1-D numpy.ndarray
        Values with equal entries adjacent (for instance a sorted array).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``a.size``. Empty input yields an empty array.
    """
    n = a.size
    # Boundaries of the runs: 0, each index where the value changes, and n.
    b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
    # Emit each run start once per element of the run. Repeating the starts
    # themselves avoids indexing np.arange(n), which fails when n == 0.
    return b[:-1].repeat(np.diff(b))

def argunsort(s):
    """Return the inverse of the index permutation ``s``.

    The result ``u`` satisfies ``u[s[k]] == k``, so indexing a sorted array
    with ``u`` puts it back into its pre-sort order.
    """
    size = s.size
    undo = np.empty(size, dtype=np.int64)
    undo[s] = np.arange(size)
    return undo

def cumcount(a):
    """Zero-based occurrence index of every element of ``a``.

    A stable sort puts equal values next to each other without changing their
    relative order; the distance of each sorted position from the start of its
    run is the occurrence index, which is then reordered back to input order.
    """
    stable_order = a.argsort(kind='mergesort')
    run_starts = dfill(a[stable_order])
    offsets_sorted = np.arange(a.size) - run_starts
    return offsets_sorted[argunsort(stable_order)]

In [167]:
def add_oui_and_oiu(edges, edges_t):
    """Compute chronological per-user and per-item transaction ordinals.

    Parameters
    ----------
    edges : numpy.ndarray
        2-D array indexed as ``edges[0, :]`` (user ids) and ``edges[1, :]``
        (item ids), one column per transaction.
    edges_t : numpy.ndarray
        Timestamp of each transaction, aligned with the columns of ``edges``.

    Returns
    -------
    (oui, oiu) : tuple of numpy.ndarray
        int64 arrays aligned with the input columns. ``oui[k]`` is the 1-based
        position of transaction ``k`` among its user's transactions in time
        order; ``oiu[k]`` is the same for its item.
    """
    # Stable sort so transactions with equal timestamps keep their input order.
    trans_order = np.argsort(edges_t, kind='mergesort')

    def ordinal_by_time(ids):
        # Count occurrences in chronological order, then scatter the counts
        # back to the original transaction positions. The buffer is explicitly
        # integer: inheriting the timestamp dtype would store counts as floats
        # (or fail outright for datetime timestamps).
        ordinals = np.empty(ids.shape, dtype=np.int64)
        ordinals[trans_order] = cumcount(ids[trans_order]) + 1
        return ordinals

    # oui = the user's x-th transaction; oiu = the item's x-th transaction.
    oui = ordinal_by_time(edges[0, :])
    oiu = ordinal_by_time(edges[1, :])

    return oui, oiu
    
add_oui_and_oiu(edges, edges_t)

(array([1, 3, 2, 2, 1, 1, 4]), array([1, 2, 2, 1, 1, 2, 1]))

In [164]:
# Toy data: column k of `edges` is one (user, item) transaction and
# `edges_t[k]` is its timestamp.
edges = np.array([[1, 1, 1, 2, 2, 3, 1],
        [6, 7, 8, 8, 7, 6, 9]])
edges_t = np.array([5, 16, 15, 14, 13, 12, 19])

trans_order = np.argsort(edges_t)
# Define sorted_users here: it was displayed without being assigned in this
# cell or any cell above it, so a fresh kernel raised NameError.
sorted_users = edges[0, :][trans_order]
sorted_users

array([1, 3, 2, 2, 1, 1, 1])

In [165]:
# Per-user ordinal: count in time order, then scatter back to input positions.
sorted_users = edges[0, :][trans_order]
oui = np.zeros_like(edges_t)
oui[trans_order] = 1 + cumcount(sorted_users)
oui

array([1, 3, 2, 2, 1, 1, 4])

In [166]:
# Per-item ordinal: count in time order, then scatter back to input positions.
sorted_items = edges[1, :][trans_order]
oiu = np.zeros_like(edges_t)
oiu[trans_order] = 1 + cumcount(sorted_items)
oiu

array([1, 2, 2, 1, 1, 2, 1])

In [146]:
cumcount(sorted_users) + 1

array([1, 1, 1, 2, 2, 3, 4])

In [144]:
edges[:, trans_order]

array([[1, 3, 2, 2, 1, 1, 1],
       [9, 6, 7, 8, 8, 7, 6]])

In [147]:
edges

array([[1, 1, 1, 2, 2, 3, 1],
       [6, 7, 8, 8, 7, 6, 9]])

In [151]:
oui

array([4, 3, 2, 2, 1, 1, 1])

In [174]:
import torch
import torch.nn as nn

In [175]:
emb = nn.Embedding(5, 16)
emb

Embedding(5, 16)

In [176]:
emb(torch.tensor(1))

tensor([ 0.3162, -0.1474,  0.4914,  0.9304, -0.9844,  0.7447,  1.7171, -0.4744,
         1.1330, -0.4233, -1.0334, -0.1280,  1.6228, -0.8561,  0.7262, -0.2493],
       grad_fn=<EmbeddingBackward0>)

In [179]:
emb.weight[1]

tensor([ 0.3162, -0.1474,  0.4914,  0.9304, -0.9844,  0.7447,  1.7171, -0.4744,
         1.1330, -0.4233, -1.0334, -0.1280,  1.6228, -0.8561,  0.7262, -0.2493],
       grad_fn=<SelectBackward0>)

In [180]:
from pprint import pprint

In [None]:
pprint