In [1]:
import numpy as np
import collections
from collections import defaultdict
import random

### 问题
对一个数组 nums 和一个整数 k ，返回其中出现频率前 k 高的元素

### Demo Case

In [7]:
nums = [1,1,1,1,1,2,2,3,3,3,3,3,3,3,6,6,6,6,4,4,4,4,4,4,9,7,7,7,7]
k = 3

- 统计不重复元素的频数  $O(n)$

In [8]:
stat = collections.Counter(nums)
stat = list(stat.items()) #频次列表
stat

[(1, 5), (2, 2), (3, 7), (6, 4), (4, 6), (9, 1), (7, 4)]

### 1.基于堆（小顶堆）
时间复杂度：$O(nlogk)$
空间复杂度：$O(k)$

In [4]:
#取父/子节点基本操作
def lchild(node):
    return node << 1
def rchild(node):
    return node << 1 | 1
def father(node):
    return node >> 1
#上浮节点，用于向堆插入新节点
def heap_up(heap,node):
    val = heap[node]
    while father(node)>0 and val[1]<heap[father(node)][1]:
        heap[node] = heap[father(node)]
        node = father(node)
    heap[node] = val
#下沉节点，用于调整堆
def heap_down(heap,node,k):
    root = node #root作为变量逐步下沉
    val = heap[node] #存储node原值
    while lchild(root) <= k:
        child = lchild(root) #先选取左子节点
        if rchild(root) <= k and child < rchild(root): #如果右更小，选取右子节点
            child = rchild(root)
        #验证确实当前节点值大于选取的子节点，则交换
        if heap[child][1] < val[1]:
            heap[root] = heap[child]
            root = child
        else: #否则则找到位置，结束循环
            break
    heap[root] = val #最后赋值
#堆排序输出
def heap_sort(heap):
    for i in range(len(heap)-1, 0, -1):
        heap[1], heap[i] = heap[i], heap[1]
        heap_down(heap, 1, i)
#主函数
def HeapFrequent(nums,k,stat):
    #stat = collections.Counter(nums)
    #stat = list(stat.items())
    heap = [(0,0)] #占位上界
    #使用heap_up()建堆 规模：k+1(包括占位上界) 
    for i in range(k):
        heap.append(stat[i])
        heap_up(heap, len(heap)-1)
    #使用heap_down()维护堆 新元素大于堆顶即下沉
    for i in range(k,len(stat)):
        if stat[i][1] > heap[1][1]: #heap[1]为堆顶
            heap[1] = stat[i] #去除原根（堆顶）节点
            heap_down(heap,1,k) #将node下沉
    #使用heap_sort()排序，并倒序得到从大到小结果
    heap_sort(heap)
    result = heap[1:][::-1]
    return result

In [5]:
HeapFrequent(nums,k,stat)

[(3, 7), (4, 6), (1, 5)]

### 2.基于桶排序
哈希空间集中时，时间、空间复杂度均接近$O(n)$ 
分散时则增添很多无谓循环开销

In [4]:
def BucketFrequent(nums,k,spacesize,stat):
    #stat = collections.Counter(nums)
    #stat = list(stat.items())
    Bucket = [[] for i in range(spacesize)]
    for s in stat:
        Bucket[s[1]].append(s)
        
    r = 0
    flag = False
    result = []
    for i in range(len(Bucket)-1,-1,-1):
        if not flag and Bucket[i] != []:
            for s in Bucket[i]:
                result.append(s)
                r += 1
                if r == k:
                    flag = True
                    break
    return result

In [5]:
space = 10
BucketFrequent(nums,k,space,stat)

[(3, 7), (4, 6), (1, 5)]

In [43]:
def BucketElement(nums,k,spacesize):
    stat_dict = defaultdict(list)
    for i,s in enumerate(nums):
        stat_dict[s].append(i)
    stat_keys = list(stat_dict.keys())
    stat = [(key,len(stat_dict[key])) for key in stat_keys]
    frequent = BucketFrequent(nums,k,spacesize,stat)
    print(frequent)
    elements = [(freq[0],freq[1],stat_dict[freq[0]]) for freq in frequent]
    print(elements)
    #print(stat)
    #Bucket = [[] for i in range(spacesize)]
    
    #print(stat_dict)
    #stat_keys = list(stat_dict.keys())
    #stat_keys.sort(reverse=True)
    #result = []
    #for i in range(k):
    #    result.append((stat_keys[i],stat_dict[stat_keys[i]]))
    #return result

In [44]:
BucketElement(nums,k,space)

[(3, 7), (4, 6), (1, 5)]
[(3, 7, [7, 8, 9, 10, 11, 12, 13]), (4, 6, [18, 19, 20, 21, 22, 23]), (1, 5, [0, 1, 2, 3, 4])]


In [19]:

stat_dict = defaultdict(list)
for i,s in enumerate(nums):
    stat_dict[s].append(i)

In [22]:
stat_keys = list(stat_dict.keys())
stat_keys.sort(reverse=True)

In [23]:
for i in range(k):
    result.append(stat_keys[i],stat_dict[stat_keys[i]])
return result

[9, 7, 6, 4, 3, 2, 1]

In [3]:
np.random.choice(, 20, replace=False)

0.5088196162299449