In [1]:
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import codecs
import os
import re
import matplotlib.pyplot as plt 
 
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
 
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

## 计算每条文本的词频数据框 

In [2]:
from collections import Counter
with open("mi10_user_retweet_fc.txt" , 'r' , encoding='utf-8') as f:
    all_text = f.read()
all_text = all_text.split('\n')
data = pd.DataFrame(columns = [0 , 1])
counts = []
for text in all_text:
    words = text.split(' ')
    word_freq = Counter(words)
    word_count = dict(Counter(words))
    every_df = pd.DataFrame([pd.Series(word_count.keys()) , pd.Series(word_count.values())]).T
    every_df.columns = ["词语" , "词频"]
    every_df = every_df.iloc[:len(every_df)-1 , :]
    counts.append(every_df)
counts

[      词语 词频
 0     图像  2
 1    传感器  2
 2     需求  1
 3     强劲  1
 4      找  1
 5     台积  1
 6      电  1
 7     代工  1
 8     手机  4
 9     拍照  3
 10    媲美  1
 11    行业  1
 12    常态  1
 13    像素  1
 14    表现  1
 15    厂商  2
 16     新  1
 17    领域  1
 18    新品  1
 19   发布会  1
 20    肌肉  1
 21    搬出  1
 22   排行榜  1
 23     商  1
 24    热销  1
 25    全球  1
 26  市场份额  1
 27    处于  1
 28  领先地位  1,
       词语 词频
 0     儿童  2
 1     用电  2
 2    儿童节  1
 3     来临  1
 4     提醒  1
 5     家长  2
 6     居家  1
 7     守护  1
 8     祖国  1
 9     花朵  1
 10    日记  1
 11    远离  2
 12    插座  4
 13    电器  1
 14    教导  1
 15   小朋友  1
 16    电线  2
 17    玩耍  1
 18    发生  1
 19    水杯  1
 20    放在  1
 21    将水  1
 22    小心  1
 23     洒  1
 24  电源插座  1
 25    短路  1
 26    触电  1,
    词语 词频
 0  实施  1
 1  能源  2
 2  制裁  1
 3  全球  1
 4  价格  1
 5  上涨  1,
     词语 词频
 0   汽油  2
 1   价格  3
 2   飙高  1
 3    飞  1
 4    高  1
 5    跌  1
 6    深  1
 7   主席  1
 8   停止  1
 9   超过  1
 10  石油  1
 11  标牌  1
 12  下跌  1,
     词语 词频
 0   国家

## 定义SimHash类（包含求SimHash值与海明距离计算）

In [3]:
import jieba
import jieba.analyse
import numpy as np

def simHash(keyWords):
    keyList = []
    for feature, weight in keyWords:
        # print('feature:' + feature)
#         print('weight: {}'.format(weight))
        # weight = math.ceil(weight)
        weight = int(weight)
        binstr = string_hash(feature)
#         print('feature: %s , string_hash %s' % (feature, binstr))
        temp = []
        for c in str(binstr):
            if (c == '1'):
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
#     print(np.sum(np.array(keyList), axis=0))
    listSum = np.sum(np.array(keyList), axis=0)
    if (keyList == []):
        return '00'
    simhash = ''
    for i in listSum:
        if (i > 0):
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash

def string_hash(source):
    if source == "":
        return 0
    else:
        temp = source[0]
        temp1 = ord(temp)
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        x = bin(x).replace('0b', '').zfill(64)[-64:]

        return str(x)

def getDistance(hashstr1, hashstr2):
    '''
        计算两个simhash的汉明距离
    '''
    length = 0
    for index, char in enumerate(hashstr1):
        if char == hashstr2[index]:
            continue
        else:
            length += 1
    return length

## 计算每条文本的SimHash值 

In [4]:
keyWords = []
hash_value = []
for i in range(len(counts)):
    for element in zip(list(counts[i]["词语"]) , list(counts[i]["词频"])):
        keyWords.append(element)
    hash_value.append(simHash(keyWords))
hash_value

['0010110010101010111000111110111111111001001100010101010000010011',
 '0010110101101100100001101110110111010000001101011111100000011011',
 '0010110101101100100001101110111111010001001101111111110000010011',
 '0010010101001000110001101110111111010001011111111101110000011011',
 '0010010100001000110001101100111111010001010101111100110100011011',
 '0010010100001000110001101100110111010001010100111100010100011011',
 '0010010100000000110001001100111111010001010100111100110100010011',
 '0010000100000000110001001100111111010001011100111100110100010011',
 '0010000100000000010001001100111111010001001101111101110100010011',
 '0010000100001000110001011101111111010001001110111101010100110011',
 '0010000100001000010001111100111111010001001111111101110100111011',
 '0010010100001000010001011100101111010001001001111100110110111111',
 '0010010101001000010001011100101111010001001101111100110110111111',
 '0010010000001000010001011100111111010001011010111100010110111011',
 '00100100000010000100010111001111

## 利用海明距离大小来判别两条文本的相似度 

In [5]:
remove_num = []
for i in range(len(hash_value)):
    remove = []
    for j in range(len(hash_value)):
        hashstr1 = hash_value[i]
        hashstr2 = hash_value[j]
        # 添加两个哈希值的海明距离到一个列表中，后面找出这个列表中海明距离小于设定值的索引（重复文本的索引）
        remove.append(getDistance(hashstr1, hashstr2))
        
    index_list = np.arange(0 , len(hash_value))
    for duplicate in index_list[np.array(remove) < 1]:
        # 避免与前面以及自身重复
        if duplicate > i:
            # 添加到重复索引列表中
            remove_num.append(duplicate)
np.array(remove_num)

array([ 14,  20,  21, ..., 997, 998, 998])