In [1]:
import json
import jieba
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.externals import joblib
from sklearn import preprocessing
import numpy as np

import TfIdf_SinglePass

## 测试4k个feature

### 得到tf-idf

In [7]:
def get_tf_idf_of_rumor(features_num=4000):
    """Build a TF-IDF matrix for the pre-segmented rumor corpus and cache it.

    Every line of ``file/corpus/cut_corpus_of_rumor.txt`` is treated as one
    document. The documents are vectorised with ``CountVectorizer`` limited to
    ``features_num`` features, re-weighted with ``TfidfTransformer``, and the
    pair ``(vocabulary, tf_idf)`` is dumped with joblib.

    Args:
        features_num: Value passed as ``max_features`` to ``CountVectorizer``.
            It is also embedded in the output file name, so different feature
            counts no longer overwrite the same ``..._4000.pkl`` file.

    Returns:
        None. The result is written to
        ``file/pkl/tf_idf_of_rumor_<features_num>.pkl``.
    """
    # NOTE(review): no explicit encoding is given here, unlike the later cells
    # that pass encoding='utf-8' — confirm how this corpus file was written
    # before changing it.
    with open('file/corpus/cut_corpus_of_rumor.txt', 'r') as src:
        corpus = src.readlines()
    print('The size of corpus is {}'.format(len(corpus)))

    vectorizer = CountVectorizer(max_features=features_num)
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # NOTE(review): get_feature_names() is kept to match the scikit-learn
    # version this notebook targets (it still imports sklearn.externals.joblib).
    vocabulary = vectorizer.get_feature_names()

    # Derive the file name from features_num instead of hard-coding "4000".
    joblib.dump((vocabulary, tf_idf),
                'file/pkl/tf_idf_of_rumor_{}.pkl'.format(features_num))

# Build and cache the rumor TF-IDF matrix with the default feature count.
get_tf_idf_of_rumor()

The size of corpus is 34611


In [3]:
# Load the cached (vocabulary, tf_idf) pair for the 'truth_4_fold' category.
# NOTE(review): this section is headed as the rumor/4k-feature test, yet it
# loads the truth pickle — confirm this is the intended input.
vocabulary, tf_idf = joblib.load('file/pkl/tf_idf_of_{}.pkl'.format('truth_4_fold'))
# Densify the matrix and L2-normalise it; the later cells index into this array.
tf_idf_array = preprocessing.normalize(tf_idf.toarray(), norm='l2')

In [12]:
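# NOTE(review): these appear to map a weibo position (event_index, weibo_index)
# to its row index in the corpus / tf_idf_array — TODO confirm.
# (166,3) -> 29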
# (2734, 12) -> 8954
# (2340, 40) -> 13286

# (7, 30) -> 66
# (395, 23) -> 2695

In [4]:
# Inspect the dimensions of the dense, normalised TF-IDF array.
tf_idf_array.shape

(38180, 4000)

In [10]:
# Check whether the token 'amp' ended up in the vocabulary
# (possibly a leftover of the HTML entity '&amp;' — TODO confirm).
'amp' in vocabulary

True

In [8]:
def _cosine_similarity(vec_a, vec_b):
    # vec_a (1, dim)-array
    # vec_b (dim, 1)-array
    return float(vec_a.dot(vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))

# Spot-check the similarity between rows 66 and 2695 of the TF-IDF array.
_cosine_similarity(tf_idf_array[66], tf_idf_array[2695])

0.6393048350717826

### 查看聚类效果

In [18]:
def _clustering(category, threshold=0.6, max_samples=1000):
    """Run single-pass clustering on a cached TF-IDF matrix and pickle the result.

    Loads ``file/pkl/tf_idf_of_<category>.pkl`` (a ``(vocabulary, tf_idf)``
    pair, of which only ``tf_idf`` is used), L2-normalises the dense matrix,
    feeds the leading rows to ``TfIdf_SinglePass.SinglePassCluster`` and dumps
    the resulting object to ``file/pkl/tf_idf_<category>_clustering.pkl``.

    Args:
        category: Suffix that selects both the input and the output pickle.
        threshold: Passed to ``SinglePassCluster`` as ``t``.
        max_samples: Number of leading rows to cluster. Defaults to 1000,
            which was previously hard-coded; pass ``None`` to cluster every
            row.

    Returns:
        None. The clustering object is written to disk.
    """
    _, tf_idf = joblib.load('file/pkl/tf_idf_of_{}.pkl'.format(category))
    tf_idf_array = preprocessing.normalize(tf_idf.toarray(), norm='l2')

    # Slicing with [:None] keeps all rows, so None means "no cap".
    single_pass_cluster = TfIdf_SinglePass.SinglePassCluster(
        tf_idf_array[:max_samples], t=threshold)
    joblib.dump(single_pass_cluster, 'file/pkl/tf_idf_{}_clustering.pkl'.format(category))

In [22]:
# _clustering('rumor_4000')

In [20]:
def test_on_client(category):
    """Write the rumor corpus lines of every cluster to a text file for review.

    The clustering object is loaded from
    ``file/pkl/tf_idf_<category>_clustering.pkl``. For each entry of its
    ``cluster_list``, the lines of ``file/corpus/corpus_of_rumor.txt`` indexed
    by ``node_list`` are written to ``file/test_<category>.txt``, followed by
    a dashed separator line.

    Args:
        category: Suffix selecting the clustering pickle and the output file.
    """
    clustering = joblib.load('file/pkl/tf_idf_{}_clustering.pkl'.format(category))
    clusters = clustering.cluster_list
    with open('file/corpus/corpus_of_rumor.txt', 'r') as corpus_file, \
            open('file/test_{}.txt'.format(category), 'w') as report:
        corpus_lines = corpus_file.readlines()
        for group in clusters:
            # One corpus line per member of the cluster, in node_list order.
            report.writelines(corpus_lines[idx] for idx in group.node_list)
            report.write('-----------------------------------\n')

In [21]:
# Dump the 'rumor_4000' clusters to file/test_rumor_4000.txt for manual inspection.
test_on_client('rumor_4000')

# 真实微博

In [2]:
import json
import jieba
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.externals import joblib
from sklearn import preprocessing
import numpy as np
import random

import TfIdf_SinglePass

In [4]:
# single_pass_cluster = joblib.load('file/pkl/tf_idf_{}_clustering.pkl'.format('truth_4000'))
# cluster_list = single_pass_cluster.cluster_list

In [71]:
def gen_filtered_truth():
    """Pick one weibo with pictures per cluster and report how many clusters qualify.

    Depends on a module-level ``cluster_list`` that this function does not
    create (the code that loads it is commented out below), so it raises
    ``NameError`` on a fresh kernel unless ``cluster_list`` is defined first.

    For every cluster, each index in ``cluster.node_list`` selects a line of
    ``file/corpus/corpus_of_truth.txt``. The start of that line is parsed into
    an ``(event_index, weibo_index)`` pair, which locates the weibo inside the
    JSON event stored on line ``event_index`` of
    ``../weibo_truth_analysis/file/weibo_truth.txt``. Weibos whose ``piclist``
    is a non-empty list are kept and tagged with a ``_position`` field.
    A cluster with at least one such weibo counts as valid and one of them is
    chosen at random; the code that writes the chosen weibo out is currently
    commented out, so only the two counters are printed.

    Returns:
        None.
    """
#     single_pass_cluster = joblib.load('file/pkl/tf_idf_{}_clustering.pkl'.format('truth_4000'))
#     cluster_list = single_pass_cluster.cluster_list

#     out = open('file/weibo_truth_text_filtered.json', 'w', encoding='utf-8')
#     out_pretty = open('file/weibo_truth_text_filtered_pretty.json', 'w', encoding='utf-8')
    with open('../weibo_truth_analysis/file/weibo_truth.txt', 'r', encoding='utf-8') as src:
        events = src.readlines()
    with open('file/corpus/corpus_of_truth.txt', 'r', encoding='utf-8') as corpus:
        lines = corpus.readlines()

    valid_cluster_num = 0
    missing_pics_cluster_num = 0

    for cluster in cluster_list:
        truth_weibos = []

        for index in cluster.node_list:
            # The line prefix holds the position; drop the first character of
            # each field before converting it to int.
            fields = lines[index].split(',')
            event_index = int(fields[0][1:])
            weibo_index = int(fields[1].split(')')[0][1:])

            # json.loads() no longer accepts an `encoding` keyword
            # (TypeError on Python >= 3.9); the input is already str.
            event = json.loads(events[event_index])
            truth_weibo = event['weibo'][weibo_index]

            piclist = truth_weibo.get('piclist')
            if isinstance(piclist, list) and piclist:
                # Add the _position field.
                truth_weibo['_position'] = (event_index, weibo_index)
                truth_weibos.append(truth_weibo)

        # Selection rule: pick at random among the weibos that have pictures.
        if not truth_weibos:
            missing_pics_cluster_num += 1
            continue
        valid_cluster_num += 1
        # Only consumed by the commented-out writer below.
        chosen_truth = random.choice(truth_weibos)

#         out.write('{}\n'.format(json.dumps(chosen_truth, ensure_ascii=False)))
#         out_pretty.write('{}\n'.format(json.dumps(chosen_truth, ensure_ascii=False, indent=4, separators=(',', ':'))))
#         out.flush()
#         out_pretty.flush()
#     out.close()
#     out_pretty.close()

    print('有效的簇为{}个，缺少图片的簇为{}个'.format(valid_cluster_num, missing_pics_cluster_num))

In [72]:
# Count valid / picture-less clusters; requires `cluster_list` to exist in the kernel.
gen_filtered_truth()

有效的簇为36827个，缺少图片的簇为20441个


In [63]:
def show_clustering_truth():
    """Dump the clustered truth corpus to a text file and print summary statistics.

    Depends on a module-level ``cluster_list`` that this function does not
    create (the loading code below is commented out), so it raises
    ``NameError`` on a fresh kernel unless ``cluster_list`` is defined first.

    Step 1 writes, for every cluster, the lines of
    ``file/corpus/corpus_of_truth.txt`` indexed by ``cluster.node_list`` to
    ``file/truth_4000.txt``, followed by a dashed separator line.

    Step 2 reads ``file/weibo_truth_text_filtered.json`` (one JSON object per
    line) and prints the number of records, the total length of their
    ``piclist`` fields and the distribution of ``userCertify`` (0, 1, any
    other value). Records without ``userCertify`` are not counted in the
    distribution.

    Returns:
        None.
    """
    # NOTE(review): the commented-out loader refers to 'rumor_4000' although
    # this function works on the truth corpus — confirm before re-enabling.
#     single_pass_cluster = joblib.load('file/pkl/tf_idf_{}_clustering.pkl'.format('rumor_4000'))
#     cluster_list = single_pass_cluster.cluster_list

    # Write out the text of the clustered corpus.
    with open('file/corpus/corpus_of_truth.txt', 'r', encoding='utf-8') as src:
        with open('file/truth_4000.txt', 'w', encoding='utf-8') as out:
            lines = src.readlines()
            for cluster in cluster_list:
                for i in cluster.node_list:
                    out.write(lines[i])
                out.write('-----------------------------------\n')

    # Count weibos, pictures and the userCertify distribution.
    with open('file/weibo_truth_text_filtered.json', 'r', encoding='utf-8') as src:
        lines = src.readlines()

    filtered_weibo_num = len(lines)
    filtered_pic_num = 0
    certify_0 = 0
    certify_1 = 0
    certify_2 = 0

    for line in lines:
        # json.loads() no longer accepts an `encoding` keyword
        # (TypeError on Python >= 3.9); the input is already str.
        truth = json.loads(line)
        filtered_pic_num += len(truth['piclist'])
        if 'userCertify' in truth:
            certify = truth['userCertify']
            if certify == 0:
                certify_0 += 1
            elif certify == 1:
                certify_1 += 1
            else:
                certify_2 += 1

    # The ratios are expressed relative to the third bucket; report NaN rather
    # than raising ZeroDivisionError when that bucket is empty.
    if certify_2:
        ratio_0 = certify_0 / certify_2
        ratio_1 = certify_1 / certify_2
    else:
        ratio_0 = ratio_1 = float('nan')

    print('聚类后的真实微博：数量为{}，图片数量为{}'.format(filtered_weibo_num, filtered_pic_num))
    print('（{}）{}:{}:{} = {:.1f} : {:.1f} : 1'.format(
        certify_0 + certify_1 + certify_2, certify_0, certify_1,
        certify_2, ratio_0, ratio_1))

In [64]:
# Dump the clustered truth corpus and print the statistics of the filtered weibos.
show_clustering_truth()

聚类后的真实微博：数量为36827，图片数量为79076
（36827）13967:6581:16279 = 0.9 : 0.4 : 1


In [65]:
# Total number of clusters currently held in the kernel-level `cluster_list`.
len(cluster_list)

57268