In [56]:
import re
import os
import numpy as np
import pandas as pd
import jieba
import jieba.analyse as ja
import matplotlib.pyplot as plt
from tqdm import tqdm

import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors

In [None]:
import os
import jieba
import jieba.posseg
from operator import itemgetter

_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
                                                 os.path.dirname(__file__), path))
_get_abs_path = jieba._get_abs_path

DEFAULT_IDF = _get_module_path("idf.txt")


# class KeywordExtractor(object):

#     STOP_WORDS = set((
#         "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
#         "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
#         "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
#     ))

#     def set_stop_words(self, stop_words_path):
#         abs_path = _get_abs_path(stop_words_path)
#         if not os.path.isfile(abs_path):
#             raise Exception("jieba: file does not exist: " + abs_path)
#         content = open(abs_path, 'rb').read().decode('utf-8')
#         for line in content.splitlines():
#             self.stop_words.add(line)

#     def extract_tags(self, *args, **kwargs):
#         raise NotImplementedError


class IDFLoader(object):

    def __init__(self, idf_path=None):
        self.path = ""
        self.idf_freq = {}
        self.median_idf = 0.0
        if idf_path:
            self.set_new_path(idf_path)

    def set_new_path(self, new_idf_path):
        if self.path != new_idf_path:
            self.path = new_idf_path
            content = open(new_idf_path, 'rb').read().decode('utf-8')
            self.idf_freq = {}
            for line in content.splitlines():
                word, freq = line.strip().split(' ')
                self.idf_freq[word] = float(freq)
            self.median_idf = sorted(
                self.idf_freq.values())[len(self.idf_freq) // 2]

    def get_idf(self):
        return self.idf_freq, self.median_idf


class TFIDF(KeywordExtractor):

    def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf()

    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
        """
        Extract keywords from sentence using TF-IDF algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                        if the POS of w is not in this list,it will be filtered.
            - withFlag: only work with allowPOS is not empty.
                        if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        if allowPOS:
            allowPOS = frozenset(allowPOS)
            words = self.postokenizer.cut(sentence)
        else:
            words = self.tokenizer.cut(sentence)
        freq = {}
        for w in words:
            if allowPOS:
                if w.flag not in allowPOS:
                    continue
                elif not withFlag:
                    w = w.word
            wc = w.word if allowPOS and withFlag else w
            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
                continue
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        for k in freq:
            kw = k.word if allowPOS and withFlag else k
            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

        if withWeight:
            tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(freq, key=freq.__getitem__, reverse=True)
        if topK:
            return tags[:topK]
        else:
            return tags

In [74]:
class werain_v1:
    def __init__(self):
#         self.path = path
        self.all_text = self.load_data()
        self.clean_data = None
    
    def load_data(self):
        return pd.read_csv('all_text.csv')
    
    def get_text_pro(self):
        """
            仅仅获得原创文本
        """
        
        print('【获得原创文本】：\n')
        all_text = self.all_text
        print('all_text所有文本的数目为：', len(all_text))
        df_text = all_text[all_text['length_retweet_text']==0][['length_text','text_pro']].reset_index(drop=True)
        df_text.drop_duplicates(subset='text_pro',inplace=True)
        df_text = df_text.reset_index(drop=True)
        print('原创微博文本的数目为：', len(df_text))
        print(df_text.head(3),'\n\n')
        return df_text
    
    def get_all_text(self):
        """
            获得全部文本，合并原创文本text_pro和转发文本retweet_text_pro
        """
        tmp = self.all_text
        tmp = tmp.fillna('')
        text = list()
        for i in range(len(tmp['text_pro'].to_list())):
            text.append(tmp['text_pro'][i] +" "+ tmp['retweet_text_pro'][i])
        tmp['text'] = text
        return tmp
    
    def clean_one_text(self,text):
        """
            清洗单条文本
        """
        pattern_special = r'//@.*:|//@.*：|(来自.*)|@.*\s|http://.*|\[(.*?)\]'
        pattern_text = r'分享网易新闻|分享|转载|微博调查|我发起了一个投票'
        
        # 1. 删除由特殊格式限定的广告、用户名、链接等
        x2 = re.sub(pattern_special," ",text)

        # 2. 仅保留中文
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        filedata = re.findall(pattern, x2)
        x3 = ''.join(filedata)

        # 3. 去掉微博评论特有的无效信息
        new_xx = re.sub(pattern_text,'',x3)
        return new_xx
    
    def data_clean(self):
        
        
        # （1）获得原创文本
        # text = self.get_text_pro()['text_pro']
        
        # （1）获得全部文本，合并原创文本text_pro和转发文本retweet_text_pro
        print('【获得全部文本】：')
        text = self.get_all_text()['text']
        print('全部微博文本的数目为：', len(text),"\n")
        
        # （2）短文本清洗，过滤无效字符
        print('【短文本清洗】：')
        new_text = [ self.clean_one_text(text[i]) for i in range(len(text))]
        df_new_text = pd.DataFrame(data = [len(new_text[j]) for j in range(len(new_text))],columns=['length_text'])
        df_new_text['new_text'] = new_text
        print(df_new_text.head(3),'\n')
        
        # （3）删除字符数小于15的短文本
        print('【删除过短文本】：')
        print("删除短文本前评论数目：",len(df_new_text['length_text']))
        new_text = df_new_text[df_new_text['length_text']>=15]['new_text'].reset_index(drop=True)
        print('删除短文本后评论数目：',len(new_text))
        
        # （4）保存
        self.clean_data = new_text
        return new_text
    
    def cal_tf_idf(self):
        
        new_text = self.clean_data
        
        # 引入停用词
        new_stopwords = [line.strip() for line in open("chineseStopWords.txt", 'r', encoding='GBK').readlines()]
        for sw in new_stopwords:
            ja.TFIDF.STOP_WORDS.add(sw)
        
        query_keywords = ['北京','暴雨','大雨']
        for qk in query_keywords:
            ja.TFIDF.STOP_WORDS.add(qk)

        # 合并所有的comments 
        all_comments = ""
        for i in tqdm(range(len(new_text))):
            tt = re.sub(r'北京|暴雨|大雨','',new_text[i])
            all_comments += tt + ' '
        print("Length of comments: ",len(all_comments))
        
        keywords = ja.extract_tags(sentence=all_comments, topK=200, withWeight=True, allowPOS=('n','ns','nr'))
        word_tf_idf = {x[0]:x[1] for x in keywords}
        df_tf_idf = pd.DataFrame({'Keyword':list(word_tf_idf.keys()), 'TF_IDF':list(word_tf_idf.values())})
        df_tf_idf.to_csv('df_tf_idf.csv')
        return df_tf_idf

In [75]:
wr_1 = werain_v1()

In [130]:
new_text = wr_1.data_clean()

【获得全部文本】：
全部微博文本的数目为： 94043 

【短文本清洗】：
   length_text                                           new_text
0           69  这是不是无耻到一定境界了抗击暴雨的普通百姓居然是北京精神的作用人们的团结热心居然是某几位领导...
1          127  对比北京暴雨俄罗斯洪灾名官员因失职被捕俄罗斯南部于今年月日遭暴雨袭击并引发洪灾亡人数达人此前...
2           19                                北京市民坚持住众志成城定可度过暴雨阶段 

【删除过短文本】：
删除短文本前评论数目： 94043
删除短文本后评论数目： 77261


In [77]:
df_tf_idf = wr_1.cal_tf_idf()

100%|█████████████████████████████████████████████████████████████████████████| 77261/77261 [00:06<00:00, 12047.69it/s]


Length of comments:  5410194


In [124]:
words = df_tf_idf['Keyword'].to_list()

In [141]:
data = pd.DataFrame(new_text)
data.drop_duplicates(inplace=True)
data = data.reset_index(drop=True)

In [None]:
in_words = list()
for i in tqdm(range(len(data))):
    in_word = list()
    for word in words:
        if(word in data['new_text'][i]):
            in_word.append(word)
    in_words.append(in_word)
    

In [145]:
data['in_words'] = in_words

In [156]:
data['new_text'][63089]

'北京众志成城等雨媒体也夸张的说了好几天看来北京人这次是被雨下到了期待真正有效的排水系统产生永久解决后顾之忧'

In [146]:
data

Unnamed: 0,new_text,in_words
0,这是不是无耻到一定境界了抗击暴雨的普通百姓居然是北京精神的作用人们的团结热心居然是某几位领导...,"[人们, 领导, 精神, 结果]"
1,对比北京暴雨俄罗斯洪灾名官员因失职被捕俄罗斯南部于今年月日遭暴雨袭击并引发洪灾亡人数达人此前...,"[政府, 洪水, 人数]"
2,北京市民坚持住众志成城定可度过暴雨阶段,[市民]
3,北京暴雨死了这么多人不但新闻上不了头条连主持人都穿着这么鲜艳的衬衣真是一个毫无人性的国家,"[新闻, 国家]"
4,大眼好样的从什邡到北京暴雨那些公知们忙着晒裸照约口水战粉饰太平时很高兴看到还有你这样有良知的...,"[城市, 下水道, 国家]"
5,截至年月日时北京暴雨后共发现人死亡年以来全国的人民和财力对北京支持不知有多少结果建成了个这样...,"[人民, 世界, 国家, 结果, 京城]"
6,微博点亮正能量倒数第四段提到那个曾大骂香港人是狗的大正幸灾乐祸这一夜我们目睹美与丑一夜之间北...,"[城市, 市民, 被淹, 人们, 车辆, 香港, 个别, 倾盆, 司机, 人员, 罚单]"
7,北京的暴雨太可怕了幸好爸妈最近都不在那边亲戚们一切平安真是最大的安慰了,[]
8,北京连下场雨都这么恐怖转发微博北京广渠门被淹在车里的人没有生命体征了痛心在这里告诉大家如何车...,"[大家, 广渠门, 被淹, 下场, 车窗]"
9,在理伦敦汉堡不知道纽约下几场大雨也会有很多地方积水但我在纽约还没见过这么大的雨这种比较其实没...,"[积水, 降雨, 雨量, 广渠门, 地方, 降雨量, 伦敦, 水深]"


In [151]:
with open("user_dict.txt","w",encoding="utf-8") as f:
    f.write('排水系统')

In [152]:
jieba.load_userdict("user_dict.txt")

In [159]:
keywords = ja.extract_tags(sentence=data['new_text'][63089], topK=10, withWeight=True, allowPOS=('n','ns','nr'))
keywords    

[('众志成城', 2.3005564758399997),
 ('北京', 1.866960923488),
 ('媒体', 1.200973854978),
 ('产生', 0.908125001354)]