In [65]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import re
import emoji
import jieba
import jieba.posseg as pseg #这个包可以标注词性，我们只需要n,v,a
from zhconv import convert

import matplotlib.pyplot as plt
%matplotlib inline

#### 单平台数据初步预处理

In [82]:
df_zhihu = pd.read_csv('../ITP/data_zhihu_addition.csv')
df_zhihu['source'] = 'zhihu'
print(f'Zhihu Addition Set Shape = {df_zhihu.shape}')  
print('Zhihu Addition Set Memory Usage = {:.2f} MB'.format(df_zhihu.memory_usage().sum() / 1024**2))

Zhihu Addition Set Shape = (101694, 9)
Zhihu Addition Set Memory Usage = 6.98 MB


In [83]:
df_zhihu.head()

Unnamed: 0,author,fans_count,content,created_time,updated_time,comment_count,voteup_count,url,source
0,一只可尼熊,12,不建议 图情不是重点院校的话 很水,2019-08-31 15:33:32,2019-08-31 15:33:32,0,0,https://www.zhihu.com/api/v4//answers/809074756,zhihu
1,知乎用户,71,在某些領域相當一部份都是華人，而且內地高校有些水平也不錯，拿過頂會best paper（比如...,2014-12-04 11:10:27,2015-01-06 16:25:50,1,2,https://www.zhihu.com/api/v4//answers/34518749,zhihu
2,Chris,73,题主问的是中国AI水平，下面一堆人回答华人AI水平，他们代表的很多是美国的水平吧？,2017-08-25 08:24:07,2017-08-25 08:24:07,0,2,https://www.zhihu.com/api/v4//answers/220067758,zhihu
3,知乎用户,120,这个说来也挺奇怪的。其实在美帝很多高校做这块的都是华人，甚至可以放宽一些，计算机系里面华人比...,2014-12-06 18:44:39,2014-12-06 18:44:39,1,1,https://www.zhihu.com/api/v4//answers/34657913,zhihu
4,知乎用户,55,这问题让人如何答好吧，不错的水平,2014-12-09 08:22:52,2014-12-09 08:22:52,0,0,https://www.zhihu.com/api/v4//answers/34797357,zhihu


In [84]:
# rename columns
df_zhihu = df_zhihu.rename(columns={'content':'comments','updated_time':'time'})

# convert data type
df_zhihu['time'] = df_zhihu['time'].astype('datetime64[ns]')

# extract data(source, topics, comments, time)
df_zhihu = df_zhihu[['source','comments','time']]

Merge data from four platforms...

In [85]:
## merge data from four platforms
df = df_zhihu.copy()

#### 规范化预处理（Cleansing）

In [86]:
# drop duplicated
print('Data size = ',df.shape)
df = df.drop_duplicates(subset='comments')
# print('Duplicates data size = ',df.duplicated(subset='comments').sum())
print('After drop duplicated data size = ',df.shape)

# drop data missing time or comments
df.dropna(subset=['comments'],inplace=True)
df.dropna(subset=['time'],inplace=True)
print('After drop missing data size = ',df.shape)
df.index = range(0,len(df))

Data size =  (101694, 3)
After drop duplicated data size =  (98561, 3)
After drop missing data size =  (98560, 3)


In [87]:
# data clean
def clean(text):
    '''
    正则化
    '''
    
    # 清除@和回复/转发中的用户名
    text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:| |$)", "", text).strip()
    
    # 清除emoji表情
    text = emoji.demojize(text).strip()   
    
    # 清除##话题内容
    text = re.sub(r"#\S+#", "", text).strip()    
    
    # 清除网址link
    URL_REGEX1 = re.compile('</?\w+[^>]*>')
    URL_REGEX2 = re.compile('(https|http|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]')
    text = re.sub(URL_REGEX1, "", text).strip()
    text = re.sub(URL_REGEX2, "", text).strip()
    
    # 清除数字
    text = re.sub('[\d]+','',text).strip()
    
    # 清除字母
    text = re.sub('[a-zA-Z]+','',text).strip()
    
    # 清除无意义的词语
    text = text.replace("转发微博", "").strip()
    
    # 合并多余的空格
    text = re.sub(r"\s+", " ", text).strip() 
    
    # 清除标点
    r = "[_.+-=——$%^~@#￥%……&*《》<>「」{}【】()“”\"/]+"
    text = re.sub(r, '', text).strip()
    
    return text

In [88]:
def simplify(text):
    try:
        return convert(text,'zh-hans')
    except:
        return None

In [89]:
tqdm.pandas()
df['comments'] = df['comments'].progress_apply(clean) # 综合正则化处理
df['comments'] = df['comments'].progress_apply(simplify) # 繁体字简体化
# df['topics'] = df['topics'].progress_apply(clean) # 繁体字简体化

100%|███████████████████████████████████| 98560/98560 [00:11<00:00, 8236.61it/s]
100%|███████████████████████████████████| 98560/98560 [00:10<00:00, 9737.37it/s]


In [90]:
# drop duplicated
print('Data size = ',df2.shape)
df2.drop_duplicates(subset='comments',inplace=True)
print('After drop duplicated data size = ',df2.shape)

Data size =  (97906, 3)
After drop duplicated data size =  (97906, 3)


#### 数据过滤预处理（overview_data）

In [91]:
def detect_encoding(file):
    #获取文件最可能的编码格式
    import chardet
    with open(file, "rb") as f:
        r = f.read()
    e = chardet.detect(r)
    encoding = e.get("encoding")
    return encoding

def get_stopwords(stop_words_file, encoding):
    """
    读取停用词文件
    参数：
        停用词文件
    返回：
        停用词列表
    """
    
    with open(stop_words_file,encoding = encoding) as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

def get_tokenization(text):
    """
    使用jieba分词并剔除停用词
    参数：
        文本字符串
    返回：
        以空格连接的词串
    """
    word_token = []
    for word in jieba.lcut(text.strip(), cut_all=False): #精确模式
        if word not in STOPWORDS:
            word_token.append(word)
    return ' '.join(word_token)

def clear_singlechar(text):
    """
    清除切词后的单个字符
    """
    word_cleared = []
    for word in text.strip().split(r' '):
        if len(word)!=1:
            word_cleared.append(word)
    return ' '.join(word_cleared)

def pos_tag_filter_nva(text):
    finance_noisy = ["疫情", "股市", "风险投资", "风险融资", "风险资本", "信贷风险", "贷款风险", "投资风险", "交易风险", "信用风险", "风险评分", "金融风险", "基金", "个股"]
    
    words = pseg.cut(text)
    filtered_words = []
    for word, flag in words:
        # a = ("{0}/{1}".format(word, flag))
        if (flag == 'n'or flag == 'v' or flag == 'a') and (len(word) > 1) and word not in finance_noisy:
            filtered_words.append(word)
    return ' '.join(filtered_words)

In [92]:
# 停用词表
stop_qua = r'ー、一、二、三、四、五、一点、两个、三个、一些、一种、几个、几种；和、跟、与、既、同、及、而、况、况且、何况、乃至、则、乃、就、而、便、于是、然后、至于、说到、此外、像、如、一般、比方、接着、却、虽然、但是、然而、而、偏偏、只是、不过、至于、致、不料、岂知、原来、因为、由于、以便、因此、所以、是故、以致、或、或者、还是、抑、非…即、不是…就是、若、如果、若是、假如、只要、除非、假使、倘若、即使、假若、要是、譬如、像、好比、如同、似乎、等于；不如、不及、与其…不如、若…则、虽然…可是、虽然、固然、尽管、纵然、即使、不但、不仅、而且、何况、并、且、不管、只要、除非、以、以便、以免、为了'

STOPWORDS = list(set(get_stopwords('../ITP/Data collection/stopwordsHIT.txt', encoding=detect_encoding('../ITP/Data collection/stopwordsHIT.txt')) + get_stopwords(r'../ITP/Data collection/stopwords.txt', encoding='gbk') + stop_qua.split(r'、')))
print(f'Number of stop words = {len(STOPWORDS)}')

Number of stop words = 2576


In [93]:
# 切词
df = df.copy()
jieba.load_userdict('../ITP/Data collection/dict.txt')

tqdm.pandas()
df['tokenization'] = df['comments'].progress_apply(get_tokenization)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/29/27cgqwrx7_18wcd66rpffzlc0000gn/T/jieba.cache
Loading model cost 0.367 seconds.
Prefix dict has been built successfully.
100%|████████████████████████████████████| 98560/98560 [07:46<00:00, 211.41it/s]


In [94]:
# 清除单个字符
df = df.copy()
tqdm.pandas()
df['tokenization'] = df['tokenization'].progress_apply(clear_singlechar)

100%|██████████████████████████████████| 98560/98560 [00:01<00:00, 94737.35it/s]


In [95]:
# 过滤切词结果，保留n/v/adj
df = df.copy()
jieba.load_userdict('../ITP/Data collection/dict.txt')

tqdm.pandas()
df['tokenization_filtered'] = df['tokenization'].progress_apply(pos_tag_filter_nva)

100%|████████████████████████████████████| 98560/98560 [03:06<00:00, 529.59it/s]


#### 对微博含有广告、打榜等重复噪音数据的补充处理

In [None]:
# # 1.根据关键词对微博数据再做初筛 (主要针对微博数据)
# words_list = ['风险','安全隐患','威胁','危险','危机','后果','危害','安全事件','预防','安全事故','风险管理','负面影响','灾害','困境','欺诈','失误','漏洞']
# risk_words = str()
# for item in words_list:
#     risk_words = risk_words+str(item)+'|'
    
# # 筛掉所有不含有“人工智能”和“risk_words”中关键词的内容
# data_weibo_filtered = data_weibo[(data_weibo['tokenization_filtered'].str.contains("人工智能"))==True]

# # 筛掉所有不含有“人工智能”和“words_list”中关键词的内容
# data_weibo_filtered= data_weibo_filtered[(data_weibo_filtered['tokenization_filtered'].str.contains(risk_words))==True]

In [None]:
# # 2.使用fuzzywuzzy库，对数据进行遍历，剔除掉相似度75%以上的数据（主要针对微博数据）

# # Final Version (itertools.islice)
# def Similarity(index_tuples_list):
#     res = []
# #     print(index_tuples_list)
#     for p in index_tuples_list:
#         similarity = fuzz.ratio(data_weibo_filtered['comments'][p[0]],data_weibo_filtered['comments'][p[1]])
#         if similarity > 75:
#             res.append([p[0],p[1],similarity])
#     return res

# pair_similarity = []
# begin_all = time.time()

# obj_list = []
# executor = ThreadPoolExecutor(max_workers=8)

# # 分批次 1000000000*18
# for i in tqdm(range(3000000000,4000000000,5000000),position=0, leave=True):
#     obj_list.append(itertools.islice(itertools.combinations(data_weibo_filtered.index.to_list(),2),i,i+5000000)) 

# for res in executor.map(Similarity, obj_list):
#     if (len(res)) > 0:
#         pair_similarity.extend(res)
    
# end_all = time.time()
# print(end_all-begin_all)

In [97]:
# 3.删除分词过滤后无词的数据
print(df.shape)
df.dropna(subset=['tokenization_filtered'], inplace=True)
# df.reset_index(inplace=True, drop=True)
print(df.shape)

(98560, 5)
(98560, 5)


In [98]:
# 4.删除分词+过滤后词数<5的数据(可选操作)
'''
item_word_below_5 = []

tqdm.pandas()
df['word_count'] = df['tokenization_filtered'].progress_apply(lambda x: len(x.split()))
df = df[df['word_count']>=5][['source','comments','time','tokenization','tokenization_filtered','word_count']]# ,'topics'
print(df.shape)
'''

100%|█████████████████████████████████| 98560/98560 [00:00<00:00, 447467.00it/s]

(71700, 6)





In [101]:
df[['source','comments','time','tokenization','tokenization_filtered']].to_csv('data_zhihu_addition_filtered.csv', index=False, encoding='utf-8-sig')