In [43]:
import numpy as np
import pandas as pd
import datetime
import os
import re
from parsel import Selector

### 一、数据清洗

In [66]:
# Media outlets whose articles are kept; file names and CSV titles are matched
# against these names as prefixes.
medias = [
    '北方网', '光明日报', '今晚报', '北京日报', '津云',
    '经济日报', '科技日报', '科学网', '人民日报', '天津日报',
    '新华网', '中国教育报', '中国科学报', '中国青年报', '中国新闻网',
]
# Folder holding the downloaded HTML files, and a snapshot of its contents.
path = "nk_media"
files = os.listdir(path)

In [67]:
# Delete every file whose name does not start with one of the media names above.
media_prefixes = tuple(medias)
for file in files:
    if file.startswith(media_prefixes):
        continue  # belongs to a listed outlet, keep it
    os.remove(os.path.join(path, file))

In [69]:
# Process the CSV file: load nk_media.csv into a DataFrame (default integer index).
media_csv = pd.read_csv("nk_media.csv")

In [71]:
# Row positions of every entry whose title does not begin with a listed media name.
media_prefixes = tuple(medias)
index = [
    row_pos
    for row_pos, row_title in enumerate(media_csv["title"])
    if not row_title.startswith(media_prefixes)
]

In [None]:
# Remove the rows collected in `index` from media_csv, in place.
# `drop` is label-based; NOTE(review): this relies on the labels still matching
# the positions gathered earlier (true for a freshly read frame) - confirm.
media_csv.drop(index=index[::-1], inplace=True)

In [107]:
# Two more consistency passes so that what remains is the intersection of the
# CSV rows and the files in the HTML folder.
# Pass 1: delete every file whose name (text before ".html") is not a CSV title.
files = os.listdir(path)
# Build the lookup once instead of materialising the title column per file.
known_titles = set(media_csv["title"])
for file in files:
    true_name = file.split(".html")[0]
    if true_name not in known_titles:
        os.remove(os.path.join(path, file))

In [112]:
# Pass 2: drop every CSV row whose title has no matching file in the HTML folder.
files = os.listdir(path)
# File names with everything from ".html" onwards stripped; built once, not per row.
html_titles = {file.split(".html")[0] for file in files}
# Select by index *label*, not by position: rows were already dropped from
# media_csv without resetting the index, so positions and labels no longer
# coincide and DataFrame.drop() works on labels.
orphan_rows = ~media_csv["title"].isin(html_titles)
index = media_csv.index[orphan_rows].tolist()
media_csv.drop(index=index, inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the drops above. drop=True discards the old
# labels directly instead of inserting them as an 'index' column and popping it
# (which also dumped the popped Series as cell output).
media_csv.reset_index(drop=True, inplace=True)

In [131]:
# Write the filtered table back to nk_media.csv. This overwrites the file that
# was read at the start of the cleaning section.
media_csv.to_csv("nk_media.csv")

### 二、数据预处理

In [37]:
# Merge the two CSVs into one table; the two HTML folders are then merged by
# hand into a single folder named "htmls".
news_csv, media_csv = (
    pd.read_csv(csv_name, encoding='utf-8', index_col=0)
    for csv_name in ("nk_news.csv", "nk_media.csv")
)
data = pd.concat([news_csv, media_csv])  # row-wise concatenation (axis=0)
data.to_csv("data.csv")

In [38]:
# Add a description to every HTML document in the local folder; it is used
# later for tokenisation when the index is built.
def add_description(path="htmls"):
    """Copy each HTML file's meta description into ``data['description']``.

    For every file in ``path`` the text of ``<title>`` is read, ``/`` is
    replaced by ``_``, and the result is used as the row label in the
    module-level ``data`` frame. The ``content`` attribute of
    ``<meta name="description">`` is stored in that row's ``description``
    column, with CR, LF, TAB and full-width spaces removed; ``None`` is stored
    when the tag is absent.

    Files without a ``<title>`` element are skipped with a printed notice
    instead of aborting the whole run.

    Args:
        path: Folder containing the HTML files (read as UTF-8).
    """
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        # Read first, parse afterwards, so the handle is closed promptly.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        selector = Selector(content)
        title = selector.css('title::text').get()
        if title is None:
            # Without a title there is no row label to write to.
            print(f"add_description: skipped {file_name!r} (no <title> element)")
            continue
        _title = title.replace('/', '_')
        # Content of the <meta name="description"> tag.
        description = selector.css('meta[name="description"]::attr(content)').get()
        if description is not None:  # strip whitespace characters
            for blank in ('\r', '\n', '\t', '　'):
                description = description.replace(blank, '')
        # NOTE(review): if _title is not an existing label of `data`, this .loc
        # assignment appends a new row rather than failing - confirm intended.
        data.loc[_title, 'description'] = description

In [39]:
# Run over the default folder ("htmls"); fills the description column of `data`.
add_description()

In [40]:
# Save the current state of `data` to data_with_description.csv.
data.to_csv("data_with_description.csv")

In [52]:
# Empty result table, one row per document, with the index axis named 'url'.
info_columns = ['title', 'description', 'date_timestamp', 'content', 'editor']
allInfo = pd.DataFrame(columns=info_columns).rename_axis('url')

In [61]:
# Collect all information of each HTML document: description, publication
# date, editor, body text, etc.
def getALlInfo(path="htmls"):
    """Fill the module-level ``allInfo`` frame with one row per HTML file.

    For every file in ``path``:

    * the ``<title>`` text (with ``/`` replaced by ``_``) is used as the label
      to look up the document's ``url`` in the module-level ``data`` frame;
    * the ``<meta name="description">`` content is cleaned of CR, LF, TAB and
      full-width spaces;
    * the text of all ``<p>`` elements is split into body (all but the last
      paragraph, joined) and editor (the last paragraph);
    * a ``YYYY/MM/DD/`` segment in the URL is converted to a POSIX timestamp.
      The parsed datetime is naive, so ``.timestamp()`` interprets it in the
      machine's local timezone. ``None`` is stored when no such segment exists
      or it is not a real calendar date.

    The row ``[title, description, date_timestamp, content, editor]`` is
    written to ``allInfo.loc[url]``. Files without a ``<title>`` element are
    skipped with a printed notice instead of aborting the whole run.

    Args:
        path: Folder containing the HTML files (read as UTF-8).
    """
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        # Read first, parse afterwards, so the handle is closed promptly.
        with open(file_path, 'r', encoding='utf-8') as file:
            html = file.read()
        selector = Selector(html)
        title = selector.css('title::text').get()
        if title is None:
            # Without a title the URL cannot be looked up in `data`.
            print(f"getALlInfo: skipped {file_name!r} (no <title> element)")
            continue
        _title = title.replace('/', '_')
        url = data.loc[_title, 'url']
        # Content of the <meta name="description"> tag.
        description = selector.css('meta[name="description"]::attr(content)').get()
        if description is not None:  # strip whitespace characters
            for blank in ('\r', '\n', '\t', '　'):
                description = description.replace(blank, '')
        _content: list = selector.css('p::text').getall()
        # The last paragraph is kept separately as the editor column.
        content = "".join(_content[:-1])
        if _content != []:  # strip whitespace characters
            # NOTE(review): the first argument of replace(' ', ' ') (and of the
            # replace(' ', '') on the editor line) may be a non-breaking or other
            # look-alike space in the original file - confirm it survived editing.
            content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', ' ').replace('　', '')
            editor = _content[-1].replace('\n', '').replace(' ', '')
        else:
            editor = None
        # Look for a substring of the form YYYY/MM/DD/ in the URL.
        regex = re.search(r'(20)\d{2}/(0?[1-9]|1[012])/(0?[1-9]|[12][0-9]|3[01])/', url, re.S)
        date_timestamp = None
        if regex:
            try:
                # Stored as a timestamp.
                date_timestamp = datetime.datetime.strptime(regex.group(), '%Y/%m/%d/').timestamp()
            except ValueError:
                # The pattern also admits impossible dates such as 02/31.
                date_timestamp = None
        allInfo.loc[url] = [title, description, date_timestamp, content, editor]

In [56]:
# Run over the default folder ("htmls"); fills `allInfo` with one row per document.
getALlInfo()

In [58]:
# Write the collected per-document table to allInfo.csv.
allInfo.to_csv("allInfo.csv")