In [8]:
import pandas as pd
import numpy as np
import os
import json

In [25]:
data = pd.read_csv("cutted.csv", encoding='utf-8-sig',index_col='url')

In [13]:
# 创建目录，存储倒排索引文件
os.mkdir("./jsons")

In [14]:
# 读取停用词表
stopWords = []
with open("stopwords.txt",'r',encoding='utf-8') as f:
    words = f.read().splitlines()
stopWords.extend(words)

In [61]:
# 为每个HTML文档计算词频，用于后续构建倒排索引
def calculateTFInHTML(data=data,title_only=False):
    # 要返回的“正向”索引词典，映射关系为：url->{}，而作为value的词典是一个词项到词频的映射
    index = {}
    # 遍历data的每一行，遍历标题，内容等信息，计算词频
    for url,info in data.iterrows():
        index[url] = {}
        for word in info.title.split(" "):
            # 不在停用词列表中的才计数
            if word not in stopWords:
                if word not in index[url]:
                    index[url][word] = 1
                else:
                    index[url][word] += 1
        if title_only == False:
            for word in str(info.description).split(" "):
                if word not in stopWords:
                    if word not in index[url]:
                        index[url][word] = 1
                    else:
                        index[url][word] += 1
            for word in str(info.content).split(" "):
                if word not in stopWords:
                    if word not in index[url]:
                        index[url][word] = 1
                    else:
                        index[url][word] += 1
            for word in str(info.editor).split(" "):
                if word not in stopWords:
                    if word not in index[url]:
                        index[url][word] = 1
                    else:
                        index[url][word] += 1
    return index

In [62]:
index = calculateTFInHTML()

In [64]:
index_only_title = calculateTFInHTML(data,True)

In [97]:
# 倒排索引构建：
def gen_inverted_index(index):
    inverted_index = {}
    # 遍历原始索引中的每个URL和对应的词汇及频率信息
    for url, words in index.items():
        # 遍历每个URL对应的词汇及其频率信息
        for word, frequency in words.items():
            # 如果当前词汇不在倒排索引中，就创建一个空字典作为该词的索引项
            if word not in inverted_index:
                inverted_index[word] = {}
                inverted_index[word][url] = frequency
            else:
                inverted_index[word][url] = frequency
    return inverted_index

In [98]:
inverted_index = gen_inverted_index(index)

In [101]:
inverted_index_only_title = gen_inverted_index(index_only_title)

In [76]:
# 计算TF：
def get_TF(index):
    tf = {}
    for url, words in index.items():
        temp_dict = {}
        for word, frequency in words.items():
            if word not in temp_dict:
                temp_dict[word] = 1
            else:
                temp_dict[word] += 1
        tf[url] = temp_dict
    return tf

In [80]:
# 计算IDF：
def get_IDF(index):
    idf = {}
    for url, frequency_dict in index.items():
        for word, frequency in frequency_dict.items():
            idf[word] = np.log(len(index) / frequency)
    return idf

In [85]:
IDF = get_IDF(index)

In [84]:
IDF_only_title = get_IDF(index_only_title)

In [88]:
# 计算tf-idf值：
def get_TF_IDF(index,IDF):
    tf_idf = {}
    for url, words in index.items():
        dict = {}
        for word, frequency in words.items():
            dict[word] = frequency * IDF[word]
        tf_idf[url] = dict
    return tf_idf

In [90]:
TF_IDF = get_TF_IDF(index,IDF)

In [91]:
TF_IDF_only_title = get_TF_IDF(index_only_title,IDF_only_title)

In [92]:
# 保存各个文档的TF-IDF值
with open('./jsons/tf-idf.json', 'w', encoding='utf-8') as f:
    json.dump(TF_IDF, f, ensure_ascii=False)
with open('./jsons/tf-idf_title.json', 'w', encoding='utf-8') as f:
    json.dump(TF_IDF_only_title, f, ensure_ascii=False)

In [102]:
# 保存倒排索引为json格式，便于前端使用
with open('./jsons/invert_index.json', 'w', encoding='utf-8') as f:
    json.dump(inverted_index, f, ensure_ascii=False)
with open('./jsons/invert_index_title.json', 'w', encoding='utf-8') as f:
    json.dump(inverted_index_only_title, f, ensure_ascii=False)

In [107]:
# 下面计算一下HTML库中所有非停用词的词频，方便推荐系统实现以及词云功能实现：
def getAllTF(index):
    word_frequency = {}
    for url, words in index.items():
        for word, frequency in words.items():
            if word not in word_frequency:
                word_frequency[word] = 1
            else:
                word_frequency[word] += 1
    return word_frequency

In [108]:
with open('./jsons/allTF.json', 'w', encoding='utf-8') as f:
    json.dump(getAllTF(index), f, ensure_ascii=False)

下面是一些结果展示：

In [3]:
with open('jsons/invert_index.json', 'r', encoding='utf-8') as file:
    data_dict = json.load(file)
len(data_dict['计算机'])

288

In [9]:
with open('jsons/tf-idf.json', 'r', encoding='utf-8') as file:
    tfidf_dict = json.load(file)
tfidf_dict['http://news.nankai.edu.cn/ywsd/system/2021/10/17/030048360.shtml']

{'亿': 8.638348312972704,
 '校友': 32.79453385646435,
 '献礼': 8.638348312972704,
 '南开': 116.91539160050938,
 '化学': 181.4053145724268,
 '学科': 127.12321811860414,
 '百年': 21.086731201615812,
 '开大': 72.52053951852814,
 '大学': 72.14797448422156,
 '南开大学': 72.52053951852814,
 '记者': 15.890402264825518,
 '蓝芳': 25.915044938918115,
 '摄影': 15.890402264825518,
 '宗': 15.890402264825518,
 '琪琪': 15.890402264825518,
 '在': 52.47125417034295,
 '建校': 15.890402264825518,
 '周年': 33.46219081958696,
 '创建': 23.835603397238277,
 '之际': 17.27669662594541,
 '为': 30.15894409721838,
 '助力': 17.27669662594541,
 '更好': 17.27669662594541,
 '更': 25.915044938918115,
 '快': 17.27669662594541,
 '发展': 63.56160905930207,
 '学系': 17.27669662594541,
 '化学系': 17.27669662594541,
 '赵国': 15.890402264825518,
 '锋': 15.890402264825518,
 '阚颖': 15.890402264825518,
 '郑保富': 17.27669662594541,
 '共同': 25.915044938918115,
 '支持': 30.15894409721838,
 '亿元': 17.27669662594541,
 '发起': 17.27669662594541,
 '设立': 17.27669662594541,
 '基金': 25.915044938918115,