In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations

In [2]:
def stopwords_list(file_path):
    """Load stop words from a UTF-8 encoded text file, one word per line.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped. Whitespace around each word is stripped and blank lines are
    ignored. Returns the words as a list, in file order.
    """
    words = []
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        for line in handle:
            cleaned = line.strip()
            if cleaned:
                words.append(cleaned)
    return words

In [3]:
def get_files(folder_path):
    """Recursively collect the Excel workbooks (``.xlsx``) under ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Root directory to search; every sub-directory is visited as well.

    Returns
    -------
    list of str
        Full paths of the matching files, sorted so the result does not depend
        on the order in which the operating system lists directory entries.
        Empty when nothing matches.

    Notes
    -----
    * The extension is compared case-insensitively, so ``.XLSX`` also matches.
    * Names starting with ``~$`` are skipped: Excel creates such owner/lock
      files next to a workbook that is currently open, and they cannot be
      parsed as workbooks.
    """
    file_paths = []

    # Walk the folder and all of its sub-folders.
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if file.startswith('~$'):
                continue
            if file.lower().endswith('.xlsx'):
                file_paths.append(os.path.join(root, file))
    return sorted(file_paths)

In [13]:
def _top_tfidf_keywords(tfidf_matrix, feature_names, num_keywords):
    """Return, for each document row, up to ``num_keywords`` terms ranked by TF-IDF score."""
    limit = max(num_keywords, 0)
    keywords = []
    for i in range(tfidf_matrix.shape[0]):
        scores = tfidf_matrix[i, :].toarray().ravel()
        # Reverse first, then slice: a ``[-limit:]`` slice would return the
        # whole vocabulary when ``limit`` is 0.
        ranked = scores.argsort()[::-1][:limit]
        # A score of 0 means the term is absent from this document, so it must
        # not be reported as one of its keywords (this matters for documents
        # with fewer than ``limit`` distinct terms).
        keywords.append([feature_names[j] for j in ranked if scores[j] > 0])
    return keywords


def _cooccurrence_counts(texts, keywords):
    """Build a symmetric matrix counting the rows of ``texts`` that contain both keywords of each pair."""
    # TfidfVectorizer lower-cases its vocabulary by default, so the keywords
    # are lower-case; compare them against lower-cased text.
    lowered = texts.str.lower()
    # One literal substring scan per keyword instead of two scans per pair.
    masks = [
        lowered.str.contains(keyword, regex=False, na=False).to_numpy(dtype=bool)
        for keyword in keywords
    ]
    # Float matrix, as produced by np.zeros, so the CSV keeps its "3.0" number format.
    matrix = np.zeros((len(keywords), len(keywords)))
    for (i, mask_i), (j, mask_j) in combinations(enumerate(masks), 2):
        # Co-occurrence is unordered: fill both triangles; the diagonal stays 0.
        matrix[i, j] = matrix[j, i] = np.count_nonzero(mask_i & mask_j)
    return matrix


def process_file(folder_path,stopwords,num_keywords=20):
    '''
    Build a keyword co-occurrence matrix from the .xlsx files under ``folder_path``.

    Each workbook found by ``get_files`` that has both a '分詞' column and an
    'X' column is treated as one document: rows with an empty 'X' are dropped,
    duplicate '分詞' values are dropped, and the remaining '分詞' strings are
    joined with spaces. TF-IDF is computed over these documents and the
    ``num_keywords`` best-scoring terms of every document are pooled into one
    keyword list. For every pair of keywords the number of '分詞' rows (over
    all files) containing both as substrings is counted.

    The matrix is written to '關鍵詞共現矩陣.csv' inside ``folder_path``
    (UTF-8, keywords as both row and column labels) and also returned.

    Parameters
    ----------
    folder_path : str
        Folder searched recursively for workbooks; also the output location.
    stopwords : list of str
        Stop words passed to ``TfidfVectorizer``.
    num_keywords : int, default 20
        How many of the highest TF-IDF scoring terms to take from each
        document. Terms with a score of 0 are never taken, so a document may
        contribute fewer.

    Returns
    -------
    pandas.DataFrame
        Symmetric co-occurrence counts with a zero diagonal; rows and columns
        are the keywords in sorted order.

    Raises
    ------
    ValueError
        If no workbook with the required columns is found (or, from
        ``TfidfVectorizer``, if the documents contain no usable terms).
    '''
    required_columns = ('分詞', 'X')
    documents = []
    segments = []
    for file_path in get_files(folder_path):
        print(file_path)
        df = pd.read_excel(file_path)

        missing = [column for column in required_columns if column not in df.columns]
        if missing:
            print(f'  skipped, missing column(s): {missing}')
            continue

        # Only rows whose coordinate part ('X') is filled in are gazetteer
        # data; the other rows are non-gazetteer data and are left out.
        tokens = (
            df.loc[df['X'].notna(), '分詞']
            .dropna()
            .astype(str)
            .drop_duplicates()
        )
        documents.append(' '.join(tokens))
        segments.append(tokens)

    if not documents:
        raise ValueError(
            f"no .xlsx file with the columns {list(required_columns)} found under {folder_path!r}"
        )

    # Concatenate once, after the loop, rather than growing a frame per file.
    all_texts = pd.concat(segments, ignore_index=True)

    # Score every term of every document with TF-IDF.
    vectorizer = TfidfVectorizer(stop_words=stopwords)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    keywords = _top_tfidf_keywords(tfidf_matrix, feature_names, num_keywords)

    # Sorted so the row/column order of the output is the same on every run
    # (plain set iteration order of strings changes between interpreter runs).
    unique_keywords = sorted({keyword for sublist in keywords for keyword in sublist})

    cooccurrence_matrix = _cooccurrence_counts(all_texts, unique_keywords)

    cooccurrence_df = pd.DataFrame(cooccurrence_matrix, index=unique_keywords, columns=unique_keywords)
    output_path = os.path.join(folder_path, '關鍵詞共現矩陣.csv')
    cooccurrence_df.to_csv(output_path, encoding="utf-8", index=True)
    return cooccurrence_df

In [None]:
# Folder holding the segmented .xlsx corpus; the co-occurrence CSV is written into it too.
folder_path = r'E:\坚果云同步文件\论文内容\博士階段論文\畢業論文\論文數據\語料分析數據\已篩選-方言\春季'

# Stop words are read from a text file with one word per line.
stopwords = stopwords_list(r'F:\古籍處理數據\input\繪圖\停用詞.txt')

process_file(folder_path, stopwords)