In [3]:
!pip install transformers torch



In [11]:
import os
import re
import torch
import pdfplumber
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [29]:
from nltk.corpus import stopwords
import nltk

In [30]:
# nltkのストップワードをダウンロード（初回のみ）
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sk062\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def extract_text_from_pdfs( folder_path ):
    """
    指定フォルダ内の複数PDFからテキストを抽出する。
    """
    pdf_texts = {}
    for file_name in os.listdir( folder_path ):
        if file_name.endswith( '.pdf' ):  # PDFファイルのみ対象
            with pdfplumber.open(os.path.join( folder_path, file_name) ) as pdf:
                text = ''.join([ page.extract_text () for page in pdf.pages if page.extract_text()])
                if text.strip():  # テキストが空でない場合のみ
                    pdf_texts[file_name] = text
    return pdf_texts

In [6]:
folder_path = r"C:\Users\sk062\OneDrive\デスクトップ\資料\Note info" 

In [8]:
pdf_texts = extract_text_from_pdfs(folder_path)

In [9]:
pdf_texts

{'A Soft Sensor Method with Uncertainty-Awareness and Self-Explanation Based on Large Language Models Enhanced by Domain Knowledge Retrieval.pdf': 'A Soft Sensor Method with Uncertainty-Awareness and Self-\nExplanation Based on Large Language Models Enhanced by\nDomain Knowledge Retrieval\nAuthor Information\nShuo Tong, Han Liu , Runyuan Guo, Wenqing Wang, Xueqiong Tian, Lingyun Wei, Lin Zhang, Huayong\nWu, Ding Liu, Youmin Zhang\ne-mail: liuhan@xaut.edu.cn\nAffiliations\nSchool of Automation and Information Engineering, Xi’an University of Technology, Xi’an, China.\nShuo Tong, Han Liu, Runyuan Guo, Wenqing Wang, Xueqiong Tian, Lingyun Wei, Lin Zhang, Huayong Wu,\nDing Liu\nDepartment of Mechanical, Industrial, and Aerospace Engineering and the Concordia Institute of\nAerospace Design and Innovation, Concordia University, Montreal, Canada.\nYoumin Zhang\nCorresponding author\nCorrespondence to: Han Liu\nAbstract\nData-driven soft sensors are crucial in predicting key performance indica

In [44]:
def clean_and_tokenize(text, custom_stop_words=None):
    """
    一文字ごとに分割されてしまった単語を復元し、ストップワードや不要な単語を削除する。
    """
    if not isinstance(text, str):  # 念のため型チェック
        raise TypeError("text must be a string")
    
    # アルファベットの単語を復元（単独の文字がスペースで分割されている場合に連結）
    text = re.sub(r'(?<!\w) ([a-zA-Z]) (?!\w)', r'\1', text)
    
    # 単語を抽出（英単語のみ対象、数字や記号を除外）
    words = re.findall(r'[a-zA-Z]+', text)

    # ストップワードを除外しつつ、一文字の単語も削除
    words = [word.lower() for word in words if word.lower() not in stop_words and len(word) > 1]

    # ユーザー指定の削除単語を適用
    if custom_stop_words:
        words = [word for word in words if word.lower() not in custom_stop_words]
    
    return words

In [45]:
def process_pdf_texts(pdf_texts):
    """
    各PDFのテキストをクリーンアップして単語リストに変換する。
    """
    cleaned_texts = {}
    
    for file_name, text in pdf_texts.items():
        words = clean_and_tokenize(text)  # クリーニング処理
        cleaned_texts[file_name] = words
    
    return cleaned_texts

In [46]:
def count_word_frequencies(cleaned_texts):
    """
    各PDFの単語リストから英単語の頻度をカウントする。
    """
    word_frequencies = {}
    
    for file_name, words in cleaned_texts.items():
        counter = Counter(words)
        word_frequencies[file_name] = counter
    
    return word_frequencies

In [49]:
def get_top_n_words(word_frequencies, n=20):
    """
    各PDFの英単語の頻度から最頻ワード上位N個を取得する。
    """
    top_words = {}
    
    for file_name, counter in word_frequencies.items():
        top_n_words = counter.most_common(n)
        top_words[file_name] = top_n_words
    
    return top_words

In [52]:
cleaned_texts = process_pdf_texts(pdf_texts)  # 2. クリーニング処理

In [56]:
custom_words = {"dr", "ci", "tn","tp","fp","et","al","fn","et","apt","ee","ij","na","et","ee","er","te","rn","ie","en","oe","se","et","td","ro","ne","rq"}  # 削除したい単語リスト
cleaned_texts = process_pdf_texts(pdf_texts) 

cleaned_words = {}
for file_name, words in cleaned_texts.items():
    cleaned_words[file_name] = clean_and_tokenize(' '.join(words), custom_stop_words=custom_words)

In [57]:
word_frequencies = count_word_frequencies(cleaned_words)  # 3. 単語頻度カウント
top_words = get_top_n_words(word_frequencies)  # 4. 最頻ワード取得

In [68]:
# cleaned_word
# word_frequencies
# top_words

In [71]:
df = pd.DataFrame()

notion_df = pd.read_csv(r"C:\Users\sk062\OneDrive\デスクトップ\9c2e4d3d-864b-490e-b35a-b141df14fc43_Export-00b74899-a19a-4d27-9b59-9b23b5916742\Note paper (ai) 174225966a7b8067b3ecd5385709bbb4.csv")

df = notion_df

In [74]:
def top_words_to_df_with_columns(top_words, n=20):
    """
    最頻ワードをDataFrameの列に変換する。
    """
    # 最初に空の辞書を用意
    all_top_words = {}
    
    # ファイルごとに最頻ワードを列として追加
    for file_name, top_n_words in top_words.items():
        # ファイルごとの最頻ワードリスト
        top_words_dict = {'file_name': file_name}
        
        # 最頻ワード1から最頻ワードnまでを辞書に追加
        for rank, (word, frequency) in enumerate(top_n_words, start=1):
            top_words_dict[f'most_recent_word{rank}'] = word
            top_words_dict[f'frequency{rank}'] = frequency
        
        # 辞書をall_top_wordsに追加
        all_top_words[file_name] = top_words_dict
    
    # all_top_wordsからDataFrameを作成
    df_top_words = pd.DataFrame.from_dict(all_top_words, orient='index')
    
    # 結果表示
    return df_top_words

# 最頻ワードをDataFrameに変換
df_top_words = top_words_to_df_with_columns(top_words)

# 結果表示
df_top_words


Unnamed: 0,file_name,most_recent_word1,frequency1,most_recent_word2,frequency2,most_recent_word3,frequency3,most_recent_word4,frequency4,most_recent_word5,...,most_recent_word16,frequency16,most_recent_word17,frequency17,most_recent_word18,frequency18,most_recent_word19,frequency19,most_recent_word20,frequency20
A Soft Sensor Method with Uncertainty-Awareness and Self-Explanation Based on Large Language Models Enhanced by Domain Knowledge Retrieval.pdf,A Soft Sensor Method with Uncertainty-Awarenes...,llm,178,soft,98,ufss,91,data,86,fig,...,variables,50,confidence,46,methods,45,prediction,45,pt,42
AI-Driven Diabetic Retinopathy Screening Multicentric Validation of AIDRSS in India.pdf,AI-Driven Diabetic Retinopathy Screening Multi...,aidrss,32,screening,20,specificity,20,diabetic,19,retinopathy,...,fundus,11,using,11,feature,11,fig,10,india,9
Constraints as Rewards Reinforcement Learning for Robots without Reward Functions.pdf,Constraints as Rewards Reinforcement Learning ...,cid,57,constraint,53,robot,44,learning,39,task,...,design,20,proposed,20,method,19,qrsac,19,trained,17
CONTINUUM Detecting APT Attacks through Spatial-Temporal Graph Neural Networks.pdf,CONTINUUM Detecting APT Attacks through Spatia...,graph,65,data,44,model,38,detection,33,node,...,spatial,19,using,19,benign,19,ids,18,federated,18
DiReCT Diagnostic Reasoning for Clinical Notes via Large Language Models.pdf,DiReCT Diagnostic Reasoning for Clinical Notes...,gpt,41,response,39,observation,38,disease,37,note,...,stroke,17,figure,16,diagnostic,15,however,15,respectively,15
"DPO Kernels A Semantically-Aware, Kernel-Enhanced, and Divergence-Rich Paradigm for Direct Preference Optimization.pdf","DPO Kernels A Semantically-Aware, Kernel-Enha...",cid,812,kernel,255,log,175,kernels,154,rbf,...,yy,68,based,67,model,61,data,60,local,60
DynaGRAG Exploring the Topology of Information.pdf,DynaGRAG Exploring the Topology of Information...,dynagrag,21,graph,20,cid,14,llms,13,inarxiv,...,diversity,7,intelligence,7,language,6,like,6,flash,6
Exploring Gradient Subspaces Addressing and Overcoming LoRA's Limitations in Federated Fine-Tuning of Large Language Models.pdf,Exploring Gradient Subspaces Addressing and Ov...,cid,276,lora,88,agg,82,clients,56,fedftg,...,yes,23,across,22,dolly,22,medquad,22,language,20
Hyperbolic Contrastive Learning for Hierarchical 3D Point Cloud Embedding.pdf,Hyperbolic Contrastive Learning for Hierarchic...,point,112,embeddings,102,cloud,76,text,65,learning,...,pages,25,cid,23,training,22,loss,22,clip,22
INFELM In-depth Fairness Evaluation of Large Text-To-Image Models.pdf,INFELM In-depth Fairness Evaluation of Large T...,cid,16,image,15,models,12,figure,12,skintone,...,monk,7,table,7,classification,7,stablediffusionv,7,large,6


In [82]:
df_top_words_new = df_top_words.reset_index(drop=True)
df_top_words_new = df_top_words_new.drop(["level_0","index"],axis=1)
df_top_words_new

Unnamed: 0,file_name,most_recent_word1,frequency1,most_recent_word2,frequency2,most_recent_word3,frequency3,most_recent_word4,frequency4,most_recent_word5,...,most_recent_word16,frequency16,most_recent_word17,frequency17,most_recent_word18,frequency18,most_recent_word19,frequency19,most_recent_word20,frequency20
0,A Soft Sensor Method with Uncertainty-Awarenes...,llm,178,soft,98,ufss,91,data,86,fig,...,variables,50,confidence,46,methods,45,prediction,45,pt,42
1,AI-Driven Diabetic Retinopathy Screening Multi...,aidrss,32,screening,20,specificity,20,diabetic,19,retinopathy,...,fundus,11,using,11,feature,11,fig,10,india,9
2,Constraints as Rewards Reinforcement Learning ...,cid,57,constraint,53,robot,44,learning,39,task,...,design,20,proposed,20,method,19,qrsac,19,trained,17
3,CONTINUUM Detecting APT Attacks through Spatia...,graph,65,data,44,model,38,detection,33,node,...,spatial,19,using,19,benign,19,ids,18,federated,18
4,DiReCT Diagnostic Reasoning for Clinical Notes...,gpt,41,response,39,observation,38,disease,37,note,...,stroke,17,figure,16,diagnostic,15,however,15,respectively,15
5,"DPO Kernels A Semantically-Aware, Kernel-Enha...",cid,812,kernel,255,log,175,kernels,154,rbf,...,yy,68,based,67,model,61,data,60,local,60
6,DynaGRAG Exploring the Topology of Information...,dynagrag,21,graph,20,cid,14,llms,13,inarxiv,...,diversity,7,intelligence,7,language,6,like,6,flash,6
7,Exploring Gradient Subspaces Addressing and Ov...,cid,276,lora,88,agg,82,clients,56,fedftg,...,yes,23,across,22,dolly,22,medquad,22,language,20
8,Hyperbolic Contrastive Learning for Hierarchic...,point,112,embeddings,102,cloud,76,text,65,learning,...,pages,25,cid,23,training,22,loss,22,clip,22
9,INFELM In-depth Fairness Evaluation of Large T...,cid,16,image,15,models,12,figure,12,skintone,...,monk,7,table,7,classification,7,stablediffusionv,7,large,6


In [83]:
# ここから要約を行う。要約列を追加する。

In [84]:
# または raw文字列を使う方法
desk_top_path = r"C:\Users\sk062\OneDrive\デスクトップ\output.csv"

# データフレームをCSVとして保存
df_top_words_new.to_csv(desk_top_path, index=False)