In [None]:
!pip install langchain langchain-community pypdf langchain-ollama faiss-cpu langchain-huggingface

In [None]:
!ollama run llama3
# または
!ollama run mistral

In [None]:
# Updated extraction notebook with Data Types and Methods extraction
import os
import glob
import csv
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# モデル設定
MODEL_NAME = "llama3"
llm = OllamaLLM(model=MODEL_NAME)

# ディレクトリ設定
PDF_DIR = "../data/raw"
OUTPUT_CAUSAL = "../data/processed/causal_extraction_results.csv"
OUTPUT_META   = "../data/processed/paper_metadata.csv"
os.makedirs(os.path.dirname(OUTPUT_CAUSAL), exist_ok=True)
os.makedirs(os.path.dirname(OUTPUT_META), exist_ok=True)

# 埋め込みモデル
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Prompts
CAUSAL_PROMPT = """\
You are a structured information extraction engine.  
From the given document, extract **only** causal relationships—no greetings, no commentary, nothing else.  
Output **exactly** one line per relation in CSV format with **five** columns in this order:

Cause,Effect,Description,Effect_Polarity,Edge_Label

Where:
- **Cause** and **Effect** are the variable names.
- **Description** is the full text explanation.
- **Effect_Polarity** is one of: Positive, Negative, Both, Unknown (choose based on whether the causal effect is positive, negative, both directions, or unclear).
- **Edge_Label** is a very short summary of the relation, lowercase, words joined with underscores, no spaces (e.g., “accelerates_mosquito_breeding”).

Example line:
- High temperature,Increased malaria transmission,Temperature rise accelerates mosquito breeding.,Positive,accelerates_mosquito_breeding
- High temperature,Income inequality,High temperature stimulates income inequality.,Positive,stimulates
- High temperature,Income inequality,High temperature is relevant to income inequality.,Unknown,is_relevant_to

**Do not** output any additional text or columns."""

DATA_PROMPT = """\
You are an information extraction engine.  
List **only** the types of data used in this paper—no greetings, no commentary, nothing else.  
Output **exactly** a single comma-separated line of data types, e.g.:

satellite data, patient-level data, survey data

**Do not** output any additional text."""


METHODS_PROMPT = """\
You are an information extraction engine.  
List **only** the research methods used in this paper—no greetings, no commentary, nothing else.  
Output **exactly** a single comma-separated line of methods, e.g.:

regression analysis, time-series analysis, differential equation models

**Do not** output any additional text."""


# CSVファイルを開いてヘッダを書き込む
with open(OUTPUT_CAUSAL, 'w', newline='', encoding='utf-8-sig') as causal_f, \
     open(OUTPUT_META, 'w', newline='', encoding='utf-8-sig') as meta_f:

    causal_writer = csv.writer(causal_f)
    causal_writer.writerow([
        "paper_id","cause","effect","description","effect_polarity","edge_label"
    ])
    meta_writer = csv.writer(meta_f)
    meta_writer.writerow(["paper_id","data_types","methods"])

    # 各PDFを処理
    for pdf_path in glob.glob(os.path.join(PDF_DIR, "*.pdf")):
        paper_id = os.path.splitext(os.path.basename(pdf_path))[0]
        print(f"Processing: {paper_id}")

        # ドキュメントロードと分割
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(docs)

        # Retriever + QAチェーン
        vectorstore = FAISS.from_documents(split_docs, embedding_model)
        retriever = vectorstore.as_retriever()
        qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

        # 因果関係抽出
        resp = qa.invoke({"query": CAUSAL_PROMPT})
        text = resp["result"] if isinstance(resp, dict) else resp
        for row in csv.reader(text.splitlines()):
            if len(row) == 5:
                causal_writer.writerow([paper_id] + [cell.strip() for cell in row])
                causal_f.flush()

        # データ種別抽出
        data_resp = qa.invoke({"query": DATA_PROMPT})
        data_line = (data_resp["result"] if isinstance(data_resp, dict) else data_resp).strip()

        # 手法抽出
        meth_resp = qa.invoke({"query": METHODS_PROMPT})
        meth_line = (meth_resp["result"] if isinstance(meth_resp, dict) else meth_resp).strip()

        # メタ情報を書き込む
        meta_writer.writerow([paper_id, data_line, meth_line])
        meta_f.flush()

print("Incremental CSV writing completed.")


  from .autonotebook import tqdm as notebook_tqdm


Processing: 10.21149_11984
Processing: 10.47205_jdss.2021(2-iv)74
Processing: https___doi.org_10.1007_978-3-642-17776-7_2
Processing: https___doi.org_10.1007_s00148-022-00924-y
Processing: https___doi.org_10.1007_s00181-023-02460-4
Processing: https___doi.org_10.1007_s10113-014-0688-7
Processing: https___doi.org_10.1007_s10584-020-02692-8
Processing: https___doi.org_10.1007_s10584-021-03226-6
Processing: https___doi.org_10.1007_s10680-012-9277-y
Processing: https___doi.org_10.1007_s11111-013-0201-0
Processing: https___doi.org_10.1007_s11912-016-0539-4
Processing: https___doi.org_10.1007_s12571-019-00968-1
Processing: https___doi.org_10.1007_s40641-018-0089-y
Processing: https___doi.org_10.1007_s42452-020-03705-y
Processing: https___doi.org_10.1016_j.annals.2012.05.023
Processing: https___doi.org_10.1016_j.apr.2017.06.004
Processing: https___doi.org_10.1016_j.earscirev.2009.05.001
Processing: https___doi.org_10.1016_j.ecolecon.2016.08.008
Processing: https___doi.org_10.1016_j.techfore.2

Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.


Processing: https___doi.org_10.3386_w20352


Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label for 68.
Could not reliably determine page label for 66.
Could not reliably determine page label for 67.
Could not reliably determine page label 

Processing: https___doi.org_10.3386_w20750
Processing: https___doi.org_10.3386_w26167
Processing: https___doi.org_10.3389_fenrg.2021.739721
Processing: https___doi.org_10.3389_fenvs.2022.879681
Processing: https___doi.org_10.3389_fhumd.2023.1121662
Processing: https___doi.org_10.3389_fnsys.2015.00151
Processing: https___doi.org_10.4310_cms.2010.v8.n1.a10


Multiple definitions in dictionary at byte 0x4fc41c for key /Creator


Processing: https___doi.org_10.5194_acp-12-2117-2012
Processing: https___doi.org_10.5194_gmd-14-5269-2021
Processing: https___doi.org_10.5194_hess-18-2735-2014
Processing: https___doi.org_10.5194_hess-19-3667-2015
Processing: https___doi.org_10.5194_wcd-1-261-2020
Processing: https___doi.org_10.5751_es-06528-190268
Processing: https___doi.org_10.5751_es-11103-240334
Incremental CSV writing completed.


## 手動での確認、調整

In [10]:
import pandas as pd
tw01 = pd.read_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/causal_extraction_results_01.csv", sep=",")
tw02 = pd.read_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/causal_extraction_results_02.csv", sep=",")
pd.concat([tw01,tw02]).reset_index(drop=True).to_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/processed/causal_extraction_results.csv", sep=",", encoding='utf-8-sig')

In [26]:
tw01 = pd.read_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/paper_metadata_01.csv", sep=",")
tw02 = pd.read_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/paper_metadata_02.csv", sep=",")
pd.concat([tw01,tw02]).reset_index(drop=True).to_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/paper_metadata.csv", sep=",", encoding='utf-8-sig')

In [27]:
tw = pd.read_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/_backup/paper_metadata.csv", sep=",", encoding='utf-8-sig')
tw[tw['paper_id'].duplicated()]

Unnamed: 0.1,Unnamed: 0,paper_id,data_types,methods
197,197,10.21149_11984,"patient-level data, health data, disease data,...",systematic review
198,198,10.47205_jdss.2021(2-iv)74,"official documents, journal articles, reports,...","multi-stage analysis approach, document analys..."


In [28]:
tw01 = tw[tw['paper_id'].isin(['10.21149_11984','10.47205_jdss.2021(2-iv)74'])]
tw02 = tw[~(tw['paper_id'].isin(['10.21149_11984','10.47205_jdss.2021(2-iv)74']))]

In [29]:
tw03 = pd.concat([tw01.head(n=2),tw02]).reset_index(drop=True)
del tw03['Unnamed: 0']
tw03

Unnamed: 0,paper_id,data_types,methods
0,10.21149_11984,"patient-level data, clinical diagnosis data, l...","systematic review, literature search"
1,10.47205_jdss.2021(2-iv)74,"documents, reports, accord, treaties, formulas...","multi-stage analysis approach, document analys..."
2,10.1007_s00431-023-04988-0,"article search results, data extraction, manus...","keyword search, PubMed, Embase, Google Scholar..."
3,10.1007_s10461-020-02962-7,"sociodemographic data, HIV screening and testi...","univariable analysis, multivariable logistic r..."
4,10.1007_s10461-020-03068-w,"survey data, demographic variables, psychologi...","Descriptive statistics, bivariate analyses, Pe..."
...,...,...,...
254,https___doi.org_10.5194_hess-18-2735-2014,"time series, documentary sources, historical f...","information-theoretical measures, empirical mo..."
255,https___doi.org_10.5194_hess-19-3667-2015,"numerical records, pictorial information, arch...","statistical–empirical analyses, empirical mode..."
256,https___doi.org_10.5194_wcd-1-261-2020,"reanalysis datasets, code availability, ERA-In...","Causal Effect Networks (CEN), regression analy..."
257,https___doi.org_10.5751_es-06528-190268,"Historical flood & drought records, lake surfa...","Granger Causality Analysis, regression analysi..."


In [30]:
tw03.to_csv("C:/Users/user/OneDrive/Desktop/causal_review_project/data/processed/paper_metadata.csv", sep=",", encoding='utf-8-sig')