In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
import os
import pandas as pd
import tiktoken
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import html
from datetime import datetime
from langchain_openai import OpenAIEmbeddings

In [None]:
# Folder that is scanned for .pdf files; this is a machine-specific absolute path.
pdf_folder = "C:/Users/yasar/Belgeler/articles"  # Path on your own computer

In [None]:
# Read the OpenAI API key from the environment (None when the variable is unset).
openai_api_key = os.environ.get("OPENAI_API_KEY_OZU")

In [None]:
# 3. Load the PDFs and split their text into chunks
all_docs = []
# os.listdir returns entries in arbitrary order, so sort the names to get the
# same document (and therefore chunk) order on every run and machine.
for filename in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so files such as "PAPER.PDF"
    # are not silently skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    loader = PyPDFLoader(os.path.join(pdf_folder, filename))
    docs = loader.load()
    # Record the originating PDF file name in each document's metadata
    for doc in docs:
        doc.metadata['source_file'] = filename
    all_docs.extend(docs)

text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=200)
split_docs = text_splitter.split_documents(all_docs)

# Check token counts (optional)
encoding = tiktoken.get_encoding("cl100k_base")
for i, doc in enumerate(split_docs[:10]):  # Show the first 10 chunks
    tokens = encoding.encode(doc.page_content)
    print(f"Chunk {i}: {len(tokens)} tokens")

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)


In [None]:
# Build the Chroma database in batches
from langchain.vectorstores import Chroma

print("Chroma veritabanı batch'ler halinde oluşturuluyor...")

batch_size = 100
persist_directory = "./chroma_mof_db"
db = None

# With no chunks the loop below never runs, `db` stays None, and the later
# db.persist() call would fail with an opaque AttributeError. Fail early with
# a message that points at the real cause instead.
if not split_docs:
    raise ValueError("split_docs is empty: there are no PDF chunks to index")

for i in range(0, len(split_docs), batch_size):
    batch = split_docs[i:i+batch_size]
    if db is None:
        # The first batch creates the persistent store ...
        db = Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory=persist_directory
        )
    else:
        # ... and every later batch is appended to it.
        db.add_documents(batch)
    print(f"{i+len(batch)}/{len(split_docs)} parça işlendi")

db.persist()
print(f"{len(split_docs)} parça başarıyla işlendi ve veritabanına kaydedildi")

In [None]:
# 5. Query for passages related to MOF toxicity (extended query set)
toxicity_queries = [
    "MOF toxicity and cytotoxicity in biological systems",
    "Toxic effects of metal-organic frameworks on cells",
    "MOF biocompatibility and safety assessment",
    "Cytotoxicity evaluation of metal-organic frameworks",
    "Adverse health effects of MOFs",
    "MOF-induced cell viability reduction",
    "Toxicological profile of MOFs",
    "MOF exposure and biological response",
    "Toxicity mechanisms of metal-organic frameworks",
    "MOF nanomaterials and their biocompatibility",
]

# Run every query and pool the hits for broader coverage
all_retrieved_docs = []
for query in toxicity_queries:
    retrieved_docs = db.similarity_search(query, k=4)
    all_retrieved_docs += retrieved_docs

# Keep only the first document seen for each distinct page_content
unique_docs, seen_content = [], set()
for doc in all_retrieved_docs:
    if doc.page_content in seen_content:
        continue
    seen_content.add(doc.page_content)
    unique_docs.append(doc)

print(f"Toplam {len(unique_docs)} benzersiz doküman bulundu")

# 6. Chat model (gpt-4o-mini) used to generate the multiple-choice questions
llm = ChatOpenAI(openai_api_key=openai_api_key, model_name="gpt-4o-mini")

In [None]:
def generate_mcq(context, source_file):
    """Ask the module-level ``llm`` to write one multiple-choice question.

    Args:
        context: Text passage that is embedded in the prompt under "Text:".
        source_file: Name that is embedded in the prompt on the "Source:" line.

    Returns:
        Whatever ``llm.predict`` returns for the assembled prompt.
    """
    mcq_prompt = f"""
Based on the following scientific text about MOF toxicity, generate a challenging multiple-choice question (MCQ) with 4 options (A, B, C, D) related to the toxicity, cytotoxicity, or biocompatibility of MOFs. The question and options should be clear, relevant, and plausible. Indicate the correct answer at the end.

Text:
\"\"\"{context}\"\"\"
Source: {source_file}

Output format:
Question: ...
A) ...
B) ...
C) ...
D) ...
Correct answer: ...
"""
    answer = llm.predict(mcq_prompt)
    return answer

# 7. Generate one MCQ per unique document and save the results to CSV
results = []
for doc in unique_docs:
    source_file = doc.metadata.get('source_file', 'Bilinmeyen kaynak')
    mcq = generate_mcq(doc.page_content, source_file)
    # Key order here determines the column order of the CSV.
    record = dict(
        context=doc.page_content,
        source_file=source_file,
        page_number=doc.metadata.get('page', 'Bilinmeyen'),
        mcq=mcq,
    )
    results.append(record)

df = pd.DataFrame(results)
df.to_csv("mof_toxicity_mcq_benchmark.csv", index=False)
print("CSV dosyası kaydedildi: mof_toxicity_mcq_benchmark.csv")

# 8. Build the HTML report (with highlighted content)
def highlight_keywords(text, keywords):
    """Wrap each keyword occurrence in ``text`` in a yellow, bold ``<span>``.

    All keywords are matched in a single pass, longest keyword first, so a
    keyword that contains a shorter one (e.g. "cytotoxicity" vs. "toxic") is
    highlighted as a whole instead of only its inner part, and already
    inserted markup is never re-scanned. Matching is case-sensitive.

    Args:
        text: String to search.
        keywords: Iterable of keyword strings; empty strings are ignored.

    Returns:
        ``text`` with every match wrapped in the highlight ``<span>``;
        ``text`` unchanged when there is no usable keyword.
    """
    import re  # local import: the notebook's import cell does not import re

    # Longest first: regex alternation takes the first alternative that
    # matches at a position, so this makes the longest keyword win.
    ordered = sorted({kw for kw in keywords if kw}, key=len, reverse=True)
    if not ordered:
        return text

    pattern = re.compile("|".join(re.escape(kw) for kw in ordered))
    return pattern.sub(
        lambda match: (
            '<span style="background-color: yellow; font-weight: bold;">'
            f'{match.group(0)}</span>'
        ),
        text,
    )

def generate_html_report(results):
    """Render the MCQ results as one standalone HTML document.

    The context, source file name and page number are HTML-escaped before
    they are placed in the page, so characters such as ``<`` or ``&`` in the
    extracted text cannot break or inject markup. The context is escaped
    first and highlighted afterwards, so only the highlight ``<span>`` tags
    remain as real markup.

    Args:
        results: Sized sequence of dicts with the keys 'context',
            'source_file', 'page_number' and 'mcq'.

    Returns:
        The complete HTML report as a string.
    """
    html_parts = [f"""
<!DOCTYPE html>
<html>
<head>
    <title>MOF Toxicity MCQ Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        .question-block {{ border: 1px solid #ddd; padding: 20px; margin: 20px 0; }}
        .context {{ background-color: #f5f5f5; padding: 15px; margin: 10px 0; }}
        .highlight {{ background-color: yellow; font-weight: bold; }}
        .source-info {{ color: #666; font-size: 0.9em; }}
        .mcq {{ background-color: #f9f9f9; padding: 10px; margin: 10px 0; }}
        h1 {{ color: #333; }}
        h2 {{ color: #555; }}
    </style>
</head>
<body>
    <h1>MOF Toxicity MCQ Benchmark Report</h1>
    <p>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    <p>Total questions: {len(results)}</p>
    <hr>
"""]

    toxicity_keywords = ["toxic", "toxicity", "cytotoxic", "biocompatible", "viability", "safety", "harmful", "cytotoxicity"]

    for i, result in enumerate(results, 1):
        # Escape first, then highlight: the keywords are plain letters, so
        # they still match inside the escaped text.
        safe_context = html.escape(result['context'])
        highlighted_context = highlight_keywords(safe_context, toxicity_keywords)
        # str() because the page number may be an int or a placeholder string.
        safe_source = html.escape(str(result['source_file']))
        safe_page = html.escape(str(result['page_number']))

        html_parts.append(f"""
        <div class="question-block">
            <h2>Question {i}</h2>
            <div class="source-info">
                <strong>Source:</strong> {safe_source} |
                <strong>Page:</strong> {safe_page}
            </div>

            <h3>Context:</h3>
            <div class="context">{highlighted_context}</div>

            <h3>Generated MCQ:</h3>
            <div class="mcq">
                <pre>{html.escape(result['mcq'])}</pre>
            </div>
        </div>
        <hr>
        """)

    html_parts.append("""
    </body>
    </html>
    """)

    # Join once instead of growing a string with += inside the loop.
    return "".join(html_parts)

# Render the report and write it to disk as UTF-8
html_report = generate_html_report(results)
report_path = "mof_toxicity_mcq_report.html"
with open(report_path, "w", encoding="utf-8") as report_file:
    report_file.write(html_report)
print("HTML raporu kaydedildi: mof_toxicity_mcq_report.html")
