In [1]:
import json
import os

import anthropic
import pandas as pd
from anthropic import Client

In [2]:
# Load the data-extraction workbook into a DataFrame (pandas default options).
file_path = './Dataextraktion_Durchführung.xlsx'
data = pd.read_excel(io=file_path)

In [3]:
# Select the relevant columns for the rows with index labels 0 through 524
# (.loc slices are label-based and include the end label).
# ID is cast to integer (no decimals); Title and RQ1 are cast to str, which
# turns missing values (NaN) into the string 'nan'. RQ2-RQ4 keep the dtypes
# they were read with.
data_all_RQ = data.loc[0:524, ['ID', 'Title', 'RQ1', 'RQ2', 'RQ3', 'RQ4']].astype(
    {'ID': int, 'Title': str, 'RQ1': str}
)



In [4]:
# Build one three-column frame (ID, Title, answer column) per research question.
data_RQ1, data_RQ2, data_RQ3, data_RQ4 = (
    data_all_RQ.loc[0:524, ['ID', 'Title', rq_column]]
    for rq_column in ('RQ1', 'RQ2', 'RQ3', 'RQ4')
)

# Convert each frame into a plain list of [ID, Title, answer] rows.
data_RQ1_list, data_RQ2_list, data_RQ3_list, data_RQ4_list = (
    rq_frame.values.tolist()
    for rq_frame in (data_RQ1, data_RQ2, data_RQ3, data_RQ4)
)

In [5]:
# System prompt shared by all four research-question requests.
systemprompt = (
    "You are an AI assistant conducting a systematic literature review. "
    "Your task is to summarize key insights from all the provided data "
    "and associate all sources to them."
)

In [6]:
# Prompt preparation for RQ1.
# This is an f-string: {data_RQ1_list} is interpolated as the Python repr of the
# list of [ID, Title, RQ1] rows, and the doubled braces {{ }} render as literal
# braces in the JSON example shown to the model.
prompt_RQ1 = f"""
    Extract the information from the 'data_list' containing a list of document IDs with their content to answer to 'review_questions' RQ1. Follow the instructions below!

    <review_questions>
            - **RQ1**: Which machine learning methods are used for deepfake detection? (e.g. Logistic Regression, Support Vector Machines (SVMs), Convolutional Neural Networks (CNNs), Generative Adversarial Networks (GANs) etc)  
    </review_questions>
    
    1. Review the data provided in <Data>.
    2. Ensure **each item** in the 'data_list' is processed and its content is considered.
        - For each document in the list, attempt to extract key findings related to 'review_questions' RQ1.
        - If no findings can be extracted from a specific document, include a note explaining the reason.
    3. Aggregate all insights related to 'RQ1' into distinct bullet points. For each finding:
        - Summarize the insight concisely.
        - Include a list of all IDs that contributed to that insight for traceability.
    4. Construct a structured JSON object using the format in <desired_output> to represent the findings.

    <Data>
    {data_RQ1_list}
    </Data>

    <instructions>
    - Ensure **every document ID in the list is processed** and accounted for, either by extracting insights or noting that no relevant data was found.
    - Ensure each key insight in the bullet points is distinct and directly related to RQ1.
    - Group IDs under the respective bullet point to show which data contributed to the insight.
    - If no insights can be drawn from the provided data, return an empty list for the insights and explain why for each unprocessed ID.
    </instructions>

    <desired_output>
    {{
        "Insights": [
            {{
                "insight": "Key finding here",
                "references": ["1", "25", "341"]
            }},
            {{
                "insight": "Another key finding here",
                "references": ["433"]
            }}
        ],
        "Unprocessed": [
            {{
                "ID": "7",
                "reason": "No relevant data found in the document."
            }}
        ]
    }}
    </desired_output>

    Return only the JSON object in the format specified above.
    """

In [7]:
# Prompt preparation for RQ2.
# This is an f-string: {data_RQ2_list} is interpolated as the Python repr of the
# list of [ID, Title, RQ2] rows, and the doubled braces {{ }} render as literal
# braces in the JSON example shown to the model.
prompt_RQ2 = f"""
    Extract the information from the 'data_list' containing a list of document IDs with their content to answer to 'review_questions' RQ2. Follow the instructions below!

    <review_questions>
            - **RQ2**: Which machine learning methods are recommended (and thus particularly suitable)? (e.g. Logistic Regression, Support Vector Machines (SVMs), Convolutional Neural Networks (CNNs), Generative Adversarial Networks (GANs) etc)
    </review_questions>
    
    1. Review the data provided in <Data>.
    2. Ensure **each item** in the 'data_list' is processed and its content is considered.
        - For each document in the list, attempt to extract key findings related to 'review_questions' RQ2.
        - If no findings can be extracted from a specific document, include a note explaining the reason.
    3. Aggregate all insights related to 'RQ2' into distinct bullet points. For each finding:
        - Summarize the insight concisely.
        - Include a list of all IDs that contributed to that insight for traceability.
    4. Construct a structured JSON object using the format in <desired_output> to represent the findings.

    <Data>
    {data_RQ2_list}
    </Data>

    <instructions>
    - Ensure **every document ID in the list is processed** and accounted for, either by extracting insights or noting that no relevant data was found.
    - Ensure each key insight in the bullet points is distinct and directly related to RQ2.
    - Group IDs under the respective bullet point to show which data contributed to the insight.
    - If no insights can be drawn from the provided data, return an empty list for the insights and explain why for each unprocessed ID.
    </instructions>

    <desired_output>
    {{
        "Insights": [
            {{
                "insight": "Key finding here",
                "references": ["1", "25", "341"]
            }},
            {{
                "insight": "Another key finding here",
                "references": ["433"]
            }}
        ],
        "Unprocessed": [
            {{
                "ID": "7",
                "reason": "No relevant data found in the document."
            }}
        ]
    }}
    </desired_output>

    Return only the JSON object in the format specified above.
    """

In [8]:
# Prompt preparation for RQ3.
# This is an f-string: {data_RQ3_list} is interpolated as the Python repr of the
# list of [ID, Title, RQ3] rows, and the doubled braces {{ }} render as literal
# braces in the JSON example shown to the model.
prompt_RQ3 = f"""
    Extract the information from the 'data_list' containing a list of document IDs with their content to answer to 'review_questions' RQ3. Follow the instructions below!

    <review_questions>
            - **RQ3**: What challenges exist in detecting deepfakes using machine learning approaches? (e.g. outdated or limited datasets, Generalization issues across datasets, Rapid evolution of deepfake techniques, High computational cost of detection models, Difficulty in detecting low-quality or compressed media etc)
    </review_questions>
    
    1. Review the data provided in <Data>.
    2. Ensure **each item** in the 'data_list' is processed and its content is considered.
        - For each document in the list, attempt to extract key findings related to 'review_questions' RQ3.
        - If no findings can be extracted from a specific document, include a note explaining the reason.
    3. Aggregate all insights related to 'RQ3' into distinct bullet points. For each finding:
        - Summarize the insight concisely.
        - Include a list of all IDs that contributed to that insight for traceability.
    4. Construct a structured JSON object using the format in <desired_output> to represent the findings.

    <Data>
    {data_RQ3_list}
    </Data>

    <instructions>
    - Ensure **every document ID in the list is processed** and accounted for, either by extracting insights or noting that no relevant data was found.
    - Ensure each key insight in the bullet points is distinct and directly related to RQ3.
    - Group IDs under the respective bullet point to show which data contributed to the insight.
    - If no insights can be drawn from the provided data, return an empty list for the insights and explain why for each unprocessed ID.
    </instructions>

    <desired_output>
    {{
        "Insights": [
            {{
                "insight": "Key finding here",
                "references": ["1", "25", "341"]
            }},
            {{
                "insight": "Another key finding here",
                "references": ["433"]
            }}
        ],
        "Unprocessed": [
            {{
                "ID": "7",
                "reason": "No relevant data found in the document."
            }}
        ]
    }}
    </desired_output>

    Return only the JSON object in the format specified above.
    """

In [9]:
# Prompt preparation for RQ4.
# This is an f-string: {data_RQ4_list} is interpolated as the Python repr of the
# list of [ID, Title, RQ4] rows, and the doubled braces {{ }} render as literal
# braces in the JSON example shown to the model.
prompt_RQ4 = f"""
    Extract the information from the 'data_list' containing a list of document IDs with their content to answer to 'review_questions' RQ4. Follow the instructions below!

    <review_questions>
            - **RQ4**: What are the use cases for deepfake detection? (e.g., COVID-19 masks, medicine, media, politics etc)  
    </review_questions>
    
    1. Review the data provided in <Data>.
    2. Ensure **each item** in the 'data_list' is processed and its content is considered.
        - For each document in the list, attempt to extract key findings related to 'review_questions' RQ4.
        - If no findings can be extracted from a specific document, include a note explaining the reason.
    3. Aggregate all insights related to 'RQ4' into distinct bullet points. For each finding:
        - Summarize the insight concisely.
        - Include a list of all IDs that contributed to that insight for traceability.
    4. Construct a structured JSON object using the format in <desired_output> to represent the findings.

    <Data>
    {data_RQ4_list}
    </Data>

    <instructions>
    - Ensure **every document ID in the list is processed** and accounted for, either by extracting insights or noting that no relevant data was found.
    - Ensure each key insight in the bullet points is distinct and directly related to RQ4.
    - Group IDs under the respective bullet point to show which data contributed to the insight.
    - If no insights can be drawn from the provided data, return an empty list for the insights and explain why for each unprocessed ID.
    </instructions>

    <desired_output>
    {{
        "Insights": [
            {{
                "insight": "Key finding here",
                "references": ["1", "25", "341"]
            }},
            {{
                "insight": "Another key finding here",
                "references": ["433"]
            }}
        ],
        "Unprocessed": [
            {{
                "ID": "7",
                "reason": "No relevant data found in the document."
            }}
        ]
    }}
    </desired_output>

    Return only the JSON object in the format specified above.
    """

In [10]:
# Create the Claude API client.
# The key is read from the ANTHROPIC_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here
# rather than failing later on the first request.
# SECURITY: the key that used to be hardcoded on this line has been exposed and
# must be treated as compromised - revoke it and issue a new one.
client = Client(api_key=os.environ["ANTHROPIC_API_KEY"])

In [11]:
# Query for RQ1: send the prepared prompt to the model (temperature 0).
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=2048,
    system=systemprompt,
    temperature=0,
    messages=[
        {"role": "user", "content": prompt_RQ1}
    ]
)

# A reply that stopped at the max_tokens limit is cut-off JSON; fail with a
# clear message instead of an opaque JSONDecodeError below.
if response.stop_reason == "max_tokens":
    raise RuntimeError("RQ1: model response was truncated at max_tokens; the JSON is incomplete.")

# Parse the JSON output.
# The model may wrap the object in prose or a Markdown code fence, so only the
# span from the first "{" to the last "}" is parsed. If no braces are found the
# slice is empty and json.loads raises JSONDecodeError.
response_data = response.content[0].text
json_output = json.loads(
    response_data[response_data.find("{"):response_data.rfind("}") + 1]
)

# Convert the insights into rows for a DataFrame.
insights = json_output.get("Insights", [])
output_data = [
    {
        "Insight": entry.get("insight", ""),
        # IDs may come back as numbers instead of strings; str() keeps join() from raising.
        "References": ", ".join(str(ref) for ref in entry.get("references", [])),
    }
    for entry in insights
]

# Explicit columns keep the header row even when no insights were returned.
output_df = pd.DataFrame(output_data, columns=["Insight", "References"])

# Save the DataFrame as an Excel file.
output_excel_path = "./Output_RQ1.xlsx"
output_df.to_excel(output_excel_path, index=False)

print("Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.")


Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.


In [12]:
# Query for RQ2: send the prepared prompt to the model (temperature 0).
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=2048,
    system=systemprompt,
    temperature=0,
    messages=[
        {"role": "user", "content": prompt_RQ2}
    ]
)

# A reply that stopped at the max_tokens limit is cut-off JSON; fail with a
# clear message instead of an opaque JSONDecodeError below.
if response.stop_reason == "max_tokens":
    raise RuntimeError("RQ2: model response was truncated at max_tokens; the JSON is incomplete.")

# Parse the JSON output.
# The model may wrap the object in prose or a Markdown code fence, so only the
# span from the first "{" to the last "}" is parsed. If no braces are found the
# slice is empty and json.loads raises JSONDecodeError.
response_data = response.content[0].text
json_output = json.loads(
    response_data[response_data.find("{"):response_data.rfind("}") + 1]
)

# Convert the insights into rows for a DataFrame.
insights = json_output.get("Insights", [])
output_data = [
    {
        "Insight": entry.get("insight", ""),
        # IDs may come back as numbers instead of strings; str() keeps join() from raising.
        "References": ", ".join(str(ref) for ref in entry.get("references", [])),
    }
    for entry in insights
]

# Explicit columns keep the header row even when no insights were returned.
output_df = pd.DataFrame(output_data, columns=["Insight", "References"])

# Save the DataFrame as an Excel file.
output_excel_path = "./Output_RQ2.xlsx"
output_df.to_excel(output_excel_path, index=False)

print("Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.")


Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.


In [13]:
# Query for RQ3: send the prepared prompt to the model (temperature 0).
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=2048,
    system=systemprompt,
    temperature=0,
    messages=[
        {"role": "user", "content": prompt_RQ3}
    ]
)

# A reply that stopped at the max_tokens limit is cut-off JSON; fail with a
# clear message instead of an opaque JSONDecodeError below.
if response.stop_reason == "max_tokens":
    raise RuntimeError("RQ3: model response was truncated at max_tokens; the JSON is incomplete.")

# Parse the JSON output.
# The model may wrap the object in prose or a Markdown code fence, so only the
# span from the first "{" to the last "}" is parsed. If no braces are found the
# slice is empty and json.loads raises JSONDecodeError.
response_data = response.content[0].text
json_output = json.loads(
    response_data[response_data.find("{"):response_data.rfind("}") + 1]
)

# Convert the insights into rows for a DataFrame.
insights = json_output.get("Insights", [])
output_data = [
    {
        "Insight": entry.get("insight", ""),
        # IDs may come back as numbers instead of strings; str() keeps join() from raising.
        "References": ", ".join(str(ref) for ref in entry.get("references", [])),
    }
    for entry in insights
]

# Explicit columns keep the header row even when no insights were returned.
output_df = pd.DataFrame(output_data, columns=["Insight", "References"])

# Save the DataFrame as an Excel file.
output_excel_path = "./Output_RQ3.xlsx"
output_df.to_excel(output_excel_path, index=False)

print("Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.")


Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.


In [14]:
# Query for RQ4: send the prepared prompt to the model (temperature 0).
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=2048,
    system=systemprompt,
    temperature=0,
    messages=[
        {"role": "user", "content": prompt_RQ4}
    ]
)

# A reply that stopped at the max_tokens limit is cut-off JSON; fail with a
# clear message instead of an opaque JSONDecodeError below.
if response.stop_reason == "max_tokens":
    raise RuntimeError("RQ4: model response was truncated at max_tokens; the JSON is incomplete.")

# Parse the JSON output.
# The model may wrap the object in prose or a Markdown code fence, so only the
# span from the first "{" to the last "}" is parsed. If no braces are found the
# slice is empty and json.loads raises JSONDecodeError.
response_data = response.content[0].text
json_output = json.loads(
    response_data[response_data.find("{"):response_data.rfind("}") + 1]
)

# Convert the insights into rows for a DataFrame.
insights = json_output.get("Insights", [])
output_data = [
    {
        "Insight": entry.get("insight", ""),
        # IDs may come back as numbers instead of strings; str() keeps join() from raising.
        "References": ", ".join(str(ref) for ref in entry.get("references", [])),
    }
    for entry in insights
]

# Explicit columns keep the header row even when no insights were returned.
output_df = pd.DataFrame(output_data, columns=["Insight", "References"])

# Save the DataFrame as an Excel file.
output_excel_path = "./Output_RQ4.xlsx"
output_df.to_excel(output_excel_path, index=False)

print("Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.")


Verarbeitung abgeschlossen und Excel-Datei heruntergeladen.
