In [54]:
from openai import OpenAI
from dotenv import load_dotenv
from random import randint
import os

In [60]:
# NOTE(review): `load_dataset` and `concatenate_datasets` are not imported in any
# visible cell -- presumably they come from the Hugging Face `datasets` package.
# Confirm the import exists so this cell survives Restart Kernel -> Run All.

# Real articles are tagged with label 0, fake articles with label 1.
real_data = load_dataset("csv", data_files="../datasets/FakeNewsDetection/fakenewsdetection_real.csv")["train"]
real_data = real_data.add_column("label", [0] * len(real_data))

fake_data = load_dataset("csv", data_files="../datasets/FakeNewsDetection/fakenewsdetection_fake.csv")["train"]
fake_data = fake_data.add_column("label", [1] * len(fake_data))

data = concatenate_datasets([real_data, fake_data])

NO_SAMPLES = 100
# A fixed seed keeps the evaluated subset -- and therefore the accuracy reported
# at the end of the notebook -- reproducible across runs. The previous
# `randint(1, 100)` seed selected a different subset on every execution.
SEED = 42
data = data.shuffle(seed=SEED).select(range(NO_SAMPLES))

In [61]:
def combine_title_and_text(data):
    """Add a ``full_text`` field made of the title and the body text.

    Args:
        data: A mapping-like row exposing ``"title"`` and ``"text"`` keys.
            The row is modified in place.

    Returns:
        The same ``data`` object, now also carrying ``"full_text"``
        (title, one space, text).
    """
    # A missing CSV cell can surface as None; treat it as an empty string so a
    # single incomplete row does not abort the whole map with a TypeError.
    title = data["title"] or ""
    text = data["text"] or ""
    data["full_text"] = title + " " + text
    return data

# Build `full_text` for every row, then drop the columns that are no longer
# needed so only `full_text` and `label` (plus any others) remain.
data = (
    data
    .map(combine_title_and_text)
    .remove_columns(["title", "text", "subject", "date"])
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [62]:

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# One prompt per sampled row: the article text plus its ground-truth label
# spelled out as "real" (label 0) or "fake" (anything else).
prompts = [
    dict(
        text=row["full_text"],
        label_orig="fake" if row["label"] != 0 else "real",
    )
    for row in data
]

# The API key is read from the OPENAI_SECRET_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_SECRET_KEY"))

responses = []


def detect_fake_news(prompts, api_client=None):
    """Ask the model to classify each prompt as 'real' or 'fake'.

    Args:
        prompts: Sequence of dicts with a ``"text"`` key (article text sent to
            the model) and a ``"label_orig"`` key (ground-truth label).
        api_client: Optional client exposing ``responses.create(...)``.
            Defaults to the module-level ``client``.

    Returns:
        A list with one dict per prompt, holding ``"prompt"``,
        ``"label_original"`` and ``"label_predicted"``. The same list is also
        bound to the module-level ``responses`` name, which later cells read.
    """
    # Later cells read the module-level `responses`, so keep publishing results
    # there -- but rebind it to a fresh list on every call so that calling the
    # function twice does not silently accumulate duplicate rows.
    global responses
    responses = []

    api_client = client if api_client is None else api_client
    total = len(prompts)  # actual workload, not the NO_SAMPLES constant

    for n, prompt in enumerate(prompts):
        print(f"{n} / {total}")
        response = api_client.responses.create(
            model="gpt-4.1",
            input=[
                {"role": "system", "content": "You are a fake news detector. Respond with only 'real' or 'fake'."},
                {
                    "role": "user",
                    "content": prompt["text"],
                },
            ],
        )
        # Normalise harmless formatting noise (surrounding whitespace, quotes,
        # trailing punctuation) so replies such as "Fake." or "'real'\n" are
        # still recognised as valid labels downstream.
        label_pred = response.output_text.strip().strip(".!'\"").lower()
        responses.append(
            {
                "prompt": prompt["text"],
                "label_original": prompt["label_orig"],
                "label_predicted": label_pred,
            }
        )
    return responses

# Classify every prompt. The returned list is the same object that the
# module-level `responses` name refers to after the call.
results = detect_fake_news(prompts)

0 / 100
1 / 100
2 / 100
3 / 100
4 / 100
5 / 100
6 / 100
7 / 100
8 / 100
9 / 100
10 / 100
11 / 100
12 / 100
13 / 100
14 / 100
15 / 100
16 / 100
17 / 100
18 / 100
19 / 100
20 / 100
21 / 100
22 / 100
23 / 100
24 / 100
25 / 100
26 / 100
27 / 100
28 / 100
29 / 100
30 / 100
31 / 100
32 / 100
33 / 100
34 / 100
35 / 100
36 / 100
37 / 100
38 / 100
39 / 100
40 / 100
41 / 100
42 / 100
43 / 100
44 / 100
45 / 100
46 / 100
47 / 100
48 / 100
49 / 100
50 / 100
51 / 100
52 / 100
53 / 100
54 / 100
55 / 100
56 / 100
57 / 100
58 / 100
59 / 100
60 / 100
61 / 100
62 / 100
63 / 100
64 / 100
65 / 100
66 / 100
67 / 100
68 / 100
69 / 100
70 / 100
71 / 100
72 / 100
73 / 100
74 / 100
75 / 100
76 / 100
77 / 100
78 / 100
79 / 100
80 / 100
81 / 100
82 / 100
83 / 100
84 / 100
85 / 100
86 / 100
87 / 100
88 / 100
89 / 100
90 / 100
91 / 100
92 / 100
93 / 100
94 / 100
95 / 100
96 / 100
97 / 100
98 / 100
99 / 100


In [66]:
# Remember how many responses we had before discarding unusable ones.
original_length = len(responses)

# Keep only the entries whose prediction is exactly one of the two labels.
responses = list(
    filter(
        lambda entry: entry["label_predicted"] in ("real", "fake"),
        responses,
    )
)

print(f"Original length: {original_length} - updated length: {len(responses)} - removed {original_length - len(responses)} item(s)")


Original length: 100 - updated length: 100 - removed 0 item(s)


In [68]:
# Score against the responses that are actually being evaluated. `responses`
# is reassigned to a filtered list in an earlier cell, so its length can be
# smaller than NO_SAMPLES; dividing by NO_SAMPLES would then report a count
# and a percentage over items that are no longer in the list.
evaluated = len(responses)
correctly_predicted = sum(
    response["label_original"] == response["label_predicted"]
    for response in responses
)
# Guard against an empty list (e.g. every prediction was filtered out).
accuracy = (correctly_predicted / evaluated) * 100 if evaluated else 0.0

print(f"{correctly_predicted} / {evaluated} - Accuracy: {accuracy:.2f}%")

73 / 100 - Accuracy: 73.00%
