In [19]:
import os
import re
import json
import numpy as np
import pandas as pd
from config import Config
import openai
from openai import OpenAI
from dotenv import load_dotenv
from topic_modeling.topic_modeling import TopicModeling

In [15]:
api_key = Config.OPENAI_API_KEY

In [20]:
load_dotenv()

True

In [21]:
api_key = os.getenv("OPENAI_API_KEY_Ahmed")

In [24]:
# Confirm the key was loaded without echoing the secret into the saved notebook output.
# NOTE(review): an earlier revision of this cell displayed the full key in its
# output; that key should be treated as compromised and revoked/rotated.
api_key is not None

True

In [22]:
client = OpenAI(api_key=api_key)

In [25]:
# Smoke test: send one trivial question through the client to confirm that the
# credentials and the chat-completions endpoint work.
prompt = """
    What is the capital of Australia?
    """
response = client.chat.completions.create(
    temperature=0.7,
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are an expert that provides multiple perspectives on statements."},
        {"role": "user", "content": prompt},
    ],
)

In [27]:
response_text = response.choices[0].message.content.strip()

In [28]:
response_text

'The capital of Australia is Canberra. This city was selected as a compromise between Sydney and Melbourne, two major cities that were both vying to be the capital. Located in the Australian Capital Territory, Canberra is the political center of the country, housing important government institutions such as the Parliament House and the High Court of Australia.'

In [None]:
class GptTopicModeling(TopicModeling):
    """
    Topic modeling, summarization and article generation backed by the OpenAI
    chat-completions API.

    Topics are generated dynamically from the input text by prompting the
    configured chat model (``gpt-4o`` by default) and parsing its JSON reply.
    """
    # NOTE: the ``api_key`` default below is the notebook-level ``api_key``
    # variable, captured once when this class statement runs; re-assigning that
    # variable afterwards does not change the default.
    def __init__(self, api_key: str=api_key, num_topics: int = 5, model_name: str = "gpt-4o"):
        """
        Parameters:
        - api_key: OpenAI API key used to build the client.
        - num_topics: Minimum number of topics requested from the model.
        - model_name: Chat model used for every request made by this instance.
        """
        self.num_topics = num_topics
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name

    def preprocess(self, text):
        """Clean the input text by removing links, tags, emojis and punctuation."""
        text = re.sub(r"http\S+", "", text)  # Remove links.
        text = re.sub(r"@\S+", "", text)  # Remove tags.
        # Intended to drop a leading "author:" prefix.
        # NOTE(review): the pattern is not anchored, so every "word:" token in
        # the text is removed (e.g. "10:30" becomes "30") — confirm whether it
        # should be restricted to the start of the text.
        text = re.sub(r"\w+:\s?", "", text)
        text = self.__remove_emojis(text)

        # Second pass: stripping emojis can leave an "@" directly in front of a
        # word that the first pass could not see.
        text = re.sub(r"@\w+", "", text)  # Remove mentions
        text = re.sub(r"#\w+", "", text)  # Remove hashtags
        text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove non-alphanumeric characters
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        return text

    def __remove_emojis(self, text):
        """Remove emojis from text."""
        # NOTE: the last range (U+24C2..U+1F251) is very wide and also covers
        # many non-emoji characters, including CJK text.
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # Emoticons.
                                   u"\U0001F300-\U0001F5FF"  # Symbols & pictographs.
                                   u"\U0001F680-\U0001F6FF"  # Transport & map symbols.
                                   u"\U0001F1E0-\U0001F1FF"  # Flags.
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def extract_json(self, response_text):
        """
        Extract and parse the JSON object embedded in ``response_text``.

        Takes the span from the first "{" to the last "}" and parses it.
        Returns the parsed object, or None when no braces are found or the
        span is not valid JSON (the decode error is printed).
        """
        try:
            # Use a regex to find the JSON object within the response
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                return json.loads(json_str)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
        return None

    def clean_response(self, response):
        """
        Best-effort normalization of a model reply into parseable JSON text.

        This is lossy: all newlines and backslashes are dropped, every single
        quote becomes a double quote, and the resulting '"s ' sequences (from
        possessives) are replaced by a space.
        """
        response = response.replace("\n", "")
        cleaned_str = re.sub(r"```json|```", "", response)  # Remove backticks and "```json"
        cleaned_str = re.sub(r"\\", "", cleaned_str)  # Remove backslashes
        cleaned_str = re.sub(r"\'", '"', cleaned_str)  # Replace every single quote with a double quote
        cleaned_str = re.sub(r'"s ', ' ', cleaned_str)  # Replace "s with space
        cleaned_str = re.sub(r"\s+", " ", cleaned_str)  # Replace multiple spaces with single space
        return cleaned_str.strip()

    def _parse_json_payload(self, response_text):
        """Parse the model reply, preferring a lossless parse over the lossy cleanup."""
        unfenced = re.sub(r"```json|```", "", response_text)
        candidate = re.search(r"\{.*\}", unfenced, re.DOTALL)
        if candidate:
            try:
                return json.loads(candidate.group(0))
            except json.JSONDecodeError:
                pass  # Not strict JSON; fall through to the legacy cleanup.
        return self.extract_json(self.clean_response(response_text))

    def get_topics(self, doc):
        """
        Generate topics dynamically using the configured OpenAI chat model.

        Returns a 4-tuple ``(probs, keywords, topics, topics_and_keywords)``:
        - probs: dict mapping topic name to its percentage.
        - keywords: dict mapping topic name to its keyword list.
        - topics: list of topic names.
        - topics_and_keywords: the raw list of topic dicts from the reply.

        On any failure (API error, unparseable reply, missing keys) the error
        is printed and ``(None, None, None, None)`` is returned.
        """
        preprocessed_doc = self.preprocess(doc)
        # The format example uses double quotes so the requested output is
        # valid JSON that can be parsed without the lossy cleanup.
        prompt = f"""
        Analyze the following text and provide at least {self.num_topics} most suitable topics for it, along with their percentages and the relevant 15 keywords.
        Text:
        {preprocessed_doc}
        Format the output as a JSON object: {{"topics": [{{"name": "Topic1", "percentage": 25.0, "keywords": ["keyword1", "keyword2", ...]}}, ...]}}
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a helpful NLP assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7
            )
            response_text = response.choices[0].message.content.strip()
            data = self._parse_json_payload(response_text)

            if not data:
                raise ValueError("No valid JSON extracted from model response.")

            topics_and_keywords = data["topics"]
            topics = [topic["name"] for topic in topics_and_keywords]
            probs = {topic["name"]: topic["percentage"] for topic in topics_and_keywords}
            keywords = {topic["name"]: topic["keywords"] for topic in topics_and_keywords}

            return probs, keywords, topics, topics_and_keywords
        except Exception as e:
            print(f"Error in get_topics: {e}")
            return None, None, None, None

    def get_topics_test(self, doc):
        """
        Mock version of the get_topics method for testing without calling the API.
        Generates hardcoded topics and keywords as a response.

        Parameters:
        - doc: Input document (not used in this test version).

        Returns:
        - probs: Hardcoded probabilities for each topic.
        - keywords: Hardcoded keywords for each topic.
        - topics: List of topic names.
        - topics_and_keywords: List of topic dicts built from the values above.
        """
        # Example hardcoded response
        probs = {
            "Topic 1": 30.0,
            "Topic 2": 25.0,
            "Topic 3": 20.0,
            "Topic 4": 15.0,
            "Topic 5": 10.0
        }

        keywords = {
            "Topic 1": ["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"],
            "Topic 2": ["keyword6", "keyword7", "keyword8", "keyword9", "keyword10"],
            "Topic 3": ["keyword11", "keyword12", "keyword13", "keyword14", "keyword15"],
            "Topic 4": ["keyword16", "keyword17", "keyword18", "keyword19", "keyword20"],
            "Topic 5": ["keyword21", "keyword22", "keyword23", "keyword24", "keyword25"]
        }

        topics = list(probs.keys())

        topics_and_keywords = [
            {"name": topic, "percentage": probs[topic], "keywords": keywords[topic]} for topic in topics
        ]

        return probs, keywords, topics, topics_and_keywords

    def check_topic_count(self, num_topics):
        """Set the number of topics requested by subsequent get_topics calls."""
        self.num_topics = num_topics

    ##Summarization Zone:

    def summarize_with_hint(self, text, topics_and_keywords):
        """
        Generate one summary of ``text`` per topic using the chat model.

        Each entry of ``topics_and_keywords`` must have "name" and "keywords";
        the first five keywords are passed to the model as a hint. Returns a
        list of {"topic", "summary"} dicts. A topic whose request fails is
        reported via print and left out of the result.
        """
        summaries = []
        for topic in topics_and_keywords:
            topic_name = topic["name"]
            keywords = ', '.join(topic["keywords"][:5])  # Use top 5 keywords
            prompt = f"""
            Summarize the following text in 2-3 sentences, focusing on the provided topic and keywords.
            Hint: Topic - {topic_name}; Keywords - {keywords}.

            Ensure the summary is concise and captures the main points of the text without adding many details.
            If the text is very short, condense the summary into a single, clear, and well-structured sentence.
            Avoid repeating keywords unnecessarily or adding unrelated details.

            Text: {text}
            """
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are a summarization assistant."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7
                )
                summaries.append({
                    "topic": topic_name,
                    "summary": response.choices[0].message.content.strip()
                })
            except Exception as e:
                print(f"Error summarizing for topic {topic_name}: {e}")
        return summaries

    def summarize_with_hint_test(self, text, topics_and_keywords):
        """Mock version of summarize_with_hint for testing purposes."""
        summaries = []
        for topic in topics_and_keywords:
            topic_name = topic["name"]
            keywords = ', '.join(topic["keywords"][:5])  # Use top 5 keywords
            mock_summary = f"This is a mock summary for the topic '{topic_name}' with focus on keywords: {keywords}."
            summaries.append({"topic": topic_name, "summary": mock_summary})
        return summaries

    def generate_news_article(self, topic, summary, keywords, style):
        """
        Generate a long news article based on topic, summary, keywords, and style.

        ``style`` selects one of the tone instructions below; an unknown style
        falls back to "formal" (with a printed notice) instead of raising.
        Returns the article text, or "" when the request fails.
        """
        style_prompt = {
            "formal": "Use a professional and objective tone, suitable for a reputable news outlet.",
            "academic": "Write in an academic style, using analytical and precise language.",
            "gen_z": "Use a Gen Z tone, be witty, include pop culture references, and keep it conversational.",
            "narrative": "Write in a storytelling style, with vivid descriptions and engaging narrative techniques.",
            "persuasive": "Write persuasively, using emotional and motivational language.",
            "satirical": "Write in a satirical tone, with humor and irony to critique the subject.",
            "conversational": "Use a friendly and informal conversational tone.",
            "poetic": "Write in a poetic style, with metaphorical and rhythmic language.",
            "investigative": "Write in an investigative tone, presenting facts systematically and focusing on analysis."
        }

        # Mirror the mock implementation, which tolerates unknown styles.
        tone_instructions = style_prompt.get(style)
        if tone_instructions is None:
            print(f"Unknown style '{style}'; falling back to 'formal'.")
            tone_instructions = style_prompt["formal"]

        prompt = (
            f"You are a professional journalist writing for a major news outlet. Your goal is to craft a compelling and detailed news article.\n\n"
            f"**Topic**: {topic}\n\n"
            f"**Summary**: {summary}\n\n"
            f"**Key Points and Keywords**:\n- " + "\n- ".join(keywords) + "\n\n"
            f"**Style**: {tone_instructions}\n\n"
            f"**Requirements**:\n"
            f"1. Write a long, engaging news article (at least 800 words).\n"
            f"2. Include an attention-grabbing headline at the beginning.\n"
            f"3. Expand upon the provided summary using the listed keywords. Use them naturally throughout the article.\n"
            f"4. Include historical context, background, or analysis where relevant.\n"
            f"5. Use the specified style and tone throughout the article.\n\n"
            f"Start your response with the headline, followed by the full article."
        )

        print(f"Generating article for topic: {topic} in {style} style")
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a creative journalist."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8
            )
            print(f"Article generated for topic: {topic}")
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error generating article for topic {topic}: {e}")
            return ""

    def generate_news_article_test(self, topic, summary, keywords, style):
        """Mock version of generate_news_article for testing purposes."""
        style_descriptions = {
            "formal": "a professional and objective tone, suitable for a reputable news outlet.",
            "academic": "an academic style, using analytical and precise language.",
            "gen_z": "a Gen Z tone, witty and conversational with pop culture references.",
            "narrative": "a storytelling style with vivid descriptions.",
            "persuasive": "a persuasive tone, using emotional and motivational language.",
            "satirical": "a satirical tone with humor and irony.",
            "conversational": "a friendly and informal conversational tone.",
            "poetic": "a poetic style, using metaphorical and rhythmic language.",
            "investigative": "an investigative tone, presenting facts systematically and focusing on analysis."
        }

        style_description = style_descriptions.get(style, "a general style")

        article = f"""
        **Headline**: Breaking News: {topic}

        In a remarkable development, {summary}

        This article explores the topic of "{topic}" with a focus on the following key points:
        - {', '.join(keywords)}

        Written in {style_description}, this piece dives into the intricacies of {topic}, providing insights and detailed analysis.
        """
        return article.strip()

### Claim Verification test zone

### Sentence segmentation and Claim decomposition

In [11]:
import nltk
import ssl
# Make sure the Punkt sentence-tokenizer data is available, downloading it only
# when it is missing. The unverified-HTTPS workaround (needed on machines whose
# Python lacks usable root certificates) is applied only for the duration of the
# download and then undone, so certificate checks stay on for the rest of the
# kernel session.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    _default_https_context = ssl._create_default_https_context
    _unverified_https_context = getattr(ssl, "_create_unverified_context", None)
    if _unverified_https_context is not None:
        ssl._create_default_https_context = _unverified_https_context
    try:
        nltk.download("punkt")
    finally:
        ssl._create_default_https_context = _default_https_context
from nltk.tokenize import sent_tokenize

def segment_article_into_sentences(article):
    """Split ``article`` into sentences using NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(article)
    return sentence_list

[nltk_data] Downloading package punkt to /Users/rick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
def classify_sentences_as_claims(sentences, model="gpt-4", temperature=0.3):
    """
    Ask the chat model which sentences are verifiable factual claims.

    Parameters:
    - sentences: Sequence of sentence strings.
    - model: Chat model name passed to the API.
    - temperature: Sampling temperature passed to the API.

    Returns the sentences the model answered YES for, in their original order.
    Uses the notebook-level ``client``. An empty input returns [] without
    calling the API.
    """
    if not sentences:
        return []

    numbered = "".join(
        f"{position + 1}. {sentence.strip()}\n"
        for position, sentence in enumerate(sentences)
    )
    prompt = (
        "Below is a list of sentences. For each one, answer YES if it is a factual claim that can be verified (true or false), and NO otherwise.\n\n"
        + numbered
        + "\nAnswer as: 1. YES, 2. NO, ..."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    output = response.choices[0].message.content.strip()

    # Match each verdict to its sentence by the number the model wrote, so the
    # result is the same whether the answers arrive on one line
    # ("1. YES, 2. NO") or one per line, and out-of-range numbers are ignored.
    verdicts = {}
    for match in re.finditer(r"(\d+)\s*[.):\-]?\s*(YES|NO)\b", output, flags=re.IGNORECASE):
        position = int(match.group(1)) - 1
        if 0 <= position < len(sentences):
            verdicts.setdefault(position, match.group(2).upper() == "YES")

    if not verdicts:
        # No numbered answers: fall back to one verdict per non-empty line.
        answer_lines = [line for line in output.splitlines() if line.strip()]
        for position, line in enumerate(answer_lines[:len(sentences)]):
            verdicts[position] = re.search(r"\bYES\b", line, flags=re.IGNORECASE) is not None

    return [sentence for position, sentence in enumerate(sentences) if verdicts.get(position)]


In [15]:
article="The recent advancements in artificial intelligence have been remarkable. OpenAI released GPT-4 in March 2023, marking a significant milestone in language model development. What does this mean for the future of human creativity? Machine learning algorithms now process over 2.5 quintillion bytes of data daily across the internet. How fascinating it would be if we could harness this computational power for solving climate change! The global AI market is projected to reach $1.8 trillion by 2030. Consider the ethical implications of such rapid technological growth. Google's headquarters are located in Mountain View, California, where the company employs approximately 170,000 people worldwide. Imagine a world where artificial intelligence could predict natural disasters with 99% accuracy."

In [16]:
# Step 1: Segment
sentences = segment_article_into_sentences(article)

In [17]:
sentences

['The recent advancements in artificial intelligence have been remarkable.',
 'OpenAI released GPT-4 in March 2023, marking a significant milestone in language model development.',
 'What does this mean for the future of human creativity?',
 'Machine learning algorithms now process over 2.5 quintillion bytes of data daily across the internet.',
 'How fascinating it would be if we could harness this computational power for solving climate change!',
 'The global AI market is projected to reach $1.8 trillion by 2030.',
 'Consider the ethical implications of such rapid technological growth.',
 "Google's headquarters are located in Mountain View, California, where the company employs approximately 170,000 people worldwide.",
 'Imagine a world where artificial intelligence could predict natural disasters with 99% accuracy.']

In [26]:
# Step 2: Classify which ones are claims
claims = classify_sentences_as_claims(sentences)

In [27]:
claims

['OpenAI released GPT-4 in March 2023, marking a significant milestone in language model development.',
 'Machine learning algorithms now process over 2.5 quintillion bytes of data daily across the internet.',
 'The global AI market is projected to reach $1.8 trillion by 2030.',
 "Google's headquarters are located in Mountain View, California, where the company employs approximately 170,000 people worldwide."]

In [42]:
def decompose_claim(claim: str, num_questions: int = 5, model: str = "gpt-4", temperature: float = 0.7):
    """
    Ask the chat model for yes/no sub-questions that help verify ``claim``.

    Parameters:
    - claim: The claim to decompose.
    - num_questions: Number of sub-questions requested in the prompt.
    - model: Chat model name passed to the API.
    - temperature: Sampling temperature passed to the API.

    Returns a list of question strings with any leading "N." / "N)" numbering
    removed; lines that are empty after that are dropped. Uses the
    notebook-level ``client``.
    """
    prompt = f"""
            You are a fact-checking assistant. Generate {num_questions} yes or no questions to help me answer if the given claim is true or false.

Claim: "{claim}"

Subquestions:
1.
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=temperature,
        stop=None
    )

    questions = []
    for line in response.choices[0].message.content.strip().splitlines():
        # Remove only a leading list number; digits or periods elsewhere in the
        # question (e.g. a trailing year) must survive.
        question = re.sub(r"^\s*\d+\s*[.)]\s*", "", line).strip()
        if question:
            questions.append(question)
    return questions


In [29]:
claim=claims[0]

In [43]:
subqs = decompose_claim(claim)

In [44]:
subqs

['Did OpenAI actually release GPT-4 in March 2023?',
 'Is GPT-4 a language model developed by OpenAI?',
 'Was the release of GPT-4 considered a significant milestone in language model development?',
 'Was there any official announcement or documentation from OpenAI about the release of GPT-4 in March 2023?',
 'Was there any significant technological advancement or features introduced in GPT-4 compared to its predecessors?']

### First Stage Web Retrieval

In [87]:
from serpapi.google_search import GoogleSearch
from readability.readability import Document
import requests
from bs4 import BeautifulSoup

In [107]:
import requests
from readability.readability import Document
from bs4 import BeautifulSoup

def extract_readable_text(url):
    """
    Download ``url`` and return its main readable text.

    The page is fetched with a browser-like User-Agent and a 10 second
    timeout, reduced to its main content with readability's ``Document`` and
    flattened to plain text with BeautifulSoup.

    Returns the text, or None when the response status is not 200, the
    extracted text has fewer than 30 words, or any exception occurs (each
    case prints a short diagnostic).
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0 Safari/537.36"
        )
    }
    try:
        # The context manager releases the session's connections; the response
        # body is already fully read at this point (no streaming).
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 403:
            print(f"[403] Forbidden (likely anti-bot): {url}")
            return None
        if response.status_code != 200:
            print(f"[{response.status_code}] Cannot access: {url}")
            return None

        # When the server declares no charset, requests assumes ISO-8859-1 for
        # text responses, which turns UTF-8 punctuation into mojibake. Use the
        # encoding detected from the body in that case.
        declared_encoding = (response.encoding or "").lower()
        if declared_encoding in ("", "iso-8859-1"):
            detected_encoding = response.apparent_encoding
            if detected_encoding:
                response.encoding = detected_encoding

        html = response.text
        doc = Document(html)
        summary_html = doc.summary()
        soup = BeautifulSoup(summary_html, "html.parser")
        text = soup.get_text(separator=" ", strip=True)

        if len(text.split()) < 30:
            print(f"[Skipped] Too short: {url}")
            return None

        return text

    except Exception as e:
        print(f"[Error] {url} -> {str(e)}")
        return None


In [108]:
from newspaper import Article

def extract_with_newspaper(url):
    """
    Fetch ``url`` with newspaper's ``Article`` and return the parsed body text.

    Returns None (after printing a diagnostic) when the text has fewer than
    30 words or when downloading/parsing raises.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        body = page.text
        word_count = len(body.split())
        if word_count >= 30:
            return body
        print(f"[Skipped] Too short: {url}")
        return None
    except Exception as err:
        print(f"[Error] {url} -> {err}")
        return None

In [109]:
def extract_readable_text_combined(url):
    """
    Return the readable text of ``url``.

    Tries ``extract_readable_text`` first and uses ``extract_with_newspaper``
    only when the first extractor yields nothing.
    """
    return extract_readable_text(url) or extract_with_newspaper(url)

In [None]:
# (Removed) A credential-like token had been pasted into this cell; as code it
# is an undefined name and breaks Restart & Run All. Keep secrets in the
# environment / .env file, and revoke the removed token if it was a real key.

In [97]:
import requests

def google_custom_search(query, api_key, cx, num_results=5):
    """
    Run a query against the Google Custom Search JSON API.

    Parameters:
    - query: Search string.
    - api_key: Google API key.
    - cx: Programmable Search Engine ID.
    - num_results: Number of results requested; clamped to 1..10, the range
      the API accepts per request.

    Returns a list of {"title", "link", "snippet"} dicts, empty when the
    response has no "items". Raises ``requests.HTTPError`` on a non-2xx
    response.
    """
    # The key/cx/q/num parameters and the "items" response field below belong
    # to the Custom Search API, so the request must go to that endpoint.
    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": api_key,
        "cx": cx,
        "q": query,
        "num": max(1, min(int(num_results), 10)),
    }

    # A timeout keeps an unresponsive server from hanging the notebook.
    response = requests.get(search_url, params=params, timeout=10)
    response.raise_for_status()
    results = response.json()

    return [
        {
            "title": item.get("title"),
            "link": item.get("link"),
            "snippet": item.get("snippet"),
        }
        for item in results.get("items", [])
    ]


### Second Stage Web Retrieval

In [94]:
from rank_bm25 import BM25Okapi

In [95]:
def bm25_rerank(query, documents, k1=30, k2=150, top_k=4):
    """
    Split documents into overlapping word chunks and return the chunks that
    score highest against ``query`` under BM25.

    Parameters:
    - query: Query string.
    - documents: Iterable of document strings.
    - k1: Chunk size in tokens; consecutive chunks start ``k1 // 2`` tokens
      apart (at least 1), so neighbours overlap by about half.
    - k2: Currently unused; kept so existing callers keep working.
    - top_k: Number of chunks to return.

    Returns up to ``top_k`` chunk strings ordered by descending score, or []
    when the documents produce no chunks.

    Raises ValueError when ``k1`` is smaller than 1.
    """
    if k1 < 1:
        raise ValueError("k1 must be a positive chunk size")
    stride = max(1, k1 // 2)  # k1 == 1 would otherwise give a zero range step

    chunks = []
    for doc in documents:
        words = nltk.word_tokenize(doc)
        for start in range(0, len(words), stride):
            chunks.append(" ".join(words[start:start + k1]))

    if not chunks:
        # BM25Okapi cannot be built from an empty corpus.
        return []

    tokenized_chunks = [nltk.word_tokenize(chunk.lower()) for chunk in chunks]
    bm25 = BM25Okapi(tokenized_chunks)
    scores = bm25.get_scores(nltk.word_tokenize(query.lower()))
    ranked = sorted(range(len(scores)), key=lambda idx: -scores[idx])
    return [chunks[idx] for idx in ranked[:top_k]]

In [115]:
# Google Custom Search credentials are read from the environment (the notebook
# calls load_dotenv() earlier) instead of being hard-coded in the cell.
# NOTE(review): the key that used to be written here was committed with the
# notebook and should be revoked/rotated. The variable names below are
# suggestions — align them with your .env file.
google_api_key = os.getenv("GOOGLE_API_KEY")
cx = os.getenv("GOOGLE_CSE_ID")
if not google_api_key or not cx:
    raise RuntimeError("Set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment or .env file.")

documents = []
for subq in subqs:
    # One search per sub-question; issuing the same query twice only burns quota.
    for result in google_custom_search(subq, google_api_key, cx):
        url = result["link"]
        text = extract_readable_text_combined(url)
        if text:
            documents.append(text)

print(f"# documents: {len(documents)}")
for i, doc in enumerate(documents):
    print(f"[Doc {i}] {len(doc.split())} words")


Subquestion: Did OpenAI actually release GPT-4 in March 2023?
[403] Forbidden (likely anti-bot): https://help.openai.com/en/articles/6825453-chatgpt-release-notes
[Error] https://help.openai.com/en/articles/6825453-chatgpt-release-notes -> Article `download()` failed with 403 Client Error: Forbidden for url: https://help.openai.com/en/articles/6825453-chatgpt-release-notes on URL https://help.openai.com/en/articles/6825453-chatgpt-release-notes
[403] Forbidden (likely anti-bot): https://openai.com/index/gpt-4-research/
[Error] https://openai.com/index/gpt-4-research/ -> Article `download()` failed with 403 Client Error: Forbidden for url: https://openai.com/index/gpt-4-research/ on URL https://openai.com/index/gpt-4-research/

Subquestion: Is GPT-4 a language model developed by OpenAI?
[403] Forbidden (likely anti-bot): https://platform.openai.com/docs/models
[Error] https://platform.openai.com/docs/models -> Article `download()` failed with 403 Client Error: Forbidden for url: https:

In [117]:
# Gather the best-scoring chunks for every sub-question into one flat list,
# keeping the sub-question order.
top_chunks = [
    chunk
    for subq in subqs
    for chunk in bm25_rerank(subq, documents)
]

In [118]:
top_chunks

['reading : How to Write ChatGPT Prompts : Your Guide Generative AI grows 2023 was a milestone year in terms of generative AI . Not only did OpenAI release GPT-4',
 'direct access to the internet for retrieving real-time information or updates . 5 . How Did GPT-4 Address the Challenges of GPT-3 ? GPT-4 : Introduced in 2023 , GPT-4',
 'Did GPT-4 Address the Challenges of GPT-3 ? GPT-4 : Introduced in 2023 , GPT-4 aimed to refine and enhance the capabilities of GPT-3 . It focused on improving accuracy',
 'a milestone year in terms of generative AI . Not only did OpenAI release GPT-4 , which again built on its predecessorâ\x80\x99s power , but Microsoft integrated ChatGPT into its',
 'ChatGPT , based on the GPT-4 architecture , which is a language model developed by OpenAI . These models learn patterns in human language and generate responses based on those',
 'language model developed by OpenAI , following GPT-4 , GPT-3.5 , and GPT-3 releases . OpenAI boasts that GPT-4o is 50 % cheaper t

In [125]:
evidence_context = "\n\n".join(top_chunks)

### Claim Summary

In [120]:
def summarize_claim_focused(claim: str, evidence: str):
    """
    Summarize the parts of ``evidence`` that are relevant to ``claim``.

    Sends a single user message to the notebook-level ``client`` (model
    "gpt-4", at most 300 tokens, temperature 0.7) asking for a 1-2 sentence,
    judgment-free summary, and returns the reply with surrounding whitespace
    removed.
    """
    prompt = f"""
Suppose you are assisting a fact-checker to fact-check the claim.

Claim: "{claim}"

Document:
\"\"\"
{evidence}
\"\"\"

Summarize the relevant information from the document in 1-2
sentences. Your response should provide a clear and concise
summary of the relevant information contained in the document.
Do not include a judgment about the claim and do not repeat any
information from the claim that is not supported by the
document.
Summarization:

"""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[user_message],
        max_tokens=300,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()


In [126]:
final_summary = summarize_claim_focused(claim, evidence_context)

In [129]:
final_summary

'The document confirms that OpenAI released GPT-4 in the year 2023. However, it does not specify the month of the release as March 2023.'

In [131]:
def gpt4_classify_veracity(claim, evidence_summary):
    """
    Classify ``claim`` against ``evidence_summary`` on a six-level truthfulness scale.

    Prompts the notebook-level ``client`` (model "gpt-4", temperature 0.3,
    at most 20 tokens) with the label guidelines and two worked examples.

    Returns one of "true", "mostly true", "half true", "barely true",
    "false" or "pants-on-fire" when the reply contains one of them (ignoring
    case, markdown decoration, punctuation and hyphen/space differences).
    Otherwise returns the reply lower-cased and stripped, unchanged.
    """
    prompt = f"""
You are a professional fact-checking assistant. Your task is to read a factual claim and a summary of supporting or opposing evidence, and classify the claim's truthfulness into one of the following 6 categories:

- true
- mostly true
- half true
- barely true
- false
- pants-on-fire

### Guidelines:
- "true": The evidence clearly confirms all factual aspects of the claim.
- "mostly true": The evidence confirms most aspects, with minor issues or missing context.
- "half true": The evidence is mixed, with significant confirmations and contradictions.
- "barely true": Only a small part of the claim is supported by evidence.
- "false": The claim is directly contradicted by the evidence.
- "pants-on-fire": The claim is not only false but wildly inaccurate or fabricated.

### Examples:

**Claim**: The U.S. has the highest number of gun deaths in the world.
**Evidence**: The U.S. ranks 32nd in gun deaths per capita globally. Countries like El Salvador, Venezuela, and Honduras have much higher rates. However, the U.S. does lead in total number of gun deaths due to population size.
**Classification**: half true

**Claim**: The moon landing was faked and filmed in a studio.
**Evidence**: There is overwhelming scientific, photographic, and third-party evidence confirming the Apollo 11 landing in 1969. The conspiracy theory lacks credible support.
**Classification**: pants-on-fire

---

Now evaluate this case:

**Claim**: {claim}

**Evidence**:
{evidence_summary}

**Classification**:"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=20
    )

    raw_label = response.choices[0].message.content.strip().lower()

    # Map decorated replies such as "**Mostly True.**" or "pants on fire" onto
    # the canonical labels. Multi-word labels are checked before plain "true",
    # which is a suffix of several of them.
    flattened = re.sub(r"[\s_-]+", " ", raw_label)
    for canonical in ("pants-on-fire", "mostly true", "half true", "barely true", "false", "true"):
        pattern = r"\b" + re.escape(canonical.replace("-", " ")) + r"\b"
        if re.search(pattern, flattened):
            return canonical
    return raw_label


In [133]:
label = gpt4_classify_veracity(claim, final_summary)

In [134]:
label

'mostly true'