In [12]:
from collections import Counter
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Setup
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

LLMprompt = "Determine whether these news articles are real or fake based off of the title and text. Do not look any information up, decide purely based on the content of the articles. Analyze the article structure, tone, word choice, professionalism, topic, and any other features that would differ between fake and real news. Justify your answer, explaining which features informed your decision."

def analyze_article(article):
    """
    Takes a dict with keys: 'title', 'text', 'subject', 'date'
    Outputs article stats and a complete LLM prompt.
    """
    text = article['text']
    doc = nlp(text)

    # Get stats
    word_count = len(text.split())
    char_count = len(text)
    entities = [ent.label_ for ent in doc.ents]
    entity_counts = Counter(entities)

    # Get frequent words
    tokens = word_tokenize(text.lower())
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    word_freq = Counter(filtered).most_common(10)

    # Print stats
    print("📄 Article Stats")
    print(f"Title: {article['title']}")
    print(f"Date: {article['date']} | Subject: {article['subject']}")
    print(f"Word Count: {word_count}")
    print(f"Character Count: {char_count}")

    print("\nNamed Entities:")
    for k, v in entity_counts.items():
        print(f"  {k}: {v}")

    print("\nTop 10 Most Frequent Words:")
    for word, freq in word_freq:
        print(f"  {word}: {freq}")


    print()
    # Create prompt to feed to LLM
    llm_prompt = f"""{LLMprompt}

Title: {article['title']}
Text: {article['text']}
"""


    print(llm_prompt)

    # ==================== API call to ChatGPT to generate response (requires valid API key) ====================

    # import openai

    # openai.api_key = "ENTER_API_KEY"

    # response = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "user", "content": llm_prompt}
    #     ]
    # )

    # print("\n📝 LLM Response:\n")
    # print(response['choices'][0]['message']['content'])


example_article = { "title": "Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)",
                   "text": "On Christmas day, Donald Trump announced that he would  be back to work  the following day, but he is golfing for the fourth day in a row. The former reality show star blasted former President Barack Obama for playing golf and now Trump is on track to outpace the number of golf games his predecessor played.Updated my tracker of Trump s appearances at Trump properties.71 rounds of golf including today s. At this pace, he ll pass Obama s first-term total by July 24 next year. https://t.co/Fg7VacxRtJ pic.twitter.com/5gEMcjQTbH  Philip Bump (@pbump) December 29, 2017 That makes what a Washington Post reporter discovered on Trump s website really weird, but everything about this administration is bizarre AF. The coding contained a reference to Obama and golf:  Unlike Obama, we are working to fix the problem   and not on the golf course.  However, the coding wasn t done correctly.The website of Donald Trump, who has spent several days in a row at the golf course, is coded to serve up the following message in the event of an internal server error: https://t.co/zrWpyMXRcz pic.twitter.com/wiQSQNNzw0  Christopher Ingraham (@_cingraham) December 28, 2017That snippet of code appears to be on all https://t.co/dkhw0AlHB4 pages, which the footer says is paid for by the RNC? pic.twitter.com/oaZDT126B3  Christopher Ingraham (@_cingraham) December 28, 2017It s also all over https://t.co/ayBlGmk65Z. As others have noted in this thread, this is weird code and it s not clear it would ever actually display, but who knows.  Christopher Ingraham (@_cingraham) December 28, 2017After the coding was called out, the reference to Obama was deleted.UPDATE: The golf error message has been removed from the Trump and GOP websites. They also fixed the javascript  =  vs  ==  problem. Still not clear when these messages would actually display, since the actual 404 (and presumably 500) page displays a different message pic.twitter.com/Z7dmyQ5smy  Christopher Ingraham (@_cingraham) December 29, 2017That suggests someone at either RNC or the Trump admin is sensitive enough to Trump s golf problem to make this issue go away quickly once people noticed. You have no idea how much I d love to see the email exchange that led us here.  Christopher Ingraham (@_cingraham) December 29, 2017 The code was f-cked up.The best part about this is that they are using the  =  (assignment) operator which means that bit of code will never get run. If you look a few lines up  errorCode  will always be  404          (@tw1trsux) December 28, 2017trump s coders can t code. Nobody is surprised.  Tim Peterson (@timrpeterson) December 28, 2017Donald Trump is obsessed with Obama that his name was even in the coding of his website while he played golf again.Photo by Joe Raedle/Getty Images.",
                   "subject": "News",
                   "date": "December 29, 2017"}

analyze_article(example_article)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


📄 Article Stats
Title: Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)
Date: December 29, 2017 | Subject: News
Word Count: 444
Character Count: 2774

Named Entities:
  DATE: 14
  PERSON: 16
  ORG: 17
  ORDINAL: 1
  CARDINAL: 5
  GPE: 5

Top 10 Most Frequent Words:
  trump: 10
  golf: 9
  december: 8
  obama: 6
  christopher: 5
  ingraham: 5
  code: 5
  https: 4
  coding: 4
  day: 3

Determine whether these news articles are real or fake based off of the title and text. Do not look any information up, decide purely based on the content of the articles. Analyze the article structure, tone, word choice, professionalism, topic, and any other features that would differ between fake and real news. Justify your answer, explaining which features informed your decision.

Title: Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)
Text: On Christmas day, Donald Trump announced that he would  be back to work  the following day, but he is golf