# Libraries and models


In [1]:
!pip install nltk



In [2]:
import requests
from bs4 import BeautifulSoup
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [3]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Functions



*   save_story(method, title, story)
*   get_did_you_know()
*   convert_to_base(word)
*   lemmatize_sentence(sentence)
*   get_sentence_theme(sentence)
*   calculate_theme_scores(theme)
*   select_fact(facts_of_the_day)

In [4]:
def extract_content(string):
    pattern = r"'([^']*)'"
    matches = re.findall(pattern, string)
    return matches

In [5]:
def save_story(method, title, story):
    filename = f"{method}_{title}.txt"
    with open(filename, "w") as file:
        file.write(title + "\n")
        file.write(story)
    print(f"Story saved to {filename}")

In [6]:
def get_did_you_know():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')
    dyk_section = soup.find('div', id='mp-dyk')

    # Find all list elements
    list_elements = dyk_section.find_all('li')

    # Extract the text from each list element and store it in a list
    did_you_know = [li.text for li in list_elements]


    to_drop =['Archive','Start a new article','Nominate an article']

    for i in to_drop:
        did_you_know.remove(i)

    did_you_know = [sentence.replace('(pictured)', '') for sentence in did_you_know]

    prefix = "... that "
    did_you_know = [sentence[len(prefix):].rstrip('?').capitalize() if sentence.startswith(prefix) else sentence for sentence in did_you_know]

    return did_you_know

In [7]:
# get_did_you_know() function, scrapes the data from "Did you know" section of wikipedia, cleans it and returns the did you know facts as a list.
# Call the function and print the result
facts_of_the_day = get_did_you_know()
for num, fact in enumerate(facts_of_the_day, 1):
    print(f"{num}. {fact}")

1. 20,000 species of bees, the first feature film by estibaliz urresola solaguren , won numerous awards at film festivals
2. When discussing her album guts, olivia rodrigo said she "grew 10 years between the ages of 18 and 20"
3. Ameles decolor has one of the most complex mating rituals of any praying mantis
4. The duke of westminster preferred bourdon house to grosvenor house
5. Former kuwaiti foreign minister ahmad nasser al-mohammad al-sabah interrupted his studies in france to volunteer during the iraqi invasion of kuwait
6. Almost 40 years into their career, american indie rock band yo la tengo released their first self-produced album, this stupid world, in 2023
7. Ferrial sofyan was absent at the inauguration of his successor as deputy speaker of the jakarta regional people's representative council
8. Oklahoma tenderloin is baloney


In [8]:
def convert_to_base(word):
    lemmatizer = WordNetLemmatizer()
    base_word = lemmatizer.lemmatize(word)
    return base_word

def lemmatize_sentence(sentence):
    words = word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_sentence = ' '.join(lemmatized_words)
    return lemmatized_sentence

In [9]:
def get_sentence_theme(sentence):
    # Theme keywords/cues
    education_keywords = ["institute", "college", "published", "school", "learning", "students", "curriculum", "knowledge", "teaching", "classroom", "education", "academic", "study", "pedagogy", "policy", "technology", "online", "examination", "research", "scholarship"]
    sports_keywords = ["player", "sports", "match", "team", "game", "championship", "athlete", "competition", "sportsmanship", "training", "fitness", "industry", "psychology", "medicine", "analytics", "nutrition", "coach", "stadium", "sportsman"]
    technology_keywords = ["innovation", "tech", "technology", "digital", "device", "software", "internet", "computer", "tech", "electronics", "intelligence", "automation", "cybersecurity", "learning", "science", "cloud", "blockchain", "algorithm", "coding", "programming"]
    music_keywords = ["song", "pop", "rock", "hip", "hop", "rap", "r&b", "soul", "country", "jazz", "electronic", "classical", "reggae", "metal", "music", "artist", "album", "melody", "performance", "genre", "musician", "concert", "composition", "industry", "production", "theory", "therapy", "festivals", "streaming", "lyrics", "soundtrack", "instrument", "harmony"]
    history_keywords = ["event", "history", "era", "historical", "timeline", "ancient", "civilization", "war", "monarchy", "revolution", "archaeology", "historian", "figures", "documents", "research", "oral", "heritage", "culture", "anthropology", "museum"]
    politics_keywords = ["government", "leader", "politics", "political", "strike", "diplomatic", "diplomacy", "ambassador", "governor", "policy", "election", "democracy", "legislation", "leader", "party", "administration", "governance", "activism", "campaign", "citizen", "voting"]
    science_keywords = ["research", "science", "experiment", "discovery", "theory", "scientific", "laboratory", "scientist", "innovation", "technology", "method", "physics", "chemistry", "biology", "astronomy", "neuroscience", "mathematics", "data", "analysis", "hypothesis"]
    nature_keywords = ["environment", "nature", "wildlife", "ecosystem", "natural", "conservation", "biodiversity", "ecology", "climate", "reserve", "sustainability", "services", "ethics", "activism", "tourism", "ecosystems", "habitat", "species", "ocean", "forest"]
    food_keywords = ["cuisine", "food", "recipe", "cooking", "ingredient", "taste", "culinary", "gastronomy", "culture", "nutrition", "industry", "trends", "blogging", "sustainability", "science", "arts", "restaurant", "chef", "flavors", "beverages"]
    travel_keywords = ["destination", "travel", "explore", "adventure", "journey", "tourism", "vacation", "wanderlust", "agency", "sightseeing", "experience", "backpacking", "ecotourism", "sustainable", "photography", "immersion", "passport", "culture", "explorer", "hospitality"]
    art_keywords = ["painting", "art", "sculpture", "masterpiece", "creativity", "exhibition", "artist", "artistic", "visual", "gallery", "movement", "history", "critique", "therapy", "contemporary", "public", "design", "expression", "aesthetics", "canvas"]
    literature_keywords = ["novel", "literature", "poetry", "author", "literary", "book", "storytelling", "fiction", "classics", "reading", "analysis", "theory", "publishing", "creative", "criticism", "poetic", "prose", "literature", "library", "narrative"]
    health_keywords = ["wellness", "health", "fitness", "medical", "wellbeing", "nutrition", "healthcare", "lifestyle", "mental", "education", "public", "holistic", "alternative", "preventive", "technology", "telemedicine", "therapy", "medication", "wellness", "disease"]
    business_keywords = ["company", "ceo", "business", "entrepreneurship", "finance", "economy", "market", "investment", "startup", "strategy", "entrepreneur", "development", "ethics", "financial", "analytics", "supply", "e-commerce", "management", "leadership", "organization", "entrepreneurial"]
    celebrities_keywords = ["actor", "actress", "fame", "celebrity", "entertainment", "star", "carpet", "paparazzi", "culture", "showbiz", "gossip", "endorsements", "news", "interviews", "industry", "stardom", "redcarpet", "fashion", "film"]
    film_keywords = ["movie", "film", "director", "cinema", "screenplay", "cinematography", "blockbuster", "industry", "festival", "production", "analysis", "theory", "genres", "documentary", "effects", "actor", "script", "cinematic", "filmography"]
    environment_keywords = ["sustainability", "environment", "climate", "conservation", "ecosystem", "green", "renewable", "activism", "footprint", "policy", "awareness", "energy", "management", "education", "adaptation", "restoration", "pollution", "global", "warming", "ecological"]
    fashion_keywords = ["style", "trend", "fashion", "designer", "runway", "clothing", "attire", "industry", "brand", "show", "photography", "blogging", "marketing", "sustainable", "couture", "retail", "fashion", "accessories", "textile", "design"]
    culture_keywords = ["tradition", "culture", "heritage", "customs", "society", "cultural", "diversity", "identity", "exchange", "studies", "anthropology", "multiculturalism", "cultural", "festivals", "assimilation", "language", "cultural", "expression", "rituals"]
    crime_keywords = ["investigation", "appeals", "conviction", "convict", "court", "crime", "arrested", "criminal", "law", "justice", "detective", "offender", "forensics", "scene", "system", "prevention", "profiling", "cybercrime", "organized", "white-collar", "juvenile", "punishment", "victim", "crime", "police"]
    space_keywords = ["universe", "space", "astronomy", "exploration", "celestial", "planet", "astronaut", "travel", "technology", "cosmology", "missions", "astrophysics", "probes", "colonization", "rocketry", "observatories", "galaxy", "spacecraft", "space", "astronomical"]
    architecture_keywords = ["building", "architecture", "design", "structure", "urban", "construction", "landmark", "architect", "style", "planning", "history", "sustainable", "theory", "landscape", "preservation", "drawings", "skyscraper", "architecture", "interior", "facade"]
    religion_keywords = ["faith", "religion", "hajj", "pilgrim", "pilgrims", "pope", "imaam", "christianity", "islam", "belief", "spirituality", "worship", "divine", "theology", "practices", "texts", "interfaith", "awakening", "rituals", "leaders", "communities", "religion", "sacred", "ritual"]
    philosophy_keywords = ["ethics", "philosophy", "morality", "reasoning", "metaphysics", "existential", "thought", "philosopher", "theories", "epistemology", "logic", "debates", "mind", "moral", "political", "science", "philosophy", "knowledge", "wisdom", "rationality"]
    social_issues_keywords = ["inequality", "social", "poverty", "discrimination", "activism", "justice", "rights", "human", "development", "welfare", "gender", "racial", "movements", "environmental", "empowerment", "equality", "social", "societal", "inequity", "change"]
    economy_keywords = ["finance", "economy", "market", "trade", "economic", "growth", "recession", "macroeconomics", "microeconomics", "policies", "indicators", "inequality", "development", "trade", "cycles", "fiscal", "entrepreneurship", "commerce", "business", "consumption"]
    medical_keywords = ["health", "medical", "illness", "treatment", "doctor", "patient", "medicine", "diagnosis", "surgery", "vaccination", "healthcare", "pharmaceuticals", "research", "technology", "public", "ethics", "imaging", "insurance", "pandemic", "therapy"]
    conflict_keywords = ["conflict", "war", "wars", "battle", "military", "armies", "soldiers", "warfare", "strategy", "tactics", "weapons", "casualties", "violence", "siege", "peacekeeping", "defense", "offensive", "civil", "crimes", "nuclear", "guerrilla", "veterans", "effort", "zone", "memorial", "correspondent", "photography"]


    # Theme scores initialization
    education_score = 0
    sports_score = 0
    technology_score = 0
    music_score = 0
    history_score = 0
    politics_score = 0
    science_score = 0
    nature_score = 0
    food_score = 0
    travel_score = 0
    art_score = 0
    literature_score = 0
    health_score = 0
    business_score = 0
    celebrities_score = 0
    film_score = 0
    environment_score = 0
    fashion_score = 0
    culture_score = 0
    crime_score = 0
    space_score = 0
    architecture_score = 0
    religion_score = 0
    philosophy_score = 0
    social_issues_score = 0
    economy_score = 0
    unknown_score = 0
    medical_score = 0
    conflict_score = 0

    # Calculate theme scores based on keyword matches
    for word in sentence.lower().split():
        if word in education_keywords:
            education_score += 1
        if word in sports_keywords:
            sports_score += 1
        if word in technology_keywords:
            technology_score += 1
        if word in music_keywords:
            music_score += 1
        if word in history_keywords:
            history_score += 1
        if word in politics_keywords:
            politics_score += 1
        if word in science_keywords:
            science_score += 1
        if word in nature_keywords:
            nature_score += 1
        if word in food_keywords:
            food_score += 1
        if word in travel_keywords:
            travel_score += 1
        if word in art_keywords:
            art_score += 1
        if word in literature_keywords:
            literature_score += 1
        if word in health_keywords:
            health_score += 1
        if word in business_keywords:
            business_score += 1
        if word in celebrities_keywords:
            celebrities_score += 1
        if word in film_keywords:
            film_score += 1
        if word in environment_keywords:
            environment_score += 1
        if word in fashion_keywords:
            fashion_score += 1
        if word in culture_keywords:
            culture_score += 1
        if word in crime_keywords:
            crime_score += 1
        if word in space_keywords:
            space_score += 1
        if word in architecture_keywords:
            architecture_score += 1
        if word in religion_keywords:
            religion_score += 1
        if word in philosophy_keywords:
            philosophy_score += 1
        if word in social_issues_keywords:
            social_issues_score += 1
        if word in economy_keywords:
            economy_score += 1
        if word in medical_keywords:
            medical_score +=1
        if word in conflict_keywords:
            conflict_score +=1

    # Create a dictionary of theme scores
    theme_scores = {
        "unknown":unknown_score,
        "education": education_score,
        "sports": sports_score,
        "technology": technology_score,
        "music": music_score,
        "history": history_score,
        "politics": politics_score,
        "science": science_score,
        "nature": nature_score,
        "food": food_score,
        "travel": travel_score,
        "art": art_score,
        "literature": literature_score,
        "health": health_score,
        "business": business_score,
        "celebrities": celebrities_score,
        "film": film_score,
        "environment": environment_score,
        "fashion": fashion_score,
        "culture": culture_score,
        "crime": crime_score,
        "space": space_score,
        "architecture": architecture_score,
        "religion": religion_score,
        "philosophy": philosophy_score,
        "social_issues": social_issues_score,
        "economy": economy_score,
        "medical": medical_score,
        "conflict": conflict_score,
    }

    max_key = max(theme_scores, key=theme_scores.get)
    return max_key

In [10]:
def calculate_theme_scores(theme):
    theme_score_mapping = {
    "unknown":2,
    "education": 7,
    "sports": 6.1,
    "technology": 8,
    "music": 7.2,
    "history": 6.3,
    "politics": 5.4,
    "science": 7.5,
    "nature": 6.6,
    "food": 6.7,
    "travel": 6.8,
    "art": 7.9,
    "literature": 7.3,
    "health": 6.4,
    "business": 7.6,
    "celebrities": 6.5,
    "film": 7.7,
    "environment": 6.9,
    "fashion": 6.2,
    "culture": 7.8,
    "crime": 5.1,
    "space": 8.1,
    "architecture": 7.4,
    "religion": 6,
    "philosophy": 7.1,
    "social_issues": 6.5,
    "economy": 7.7,
    "medical": 8.2,
    "conflict": 7.3
    }

    return theme_score_mapping[theme]

In [11]:
def select_fact(facts_of_the_day):
  scores = []
  themes = []
  punctuation_marks = string.punctuation

  for fact in facts_of_the_day:
    fact_sentence = fact
    fact_sentence = "".join(char for char in fact_sentence if char not in punctuation_marks)
    fact_theme = get_sentence_theme(fact_sentence)

    # If we get an unknown theme, we lemmatize the sentence and check it again incase the base word matches the keywords.
    if fact_theme  == 'unknown':
      fact_sentence = lemmatize_sentence(fact_sentence)
      fact_theme = get_sentence_theme(fact_sentence)

    themes.append(fact_theme)
    scores.append(calculate_theme_scores(fact_theme))

  max_score = max(scores)
  max_index = scores.index(max_score)

  #return the index of the highest score fact
  return max_index, scores, themes

# Theme Conclusion



In [12]:
fun_index , fun_scores, fun_theme = select_fact(facts_of_the_day)
print("Index: ",fun_index)
punctuation_marks = string.punctuation
print("Predicted Theme: ",get_sentence_theme("".join(char for char in facts_of_the_day[fun_index] if char not in punctuation_marks)))

Index:  2
Predicted Theme:  culture


In [13]:
for i in range(0,len(fun_theme)):
  print(fun_theme[i] , " : ",fun_scores[i])

celebrities  :  6.5
music  :  7.2
culture  :  7.8
unknown  :  2
culture  :  7.8
music  :  7.2
unknown  :  2
unknown  :  2


In [14]:
did_you_know_facts = get_did_you_know()
main_index, main_scores, main_themes = select_fact(did_you_know_facts)

print("Fact:\n")
print(did_you_know_facts[main_index])
print("\nGuessed Theme: ",main_themes[main_index])
print("Assigned Score: ",main_scores[main_index])

Fact:

Ameles decolor has one of the most complex mating rituals of any praying mantis

Guessed Theme:  culture
Assigned Score:  7.8
