<a href="https://colab.research.google.com/github/Kepners/ChopOnions/blob/main/Trndzo1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# Block 1: Install Required Libraries
!pip install openai==0.27.8
!pip install python-dotenv
!pip install praw
!pip install requests
!pip install cachetools
!pip install nltk
!pip install tqdm
!pip install numpy
!pip install scikit-learn
!pip install feedparser
!pip install google-search-results  # SerpAPI client
!pip install pytrends  # Alternative to SerpAPI for Google Trends




In [36]:
# Block 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# Block 3: Load Environment Variables
import os
from dotenv import load_dotenv
import logging

# Define the path to your .env file in Google Drive
dotenv_path = '/content/drive/MyDrive/Secrets/.env'  # Updated with capital 'S'

# Load the environment variables from the .env file
load_dotenv(dotenv_path)

# Retrieve API keys from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PEXELS_API_KEY = os.getenv('PEXELS_API_KEY')
REDDIT_CLIENT_ID = os.getenv('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.getenv('REDDIT_CLIENT_SECRET')
SHUTTERSTOCK_ACCESS_TOKEN = os.getenv('SHUTTERSTOCK_ACCESS_TOKEN')

# Validate that all required environment variables are set
required_vars = [
    'OPENAI_API_KEY',
    'PEXELS_API_KEY',
    'REDDIT_CLIENT_ID',
    'REDDIT_CLIENT_SECRET',
    'SHUTTERSTOCK_ACCESS_TOKEN'
]
missing_vars = [var for var in required_vars if not os.getenv(var)]

if missing_vars:
    logging.error(f"Missing environment variables: {', '.join(missing_vars)}")
    raise SystemExit("Please ensure all API keys are set in the .env file.")
else:
    logging.info("All environment variables loaded successfully.")

# Block 4: Verify Environment Variables
for var in required_vars:
    if os.getenv(var):
        print(f"{var}: Loaded")
    else:
        print(f"{var}: Not Loaded")


OPENAI_API_KEY: Loaded
PEXELS_API_KEY: Loaded
REDDIT_CLIENT_ID: Loaded
REDDIT_CLIENT_SECRET: Loaded
SHUTTERSTOCK_ACCESS_TOKEN: Loaded


In [38]:
# Block 5: Download NLTK Data (Suppressing Output)
import nltk
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout

with suppress_stdout():
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [39]:
# Block 6A: Create Utils Directory
import os

utils_dir = 'utils'
if not os.path.exists(utils_dir):
    os.makedirs(utils_dir)


In [40]:
# Block 6B: Create data_processing.py
data_processing_code = """
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def extract_keywords(text, max_keywords=10):
    \"""
    Extracts up to `max_keywords` nouns from the input `text`.
    \"""
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]
    # Get part-of-speech tags
    tagged_words = pos_tag(filtered_words)
    # Keep nouns and proper nouns
    keywords = [word for word, pos in tagged_words if pos.startswith('NN')]
    # Limit the number of keywords
    return keywords[:max_keywords]
"""

with open(os.path.join('utils', 'data_processing.py'), 'w') as file:
    file.write(data_processing_code)


In [41]:
# Block 7: Initialize API Clients and Configure Logging
import praw
import openai
import logging
import warnings
from utils.data_processing import extract_keywords
from cachetools import TTLCache, cached
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from pytrends.request import TrendReq
import feedparser
import requests
import time
import os

# Configure Logging to write to a log file and suppress console output
log_file = 'app.log'
logging.basicConfig(
    level=logging.ERROR,  # Only log ERROR and CRITICAL
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.NullHandler()  # Suppress console output
    ]
)

# Suppress PRAW warnings about asynchronous environments
warnings.filterwarnings("ignore", category=UserWarning, module='praw')
logging.getLogger('praw').setLevel(logging.CRITICAL)

# Define 'add_separator' to do nothing
def add_separator():
    pass

# Define a list of 20 English, French, Spanish, Italian speaking countries
top_countries = {
    1: {'name': 'India', 'code': 'IN'},
    2: {'name': 'United States', 'code': 'US'},
    3: {'name': 'Nigeria', 'code': 'NG'},
    4: {'name': 'Mexico', 'code': 'MX'},
    5: {'name': 'Philippines', 'code': 'PH'},
    6: {'name': 'United Kingdom', 'code': 'GB'},
    7: {'name': 'France', 'code': 'FR'},
    8: {'name': 'Italy', 'code': 'IT'},
    9: {'name': 'South Africa', 'code': 'ZA'},
    10: {'name': 'Spain', 'code': 'ES'},
    11: {'name': 'Canada', 'code': 'CA'},
    12: {'name': 'Australia', 'code': 'AU'},
    13: {'name': 'Germany', 'code': 'DE'},
    14: {'name': 'Brazil', 'code': 'BR'},
    15: {'name': 'Argentina', 'code': 'AR'},
    16: {'name': 'Sweden', 'code': 'SE'},
    17: {'name': 'Netherlands', 'code': 'NL'},
    18: {'name': 'Belgium', 'code': 'BE'},
    19: {'name': 'Switzerland', 'code': 'CH'},
    20: {'name': 'Austria', 'code': 'AT'}
}

# Mapping from country codes to PyTrends 'pn' (payload name)
country_code_to_pn = {
    'IN': 'india',
    'US': 'united_states',
    'NG': 'nigeria',
    'MX': 'mexico',
    'PH': 'philippines',
    'GB': 'united_kingdom',
    'FR': 'france',
    'IT': 'italy',
    'ZA': 'south_africa',
    'ES': 'spain',
    'CA': 'canada',
    'AU': 'australia',
    'DE': 'germany',
    'BR': 'brazil',
    'AR': 'argentina',
    'SE': 'sweden',
    'NL': 'netherlands',
    'BE': 'belgium',
    'CH': 'switzerland',
    'AT': 'austria'
}

# Mapping from country codes to Google News RSS feed URLs
country_code_to_rss = {
    'IN': 'https://news.google.com/rss?hl=en-IN&gl=IN&ceid=IN:en',
    'US': 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en',
    'NG': 'https://news.google.com/rss?hl=en-NG&gl=NG&ceid=NG:en',
    'MX': 'https://news.google.com/rss?hl=en-MX&gl=MX&ceid=MX:en',
    'PH': 'https://news.google.com/rss?hl=en-PH&gl=PH&ceid=PH:en',
    'GB': 'https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en',
    'FR': 'https://news.google.com/rss?hl=en-FR&gl=FR&ceid=FR:en',
    'IT': 'https://news.google.com/rss?hl=en-IT&gl=IT&ceid=IT:en',
    'ZA': 'https://news.google.com/rss?hl=en-ZA&gl=ZA&ceid=ZA:en',
    'ES': 'https://news.google.com/rss?hl=en-ES&gl=ES&ceid=ES:en',
    'CA': 'https://news.google.com/rss?hl=en-CA&gl=CA&ceid=CA:en',
    'AU': 'https://news.google.com/rss?hl=en-AU&gl=AU&ceid=AU:en',
    'DE': 'https://news.google.com/rss?hl=en-DE&gl=DE&ceid=DE:en',
    'BR': 'https://news.google.com/rss?hl=en-BR&gl=BR&ceid=BR:en',
    'AR': 'https://news.google.com/rss?hl=en-AR&gl=AR&ceid=AR:en',
    'SE': 'https://news.google.com/rss?hl=en-SE&gl=SE&ceid=SE:en',
    'NL': 'https://news.google.com/rss?hl=en-NL&gl=NL&ceid=NL:en',
    'BE': 'https://news.google.com/rss?hl=en-BE&gl=BE&ceid=BE:en',
    'CH': 'https://news.google.com/rss?hl=en-CH&gl=CH&ceid=CH:en',
    'AT': 'https://news.google.com/rss?hl=en-AT&gl=AT&ceid=AT:en'
}

# Initialize Reddit API using PRAW
try:
    reddit = praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT', 'script:TrendingTopicsScript:1.0 (by u/yourusername)')
    )
except Exception as e:
    logging.error(f"Error initializing Reddit API: {e}")

# Initialize OpenAI API
openai.api_key = os.getenv('OPENAI_API_KEY')

# Suppress PRAW specific logs
logging.getLogger('praw').setLevel(logging.CRITICAL)


In [42]:
# Block 8: Define the Main Function
def main():
    # Step 1: User selects the country for Google Trends
    print("Select a country for Google Trends data:")
    for idx, country in top_countries.items():
        print(f"{idx}. {country['name']}")

    # Get user selection
    try:
        country_selection = int(input("Enter the number of the country you're interested in: "))
        if country_selection in top_countries:
            selected_country = top_countries[country_selection]
            print(f"You selected: {selected_country['name']}")
        else:
            logging.error("Invalid selection. Exiting.")
            return
    except ValueError:
        logging.error("Invalid input. Please enter a number. Exiting.")
        return

    # Step 2: Fetch trending topics for the selected country using PyTrends
    google_trends_topics = fetch_trending_topics_pytrends(selected_country)

    # If PyTrends fails or returns insufficient data, try RSS feeds
    if not google_trends_topics:
        logging.info("No trending topics found using PyTrends. Attempting to fetch from RSS feeds.")
        rss_feed_url = country_code_to_rss.get(selected_country['code'], 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en')
        google_trends_topics = fetch_trending_topics_from_rss(rss_feed_url)

    if not google_trends_topics:
        logging.error("No trending topics found using all available methods.")
        return

    # Generate descriptions for trending topics if missing
    for topic in google_trends_topics:
        if not topic['description'] or topic['description'].lower() == "no description available.":
            topic['description'] = generate_summary(topic['title'])

    # Display the trending topics with descriptions to the user
    print("\nCurrent Trending Topics in {}:".format(selected_country['name']))
    for idx, topic in enumerate(google_trends_topics, start=1):
        title = topic['title']
        description = topic['description'] if topic['description'] else "No description available."
        print(f"{idx}. {title} - {description}")

    # Allow the user to select a topic
    try:
        selected_idx = int(input("\nEnter the number of the topic you're interested in: "))
        if 1 <= selected_idx <= len(google_trends_topics):
            selected_topic = google_trends_topics[selected_idx - 1]['title']
            print(f"\nYou selected: {selected_topic}")
        else:
            logging.error("Invalid selection. Exiting.")
            return
    except ValueError:
        logging.error("Invalid input. Please enter a number. Exiting.")
        return

    # Step 3: Search for subreddits related to the topic
    related_subreddits = search_subreddits_for_topic(selected_topic, limit=5)
    if not related_subreddits:
        logging.error("No related subreddits found. Using RSS feed to fetch related stories.")
        # Fetch related stories from RSS feed
        related_stories = fetch_related_stories_from_rss(selected_country['code'], selected_topic)
        if not related_stories:
            logging.error("No related stories found from RSS feeds. Exiting.")
            return
        # Use related stories to generate scripts
        for story in related_stories:
            title = story['title']
            summary = story['summary']
            print(f"\nSummary: {summary}")
            script = generate_script_for_topic(title)
            print(f"Generated script for '{title}':\n{script}\n====\n")
        print("\nScript execution completed.")
        return

    # Display related subreddits to the user
    print("\nRelated Subreddits:")
    for idx, sub in enumerate(related_subreddits, start=1):
        print(f"{idx}. {sub['name']} - {sub['title']}")

    # Fetch and display top two posts from each subreddit
    print("\nTop 2 Posts from Each Subreddit:")
    all_posts = []
    for sub in related_subreddits:
        subreddit_name = sub['name']
        try:
            subreddit = reddit.subreddit(subreddit_name)
            top_posts = list(subreddit.hot(limit=2))
            print(f"\nSubreddit: {subreddit_name}")
            for post in top_posts:
                print(f"- {post.title} (Score: {post.score})")
                all_posts.append({'subreddit': subreddit_name, 'title': post.title, 'score': post.score, 'url': post.url})
        except Exception as e:
            logging.error(f"Error fetching posts from subreddit '{subreddit_name}': {e}")
            print(f"- Unable to fetch posts from subreddit '{subreddit_name}'.")

    if not all_posts:
        logging.error("No posts fetched from the related subreddits. Exiting.")
        return

    # Allow user to select specific posts to generate scripts for
    print("\nAll Fetched Posts:")
    for idx, post in enumerate(all_posts, start=1):
        print(f"{idx}. [{post['subreddit']}] {post['title']} (Score: {post['score']})")

    selected_posts_input = input("\nEnter the numbers of the posts you want to generate scripts for, separated by commas (e.g., 1,3,5): ")
    try:
        selected_indices = [int(i.strip()) - 1 for i in selected_posts_input.split(',') if i.strip().isdigit()]
        selected_posts = [all_posts[i] for i in selected_indices if 0 <= i < len(all_posts)]
        if not selected_posts:
            logging.error("No valid posts selected. Exiting.")
            return
    except ValueError:
        logging.error("Invalid input. Please enter numbers separated by commas. Exiting.")
        return

    # Fetch content and generate summaries for selected posts
    print(f"\nFetching content and generating summaries for selected posts...")
    for post in selected_posts:
        title = post['title']
        url = post['url']
        try:
            reddit_post = reddit.submission(url=url)
            if reddit_post.is_self:
                content = reddit_post.selftext
            else:
                # If the post is not a self post, attempt to fetch content via other means or skip
                content = reddit_post.title  # Fallback to title
            summary = generate_summary(content)
            post['summary'] = summary
        except Exception as e:
            logging.error(f"Error fetching content for post '{title}': {e}")
            post['summary'] = "No summary available."

    # Decide Whether to Check Stock Media Availability
    check_media = input("\nDo you want to check for stock media availability for the selected posts? (yes/no): ").strip().lower()
    if check_media not in ['yes', 'y']:
        print("\nSkipping stock media availability check.")
        # Proceed to generate scripts without checking
        for post in selected_posts:
            title = post['title']
            summary = post.get('summary', "No summary available.")
            print(f"\nSummary: {summary}")
            script = generate_script_for_topic(title)
            print(f"Generated script for '{title}':\n{script}\n====\n")
        print("\nScript execution completed.")
        return

    # Continue with stock media availability check
    print("\nChecking stock media availability for the selected posts...")
    request_count = 0
    scripts_generated = 0
    for post in selected_posts:
        title = post['title']
        summary = post.get('summary', "No summary available.")
        if request_count >= 190:
            logging.error("Approaching API rate limits. Waiting for 60 minutes before continuing...")
            time.sleep(3600)  # Wait for an hour
            request_count = 0
        total_media = check_stock_media_availability(title)
        request_count += 1
        if total_media >= 10:
            logging.info(f"Topic '{title}' has {total_media} stock media items available.")
            # Display summary before generating script
            print(f"\nSummary: {summary}")
            # Generate script for this topic
            script = generate_script_for_topic(title)
            print(f"Generated script for '{title}':\n{script}\n====\n")
            scripts_generated += 1
        else:
            logging.info(f"Not enough stock media for topic: {title}")
            print(f"\nSummary: {summary}")
            print(f"Not enough stock media for topic: {title}\n====\n")
    if scripts_generated == 0:
        logging.error("No topics with sufficient stock media were found.")
    else:
        logging.info(f"Generated {scripts_generated} script(s) based on available stock media.")
    print("\nScript execution completed.")


In [43]:
# Block 9: Run the Main Function
if __name__ == "__main__":
    main()


Select a country for Google Trends data:
1. India
2. United States
3. Nigeria
4. Mexico
5. Philippines
6. United Kingdom
7. France
8. Italy
9. South Africa
10. Spain
11. Canada
12. Australia
13. Germany
14. Brazil
15. Argentina
16. Sweden
17. Netherlands
18. Belgium
19. Switzerland
20. Austria
Enter the number of the country you're interested in: 14
You selected: Brazil


ERROR:root:Error fetching Google Trends data via PyTrends: 'https://news.google.com/rss?hl=en-BR&gl=BR&ceid=BR:en'



Current Trending Topics in Brazil:
1. US election: 3 days left – What polls say, what Harris and Trump are up to - Al Jazeera English - <ol><li><a href="https://news.google.com/rss/articles/CBMisAFBVV95cUxPWFNoYzFDdFNTUDBTdHZ3YzE3ZFRKcTR0LXUza0FEdDFJVzlxLWNWZlptX3VxSm1QY1M1elB0TUdmTDBaYVh4VUczckZDZDFySFFjNnBHUnZRVmNzZWNMZ3JQa0JqN0RKNVMySDFLcnpsZ3dxbWtJRlpxQWdoSmVacTM5U3daTGV3T2MxSzNpNkZTbk5QQ0N1ZUVzcTNieWxDQ3FnWmMwczZ2VTlRVU1GVtIBtgFBVV95cUxNcG00VU1OZ0w2b2dxZ2lFQ1dEYllZYUZiaHFtd2RGVmU1dHNBcVA1bnhlUGctbzFING5Nc0o2a3NGQzZDa3JYRml4djFCSm9XQ2VOSGRpYVRuTXM1UVEtUEN4MUhDeUh4T2ZIdG9SMHhxVmMybnBLUEpuSGlRb21vbnVSTm4tVnktTW9LTG1xcUUtMFpDSng2YnNDX2tPZU1rdlh3d2sybzEwdXh1Q20xNkFzYXlwQQ?oc=5" target="_blank">US election: 3 days left – What polls say, what Harris and Trump are up to</a>&nbsp;&nbsp;<font color="#6f6f6f">Al Jazeera English</font></li><li><a href="https://news.google.com/rss/articles/CBMihwFBVV95cUxOdVpRakJLS0VFNUlxUENxR3pGNkwtcUZsY3haYmtFTEVkbUJ2SHR0WXZNRm13X24tVTVsdnpZZVVWTW5pOFQtOGdz

KeyboardInterrupt: Interrupted by user