In [None]:
!pip install newspaper3k

In [None]:
!pip install 'lxml[html_clean]'

In [None]:
import nltk

# Fetch the NLTK data packages used later in the notebook (article.nlp()
# is called on parsed articles), in the same order as the original cell.
for resource_name in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'punkt_tab',
):
    nltk.download(resource_name)


# Noticias

In [None]:
import pandas as pd
from collections import defaultdict
import newspaper
from newspaper import Article, Config
from datetime import datetime
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import re
import json
import logging
from pathlib import Path

class ThemeProcessor:
    """Parse theme/keyword definitions from a headerless CSV file."""

    @staticmethod
    def process_keywords_from_csv(csv_path):
        """Build a ``{theme: [keywords]}`` mapping from a CSV file.

        Every column is scanned top to bottom. A cell of the form
        ``=== Theme name ===`` opens a new theme; each following cell that
        starts with ``-`` is recorded as a keyword of that theme. Any other
        cell is ignored.

        Args:
            csv_path: Path to the CSV file; it is read without a header row.

        Returns:
            dict: Theme name -> list of unique keywords, in the order in
            which they first appear in the file.
        """
        df = pd.read_csv(csv_path, header=None)  # Assumes no header row; adjust if there is one
        themes_dict = defaultdict(list)

        # Each column is processed independently, so a theme never spills
        # over from one column into the next.
        for col in df.columns:
            current_theme = None
            for val in df[col].dropna():
                line = str(val).strip()

                # '=== name ===' marks the start of a new theme
                if line.startswith('===') and line.endswith('==='):
                    current_theme = line.strip('=').strip()
                    # Register the theme even if it ends up with no keywords
                    themes_dict.setdefault(current_theme, [])

                # '- keyword' belongs to the theme currently open
                elif line.startswith('-') and current_theme:
                    subtopic = line.lstrip('-').strip()
                    if subtopic:
                        themes_dict[current_theme].append(subtopic)

        # Drop duplicate keywords while keeping first-seen order, so the
        # result is deterministic between runs.
        return {
            theme: list(dict.fromkeys(keywords))
            for theme, keywords in themes_dict.items()
        }

class AINewsAnalyzer:
    """Collect AI-related news articles and tally them against themes.

    Articles are discovered with newspaper3k, kept only when they mention an
    AI term *and* at least one theme keyword, and then summarized per theme
    into JSON/CSV reports.
    """

    def __init__(self, themes_csv_path, news_sources=None):
        """
        Initialize the AI News Analyzer with themes from CSV

        Args:
            themes_csv_path (str): Path to CSV file containing themes and keywords
            news_sources (list): Optional list of news sources to analyze
        """
        self.themes = ThemeProcessor.process_keywords_from_csv(themes_csv_path)
        # NOTE: some default sources overlap (same site listed twice);
        # analyze_sources() skips article URLs that were already collected.
        self.news_sources = news_sources or [
            'https://efe.com/en/',
            'http://www.wired.com',
            'http://www.bbc.com',
            'http://www.cnn.com',
            'http://www.reuters.com',
            'http://www.theguardian.com',
            'http://www.nytimes.com',
            'https://www.afp.com',
            'https://www.wired.com',
            'https://www.theguardian.com/technology',

        ]

        self.ai_related_terms = [
            'artificial intelligence', 'AI', 'machine learning', 'deep learning',
            'neural network', 'AI model', 'large language model', 'LLM',
            'ChatGPT', 'GPT', 'artificial neural', 'AI system'
        ]

        self.articles_data = []
        self.debug_stats = {
            'total_urls_found': 0,
            'download_failures': 0,
            'parsing_failures': 0,
            'ai_related_found': 0,
            'theme_matched': 0
        }
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging to a file under ./logs and to the console."""
        log_dir = Path('logs')
        log_dir.mkdir(exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_dir / 'ai_news_analyzer.log'),
                logging.StreamHandler()
            ]
        )

    def _mentions_ai(self, lowered_text):
        """Return True if ``lowered_text`` contains an AI term as a whole word.

        A plain substring test would match 'ai' inside 'said' or 'main' and
        'llm' inside 'hallmark', flagging nearly every article. A term only
        counts when it is not glued to a preceding letter/digit nor followed
        by a letter; an optional plural 's' is tolerated ('LLMs'), and so are
        trailing digits or punctuation ('GPT-4', 'GPT4').
        """
        escaped = [re.escape(term.lower()) for term in self.ai_related_terms if term]
        if not escaped:
            return False
        pattern = r'(?<![a-z0-9])(?:' + '|'.join(escaped) + r')s?(?![a-z])'
        return re.search(pattern, lowered_text) is not None

    def _match_themes(self, lowered_text):
        """Return ``{theme: [keywords found]}`` for keywords present in the text.

        Keywords are matched as case-insensitive substrings; themes without
        any hit are left out.
        """
        matched_themes = {}
        for theme, keywords in self.themes.items():
            hits = [keyword for keyword in keywords if keyword.lower() in lowered_text]
            if hits:
                matched_themes[theme] = hits
        return matched_themes

    def is_ai_related(self, text, title):
        """Check if article is AI-related and matches themes.

        Args:
            text (str): Article body; ``None`` is treated as empty.
            title (str): Article title; ``None`` is treated as empty.

        Returns:
            bool: True only when the text mentions an AI term (whole word)
            and contains at least one theme keyword.
        """
        combined_text = f"{text or ''} {title or ''}".lower()

        # First check if it's AI-related
        if not self._mentions_ai(combined_text):
            return False

        # Then check if it matches any of our theme keywords
        return any(
            keyword.lower() in combined_text
            for keywords in self.themes.values()
            for keyword in keywords
        )

    def download_and_parse_article(self, article_url):
        """Download, parse and classify a single article.

        Args:
            article_url (str): URL of the article to fetch.

        Returns:
            dict | None: A record with the article fields and its matched
            themes, or None when the download/parse fails, the text is
            shorter than 100 characters, or the article is not relevant.
        """
        try:
            print(f"\nAttempting to process: {article_url}")  # Debug print

            article = Article(article_url)
            try:
                article.download()
                time.sleep(1)  # Increased delay to be more polite
            except Exception as e:
                self.debug_stats['download_failures'] += 1
                print(f"Download failed: {str(e)}")
                return None

            try:
                article.parse()
                article.nlp()
            except Exception as e:
                self.debug_stats['parsing_failures'] += 1
                print(f"Parsing failed: {str(e)}")
                return None

            # Check if we got actual content
            if not article.text or len(article.text) < 100:
                print("Article too short or empty")
                return None

            if not self.is_ai_related(article.text, article.title):
                print("Not AI-related")
                return None

            self.debug_stats['ai_related_found'] += 1
            print("AI-related article found!")

            # Match themes and keywords
            text = f"{article.text or ''} {article.title or ''}".lower()
            matched_themes = self._match_themes(text)
            if not matched_themes:
                return None

            self.debug_stats['theme_matched'] += 1

            # Host name without scheme and leading 'www.'; fall back to the
            # full URL instead of failing on an unexpected URL shape.
            host = re.search(r'https?://(?:www\.)?([^/]+)', article_url)
            return {
                'url': article_url,
                'title': article.title,
                'text': article.text,
                'summary': article.summary,
                'keywords': article.keywords,
                'publish_date': article.publish_date.strftime('%Y-%m-%d') if article.publish_date else None,
                'authors': article.authors,
                'matched_themes': matched_themes,
                'source': host.group(1) if host else article_url
            }

        except Exception as e:
            logging.error(f"Error processing {article_url}: {str(e)}")
            return None

    def analyze_sources(self, max_articles_per_source=50, start_year=2014):
        """Scan every configured source and collect relevant articles.

        For each source, up to ``2 * max_articles_per_source`` candidate
        articles are downloaded and screened; the URLs that pass are then
        fully processed and appended to ``self.articles_data``.

        Args:
            max_articles_per_source (int): Cap on accepted articles per source.
            start_year (int): Earliest publication year to accept. Articles
                without a publication date are skipped.
        """
        current_year = datetime.now().year

        for source_url in tqdm(self.news_sources, desc="Processing news sources"):
            try:
                print(f"\nProcessing source: {source_url}")

                config = Config()
                config.request_timeout = 30  # Increased timeout
                config.memoize_articles = False
                config.fetch_images = False

                # Build source object
                source = newspaper.build(
                    source_url,
                    config=config,
                    language='en',
                    number_threads=1
                )

                print(f"Found {len(source.articles)} articles at {source_url}")
                self.debug_stats['total_urls_found'] += len(source.articles)

                # URLs already collected (from this or an overlapping source)
                # are skipped so no article is stored or counted twice.
                seen_urls = {item['url'] for item in self.articles_data}

                # Get article URLs
                ai_related_urls = []
                for article in tqdm(source.articles[:max_articles_per_source * 2],
                                  desc=f"Scanning articles from {source_url}"):
                    if not article.url or article.url in seen_urls:
                        continue

                    try:
                        article.download()
                        article.parse()

                        if not article.text or len(article.text) < 100:
                            continue

                        if article.publish_date:
                            article_year = article.publish_date.year
                            if start_year <= article_year <= current_year:
                                if self.is_ai_related(article.text, article.title):
                                    ai_related_urls.append(article.url)
                                    seen_urls.add(article.url)
                                    print(f"Found AI article: {article.url}")

                        if len(ai_related_urls) >= max_articles_per_source:
                            break

                    except Exception as e:
                        print(f"Error processing article: {str(e)}")
                        continue

                print(f"Found {len(ai_related_urls)} AI-related articles")

                # Process the found articles
                with ThreadPoolExecutor(max_workers=3) as executor:
                    results = list(executor.map(self.download_and_parse_article, ai_related_urls))

                valid_results = [r for r in results if r is not None]
                self.articles_data.extend(valid_results)

                print(f"Successfully processed {len(valid_results)} articles")

            except Exception as e:
                logging.error(f"Error processing source {source_url}: {str(e)}")
                continue

    def analyze_content(self):
        """Aggregate the collected articles per theme.

        Returns:
            tuple: ``(analysis_results, articles_by_theme)`` where
            ``analysis_results[theme]`` holds ``articles_count`` and
            ``keyword_occurrences`` (number of distinct keywords matched,
            summed over articles), and ``articles_by_theme[theme]`` lists
            the matching articles with the keywords found in each.
        """
        analysis_results = defaultdict(lambda: defaultdict(int))
        articles_by_theme = defaultdict(list)

        for article in tqdm(self.articles_data, desc="Analyzing articles"):
            for theme, keywords in article['matched_themes'].items():
                analysis_results[theme]['articles_count'] += 1
                analysis_results[theme]['keyword_occurrences'] += len(keywords)

                articles_by_theme[theme].append({
                    'url': article['url'],
                    'title': article['title'],
                    'publish_date': article['publish_date'],
                    'keywords_found': keywords
                })

        return analysis_results, articles_by_theme

    def save_results(self, analysis_results, articles_by_theme, output_prefix='ai_news_analysis'):
        """Write the JSON report and the two CSV summaries under ./results.

        Args:
            analysis_results: Per-theme counters as returned by analyze_content().
            articles_by_theme: Per-theme article lists as returned by
                analyze_content().
            output_prefix (str): File name prefix for the generated files.

        Returns:
            dict: The report that was written to the JSON file.
        """
        # Ensure output directory exists
        output_dir = Path('results')
        output_dir.mkdir(exist_ok=True)

        total_articles = len(self.articles_data)

        # Create complete report
        report = {
            'summary': {
                'total_articles': total_articles,
                'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'themes_analyzed': list(self.themes.keys())
            },
            'theme_analysis': {
                theme: {
                    'articles_count': data['articles_count'],
                    'keyword_occurrences': data['keyword_occurrences'],
                    'articles': articles_by_theme[theme]
                }
                for theme, data in analysis_results.items()
            }
        }

        # Save files
        with open(output_dir / f'{output_prefix}_report.json', 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)

        # Create theme analysis DataFrame
        theme_df = pd.DataFrame([
            {
                'theme': theme,
                'articles_count': data['articles_count'],
                'keyword_occurrences': data['keyword_occurrences'],
                # Guard against division by zero when nothing was collected
                'percentage_of_total': (data['articles_count'] / total_articles * 100) if total_articles else 0
            }
            for theme, data in analysis_results.items()
        ])
        theme_df.to_csv(output_dir / f'{output_prefix}_theme_analysis.csv', index=False)

        # Create articles DataFrame (one row per theme/article pair)
        articles_df = pd.DataFrame([
            {
                'theme': theme,
                'title': article['title'],
                'url': article['url'],
                'publish_date': article['publish_date'],
                'keywords': ', '.join(article['keywords_found'])
            }
            for theme, articles in articles_by_theme.items()
            for article in articles
        ])
        articles_df.to_csv(output_dir / f'{output_prefix}_articles.csv', index=False)

        logging.info(f"Results saved in 'results' directory with prefix '{output_prefix}'")
        return report

# Usage example
def main():
    """Run the news pipeline end to end and print a per-theme summary."""
    # Themes and keywords come from the CSV file
    analyzer = AINewsAnalyzer('Criterios.csv')

    # Crawl the sources, then aggregate what was collected
    analyzer.analyze_sources(max_articles_per_source=100, start_year=2014)
    results_by_theme, article_lists = analyzer.analyze_content()

    # Persist the report and keep it for the console summary
    report = analyzer.save_results(results_by_theme, article_lists)

    print("\nAnalysis Summary:")
    print(f"Total articles analyzed: {report['summary']['total_articles']}")
    print("\nResults by theme:")
    for theme_name, theme_stats in report['theme_analysis'].items():
        print(f"\n{theme_name}:")
        print(f"  Articles: {theme_stats['articles_count']}")
        print(f"  Keyword occurrences: {theme_stats['keyword_occurrences']}")


if __name__ == "__main__":
    main()

# Foros

In [None]:
class ThemeProcessor:
    """Parse theme/keyword definitions from a headerless CSV file."""

    @staticmethod
    def process_keywords_from_csv(csv_path):
        """Build a ``{theme: [keywords]}`` mapping from a CSV file.

        Every column is scanned top to bottom. A cell of the form
        ``=== Theme name ===`` opens a new theme; each following cell that
        starts with ``-`` is recorded as a keyword of that theme. Any other
        cell is ignored.

        Args:
            csv_path: Path to the CSV file; it is read without a header row.

        Returns:
            dict: Theme name -> list of unique keywords, in the order in
            which they first appear in the file.
        """
        df = pd.read_csv(csv_path, header=None)  # Assumes no header row; adjust if there is one
        themes_dict = defaultdict(list)

        # Each column is processed independently, so a theme never spills
        # over from one column into the next.
        for col in df.columns:
            current_theme = None
            for val in df[col].dropna():
                line = str(val).strip()

                # '=== name ===' marks the start of a new theme
                if line.startswith('===') and line.endswith('==='):
                    current_theme = line.strip('=').strip()
                    # Register the theme even if it ends up with no keywords
                    themes_dict.setdefault(current_theme, [])

                # '- keyword' belongs to the theme currently open
                elif line.startswith('-') and current_theme:
                    subtopic = line.lstrip('-').strip()
                    if subtopic:
                        themes_dict[current_theme].append(subtopic)

        # Drop duplicate keywords while keeping first-seen order, so the
        # result is deterministic between runs.
        return {
            theme: list(dict.fromkeys(keywords))
            for theme, keywords in themes_dict.items()
        }


In [None]:
!pip install praw

In [None]:
!pip install PyGithub

In [None]:
import praw
from github import Github
import requests
import logging
from datetime import datetime
from pathlib import Path
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import time
import re

class ForumAnalyzer:
    """Collect AI-related Reddit posts and GitHub repositories by theme.

    Content is kept only when it mentions an AI term *and* at least one
    theme keyword; accepted items are stored in ``self.articles_data`` and
    can be exported with save_results().
    """

    def __init__(self, themes_csv_path, reddit_credentials=None, github_token=None):
        """
        Initialize the Forum Analyzer

        Args:
            themes_csv_path (str): Path to CSV file containing themes and keywords
            reddit_credentials (dict): Dictionary with Reddit API credentials
                (keys: 'client_id', 'client_secret', 'user_agent')
            github_token (str): GitHub personal access token
        """
        self.themes = ThemeProcessor.process_keywords_from_csv(themes_csv_path)
        self.articles_data = []

        # Initialize Reddit client if credentials provided
        self.reddit = None
        if reddit_credentials:
            self.reddit = praw.Reddit(
                client_id=reddit_credentials['client_id'],
                client_secret=reddit_credentials['client_secret'],
                user_agent=reddit_credentials['user_agent']
            )

        # Initialize GitHub client if token provided
        self.github = None
        if github_token:
            self.github = Github(github_token)

        # Initialize logging
        self.setup_logging()

        # AI-related terms (same list as AINewsAnalyzer)
        self.ai_related_terms = [
            'artificial intelligence', 'AI', 'machine learning', 'deep learning',
            'neural network', 'AI model', 'large language model', 'LLM',
            'ChatGPT', 'GPT', 'artificial neural', 'AI system'
        ]

        # Debug stats
        self.debug_stats = defaultdict(int)

    def setup_logging(self):
        """Configure root logging to a file under ./logs and to the console."""
        log_dir = Path('logs')
        log_dir.mkdir(exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_dir / 'forum_analyzer.log'),
                logging.StreamHandler()
            ]
        )

    def _mentions_ai(self, lowered_text):
        """Return True if ``lowered_text`` contains an AI term as a whole word.

        A plain substring test would match 'ai' inside 'said' or 'main' and
        'llm' inside 'hallmark', flagging nearly every post. A term only
        counts when it is not glued to a preceding letter/digit nor followed
        by a letter; an optional plural 's' is tolerated ('LLMs'), and so are
        trailing digits or punctuation ('GPT-4', 'GPT4').
        """
        escaped = [re.escape(term.lower()) for term in self.ai_related_terms if term]
        if not escaped:
            return False
        pattern = r'(?<![a-z0-9])(?:' + '|'.join(escaped) + r')s?(?![a-z])'
        return re.search(pattern, lowered_text) is not None

    def _match_themes(self, lowered_text):
        """Return ``{theme: [keywords found]}`` for keywords present in the text.

        Keywords are matched as case-insensitive substrings; themes without
        any hit are left out.
        """
        matched_themes = {}
        for theme, keywords in self.themes.items():
            hits = [k for k in keywords if k.lower() in lowered_text]
            if hits:
                matched_themes[theme] = hits
        return matched_themes

    @staticmethod
    def _utc_date(timestamp):
        """Format a UTC epoch timestamp as 'YYYY-MM-DD' in UTC.

        Converting without an explicit timezone would use the machine's
        local time and could shift the date by a day.
        """
        from datetime import timezone
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d')

    def is_ai_related(self, text, title=""):
        """Check if content is AI-related and matches themes.

        Args:
            text (str): Main content; ``None`` is treated as empty.
            title (str): Optional title; ``None`` is treated as empty.

        Returns:
            bool: True only when the text mentions an AI term (whole word)
            and contains at least one theme keyword.
        """
        combined_text = f"{text or ''} {title or ''}".lower()

        # First check if it's AI-related
        if not self._mentions_ai(combined_text):
            return False

        # Then check if it matches any of our theme keywords
        return any(
            keyword.lower() in combined_text
            for keywords in self.themes.values()
            for keyword in keywords
        )

    def analyze_reddit(self, subreddits=('artificial', 'MachineLearning'),
                      time_filter='year', limit=1000):
        """
        Analyze Reddit posts from specified subreddits

        Args:
            subreddits (list): List of subreddit names to analyze
            time_filter (str): One of 'day', 'week', 'month', 'year', 'all'
            limit (int): Maximum number of posts to analyze per subreddit
        """
        if not self.reddit:
            logging.error("Reddit client not initialized. Please provide credentials.")
            return

        for subreddit_name in tqdm(subreddits, desc="Processing subreddits"):
            try:
                subreddit = self.reddit.subreddit(subreddit_name)

                # Get top posts
                for post in tqdm(subreddit.top(time_filter=time_filter, limit=limit),
                               desc=f"Analyzing posts from r/{subreddit_name}"):

                    # Combine post title, content and top comments
                    post_text = f"{post.title} {post.selftext}"

                    # Drop the "load more comments" stubs, then take the
                    # first 10 comments of the flattened tree.
                    post.comments.replace_more(limit=0)
                    comments_text = " ".join([comment.body for comment in post.comments.list()[:10]])

                    combined_text = post_text + " " + comments_text

                    if not self.is_ai_related(combined_text, post.title):
                        continue

                    self.debug_stats['reddit_ai_related'] += 1

                    # Match themes and keywords
                    matched_themes = self._match_themes(combined_text.lower())
                    if not matched_themes:
                        continue

                    self.articles_data.append({
                        'url': f"https://reddit.com{post.permalink}",
                        'title': post.title,
                        'text': combined_text,
                        'summary': post.selftext[:500] if post.selftext else "",
                        # created_utc is a UTC epoch, so format it in UTC
                        'publish_date': self._utc_date(post.created_utc),
                        'author': str(post.author),
                        'matched_themes': matched_themes,
                        'source': f"reddit/r/{subreddit_name}",
                        'score': post.score,
                        'num_comments': post.num_comments
                    })

            except Exception as e:
                logging.error(f"Error processing subreddit {subreddit_name}: {str(e)}")
                continue

    def analyze_github(self, query='artificial intelligence', sort='stars',
                      max_repos=100, min_stars=100):
        """
        Analyze GitHub repositories and their discussions

        Args:
            query (str): Search query for repositories
            sort (str): How to sort results ('stars', 'forks', 'updated')
            max_repos (int): Maximum number of repositories to analyze
            min_stars (int): Minimum number of stars for a repository
        """
        if not self.github:
            logging.error("GitHub client not initialized. Please provide token.")
            return

        try:
            # Search repositories
            repositories = self.github.search_repositories(
                query=f"{query} stars:>={min_stars}",
                sort=sort,
                order='desc'
            )

            for repo in tqdm(repositories[:max_repos], desc="Analyzing GitHub repositories"):
                try:
                    # Combine repository description, readme, and discussions
                    repo_text = f"{repo.description or ''}"

                    # The README is optional: a missing or undecodable one
                    # must not discard the repository.
                    try:
                        readme = repo.get_readme().decoded_content.decode()
                        repo_text += " " + readme
                    except Exception as e:
                        logging.debug(f"No usable README for {repo.full_name}: {str(e)}")

                    # Discussions are best-effort too.
                    # NOTE(review): some PyGithub versions appear to require
                    # arguments for get_discussions() - confirm against the
                    # installed version. A failure here is logged and the
                    # repository is still analyzed.
                    try:
                        if repo.has_discussions:
                            for discussion in repo.get_discussions()[:10]:  # Get first 10 discussions
                                repo_text += f" {discussion.title or ''} {discussion.body or ''}"
                    except Exception as e:
                        logging.warning(f"Could not fetch discussions for {repo.full_name}: {str(e)}")

                    if not self.is_ai_related(repo_text, repo.name):
                        continue

                    self.debug_stats['github_ai_related'] += 1

                    # Match themes and keywords
                    matched_themes = self._match_themes(repo_text.lower())
                    if not matched_themes:
                        continue

                    self.articles_data.append({
                        'url': repo.html_url,
                        'title': repo.name,
                        'text': repo_text[:5000],  # Limit text length
                        'summary': repo.description or "",
                        'publish_date': repo.created_at.strftime('%Y-%m-%d'),
                        'author': repo.owner.login,
                        'matched_themes': matched_themes,
                        'source': 'github',
                        'stars': repo.stargazers_count,
                        'forks': repo.forks_count
                    })

                except Exception as e:
                    logging.error(f"Error processing repository {repo.full_name}: {str(e)}")
                    continue

        except Exception as e:
            logging.error(f"Error searching GitHub repositories: {str(e)}")

    def save_results(self, output_prefix='forum_analysis'):
        """Write the JSON report and the posts CSV under ./results.

        Args:
            output_prefix (str): File name prefix for the generated files.

        Returns:
            dict: The report that was written to the JSON file.
        """
        output_dir = Path('results')
        output_dir.mkdir(exist_ok=True)

        # Organize posts by theme
        theme_analysis = defaultdict(lambda: {'posts': [], 'count': 0})
        for post in self.articles_data:
            for theme in post['matched_themes']:
                theme_analysis[theme]['posts'].append({
                    'url': post['url'],
                    'title': post['title'],
                    'source': post['source'],
                    'publish_date': post['publish_date']
                })
                theme_analysis[theme]['count'] += 1

        # Create complete report (plain dicts so it serializes to JSON)
        report = {
            'summary': {
                'total_posts': len(self.articles_data),
                'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'themes_analyzed': list(self.themes.keys()),
                'debug_stats': dict(self.debug_stats)
            },
            'theme_analysis': dict(theme_analysis)
        }

        # Save files
        with open(output_dir / f'{output_prefix}_report.json', 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)

        # Create DataFrame for analysis
        posts_df = pd.DataFrame(self.articles_data)
        posts_df.to_csv(output_dir / f'{output_prefix}_posts.csv', index=False)

        logging.info(f"Results saved in 'results' directory with prefix '{output_prefix}'")
        return report

# Usage example
def main():
    """Run the forum pipeline end to end and print a per-theme summary.

    Credentials are read from the environment so that no secret lives in
    the source:

    * ``REDDIT_CLIENT_ID`` / ``REDDIT_CLIENT_SECRET`` - Reddit API app
      credentials; Reddit analysis is skipped (with a logged error) when
      either is missing.
    * ``REDDIT_USER_AGENT`` - optional, defaults to the project user agent.
    * ``GITHUB_TOKEN`` - GitHub personal access token; GitHub analysis is
      skipped (with a logged error) when it is missing.
    """
    import os

    # Reddit API credentials (create an app on Reddit to obtain them)
    client_id = os.environ.get('REDDIT_CLIENT_ID')
    client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
    reddit_credentials = None
    if client_id and client_secret:
        reddit_credentials = {
            'client_id': client_id,
            'client_secret': client_secret,
            'user_agent': os.environ.get(
                'REDDIT_USER_AGENT',
                'python:ai_forum_analyzer:v1.0:Investigacion'
            )
        }

    # GitHub personal access token
    github_token = os.environ.get('GITHUB_TOKEN', '')

    # Initialize analyzer
    analyzer = ForumAnalyzer(
        'Criterios.csv',
        reddit_credentials=reddit_credentials,
        github_token=github_token
    )

    # Analyze different platforms
    analyzer.analyze_reddit()
    analyzer.analyze_github()

    # Save results
    report = analyzer.save_results()

    # Print summary
    print("\nAnalysis Summary:")
    print(f"Total posts analyzed: {report['summary']['total_posts']}")
    print("\nResults by theme:")
    for theme, data in report['theme_analysis'].items():
        print(f"\n{theme}:")
        print(f"  Posts: {data['count']}")


if __name__ == "__main__":
    main()