# In [3]:
import pandas as pd
from nytimes_scraper.nyt_api import NytApi
from nytimes_scraper.comments import fetch_comments_by_article, comments_to_df

def read_urls_from_csv(file_path):
    """
    Load the list of articles from a CSV file.

    Header names are stripped of surrounding whitespace before the
    'url' and 'title' columns are selected, so a header such as " url "
    is still matched. The (stripped) column index is printed as a
    quick diagnostic.

    Parameters:
    file_path (str): Path to the CSV file containing article URLs.

    Returns:
    pd.DataFrame: An independent copy holding only the 'url' and
    'title' columns.
    """
    wanted = ['url', 'title']

    frame = pd.read_csv(file_path)
    # Normalise the headers first; the selection below needs exact names.
    frame.columns = frame.columns.str.strip()
    print("Columns in the CSV file:", frame.columns)

    return frame[wanted].copy()

def process_comment_thread(comment, article_url, article_title, all_comments):
    """
    Tag a comment and all of its nested replies with article metadata.

    Every visited comment dict is modified in place (it gains the
    'article_url' and 'article_title' keys) and appended to
    all_comments. A parent is always appended before its replies, and
    replies are visited in their listed order. The 'replies' entry of
    each comment is left untouched.

    Parameters:
    comment (dict): The comment to process
    article_url (str): The URL of the article
    article_title (str): The title of the article
    all_comments (list): List to store all processed comments
    """
    # An explicit stack stands in for recursion; pushing replies in
    # reverse keeps the parent-first, first-reply-first ordering.
    pending = [comment]
    while pending:
        node = pending.pop()

        node['article_url'] = article_url
        node['article_title'] = article_title
        all_comments.append(node)

        children = node.get('replies')
        if children:
            pending.extend(reversed(list(children)))

def fetch_comments_for_urls(api, articles_df):
    """
    Collect the comments of every listed article into one flat list.

    Each row of articles_df supplies a 'url' and a 'title'. The comments
    returned by fetch_comments_by_article for that URL are passed, one
    top-level comment at a time, to process_comment_thread, which tags
    them (and their replies) with the article's URL and title. If
    anything raises while fetching or processing an article, the error
    is printed and the loop moves on to the next article.

    Parameters:
    api (NytApi): An instance of the NytApi class.
    articles_df (pd.DataFrame): DataFrame containing article URLs and titles.

    Returns:
    List[dict]: A list of comment dictionaries with article metadata.
    """
    collected = []

    for _, row in articles_df.iterrows():
        url, title = row['url'], row['title']

        try:
            print(f"Fetching comments for: {title}")
            # Walk every top-level comment together with its replies.
            for top_level in fetch_comments_by_article(api, url):
                process_comment_thread(top_level, url, title, collected)
        except Exception as err:
            # Best-effort: one failing article must not abort the run.
            print(f"Error fetching comments for {url}: {err}")

    return collected

def save_comments_to_csv(comments, output_file):
    """
    Write the collected comments to a CSV file and print a short summary.

    When comments is empty, a notice is printed and nothing is written.
    Otherwise the comments are converted with comments_to_df, the
    'article_url' and 'article_title' columns are moved to the front,
    and the frame is saved without its index. The summary then reports
    the total row count, the number of distinct article URLs, and the
    rows with and without a 'parentID' value, followed by a warning if
    any row lacks article metadata.

    Parameters:
    comments (List[dict]): A list of comment dictionaries with article metadata.
    output_file (str): Path to the output CSV file.
    """
    if not comments:
        print("No comments found!")
        return

    frame = comments_to_df(comments)

    # Put the article metadata columns ahead of everything else.
    leading = ['article_url', 'article_title']
    trailing = [name for name in frame.columns if name not in leading]
    frame = frame[leading + trailing]

    frame.to_csv(output_file, index=False)
    print(f"Comments saved to {output_file}")

    # Summary statistics
    print("\nSummary:")
    print(f"Total number of comments: {len(frame)}")
    print(f"Number of articles with comments: {frame['article_url'].nunique()}")
    # A row without a parentID is counted as top-level, any other as a reply.
    print(f"Number of top-level comments: {frame['parentID'].isna().sum()}")
    print(f"Number of replies: {frame['parentID'].notna().sum()}")

    # Sanity check: every row should carry both metadata fields.
    lacks_metadata = frame['article_url'].isna() | frame['article_title'].isna()
    missing_count = int(lacks_metadata.sum())
    if missing_count > 0:
        print(f"\nWarning: {missing_count} comments are missing article metadata")

def main(input_csv, output_csv, api_key):
    """
    Run the full pipeline: read articles, fetch comments, save them.

    An NytApi client is created from api_key, the article list is read
    from input_csv, comments are fetched for each article, and the
    result is handed to save_comments_to_csv for writing to output_csv.

    Parameters:
    input_csv (str): Path to the input CSV file containing article URLs.
    output_csv (str): Path to the output CSV file to save comments.
    api_key (str): NYTimes API key.
    """
    client = NytApi(api_key)
    articles = read_urls_from_csv(input_csv)
    gathered = fetch_comments_for_urls(client, articles)
    save_comments_to_csv(gathered, output_csv)

# Example usage
if __name__ == "__main__":
    import os

    input_csv = "currentarticles.csv"
    output_csv = "nytimes_comments_with_metadata.csv"

    # The API key is read from the environment so the credential never
    # lives in source control or in shared notebook exports.
    # NOTE: a key that was previously hard-coded here should be treated
    # as compromised and rotated.
    api_key = os.environ.get("NYT_API_KEY")
    if not api_key:
        raise SystemExit(
            "Set the NYT_API_KEY environment variable to your NYTimes API key."
        )

    main(input_csv, output_csv, api_key)

# SyntaxError: invalid syntax (2908207271.py, line 5)