# YouTube Comments Analysis Notebook

This notebook provides an **experimental pipeline** for extracting, translating, analyzing sentiment, and summarizing YouTube video comments using the YouTube Data API v3.

### Features
- Fetch top-level comments and replies from a YouTube video.
- Translate non-English comments to English using Google Translate.
- Perform sentiment analysis using TextBlob.
- Summarize viewer opinions and extract common topics/suggestions.
- Export results to a CSV file and generate a summary report.

### Setup Instructions (Recommended for Google Colab)

1. **Use Google Colab**:
   - Click the "Open in Colab" button (or upload this notebook to [https://colab.research.google.com](https://colab.research.google.com)).

2. **Add Your YouTube API Key**:
   - In the `main()` function, replace the `api_key` value with your **YouTube Data API v3 key**:
     ```python
     api_key = "YOUR_YOUTUBE_API_KEY"
     ```

3. **Set the Target YouTube Video**:
   - Replace `video_id` with the ID of the video you want to analyze.
   - The ID is the value of the `v=` parameter in the video's URL:
     ```
     Example:
     URL: https://www.youtube.com/watch?v=2MWQZ4CVbks
     → video_id = "2MWQZ4CVbks"
     ```

---

> Note: This notebook accesses online data from YouTube. It may take some time depending on the number of comments and replies.


### Install Required Libraries

In [None]:
!pip install google-api-python-client pandas textblob googletrans==4.0.0-rc1 nltk gensim sumy

### Import Libraries

In [None]:
from googleapiclient.discovery import build
import pandas as pd
from time import sleep
import traceback
import os

### Function to Fetch Comments from YouTube

In [None]:
def get_comments(api_key, video_id):
    """Download the comment threads of a YouTube video into a DataFrame and CSV.

    Pages through ``commentThreads.list`` for ``video_id`` and records, for
    every top-level comment, its text, the texts of the replies embedded in
    the thread resource, its publication date and its author's display name.
    The accumulated result is written to ``"<video_id>_user_comments.csv"``
    after every page, so a partial file survives an interrupted run.

    Args:
        api_key: YouTube Data API v3 developer key.
        video_id: ID of the video whose comments are fetched.

    Returns:
        A ``pandas.DataFrame`` with the columns ``comment``, ``replies``,
        ``date`` and ``user_name``. If a request or page fails, the error is
        printed and the comments collected up to that point are returned.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    columns = ['comment', 'replies', 'date', 'user_name']
    output_path = f"{video_id}_user_comments.csv"

    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        textFormat="plainText",
        # Largest page size the endpoint accepts; fewer requests than the
        # default page size means less quota use and less waiting.
        maxResults=100,
    )

    rows = []
    df = pd.DataFrame(columns=columns)

    while request:
        try:
            response = request.execute()

            # Build the page separately so a malformed item cannot leave a
            # half-processed page in the results.
            page_rows = []
            for item in response.get('items', []):
                comment_data = item['snippet']['topLevelComment']['snippet']
                # NOTE(review): the thread resource may embed only a subset
                # of a comment's replies - confirm against the API docs if
                # complete reply lists are required.
                reply_list = [
                    reply['snippet']['textDisplay']
                    for reply in item.get('replies', {}).get('comments', [])
                ]
                page_rows.append({
                    "comment": comment_data['textDisplay'],
                    "replies": reply_list,
                    "date": comment_data['publishedAt'],
                    "user_name": comment_data['authorDisplayName'],
                })

            rows.extend(page_rows)
            # list_next() returns None once there are no more pages.
            request = youtube.commentThreads().list_next(request, response)
            print("Fetched next page of comments...")

        except Exception as e:
            # Top-level boundary: report the failure and keep what we have.
            print(f"Error: {e}\n{traceback.format_exc()}")
            print("Stopping early; comments fetched so far are kept.")
            request = None

        # Checkpoint after every page (and after a failure).
        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(output_path, index=False, encoding='utf-8')

        if request:
            # Be gentle with the API between pages; no wait after the last.
            sleep(2)

    return df


### Main Function

In [None]:
def main():
    """Fetch the comments of the configured video and save them to CSV.

    Replace the two placeholder strings below with your YouTube Data API v3
    key and the target video ID, or provide them through the
    ``YOUTUBE_API_KEY`` / ``YOUTUBE_VIDEO_ID`` environment variables.

    Raises:
        ValueError: If either value is still the placeholder or empty.
    """
    api_key_placeholder = "YOUR_YOUTUBE_API_KEY"
    video_id_placeholder = "YOUR_VIDEO_ID"

    api_key = os.environ.get("YOUTUBE_API_KEY", api_key_placeholder)
    video_id = os.environ.get("YOUTUBE_VIDEO_ID", video_id_placeholder)

    # Fail with a clear message instead of sending a placeholder to the API.
    if api_key in ("", api_key_placeholder) or video_id in ("", video_id_placeholder):
        raise ValueError(
            "Set api_key and video_id in main() (or the YOUTUBE_API_KEY / "
            "YOUTUBE_VIDEO_ID environment variables) before running."
        )

    get_comments(api_key, video_id)


if __name__ == "__main__":
    main()


### Import NLP Libraries and Download NLTK Data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from googletrans import Translator
import re
from collections import Counter
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# sentence tokenizer models. Recent NLTK releases look the tokenizer up as
# 'punkt_tab' rather than 'punkt', so request both names.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


### Text Preprocessing Utilities

In [None]:
def load_data(csv_file):
    """Read a comments CSV into a DataFrame.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        The loaded DataFrame, or ``None`` if the file could not be read or
        has no ``comment`` column; the reason is printed in that case.
    """
    try:
        frame = pd.read_csv(csv_file)
        if 'comment' in frame.columns:
            return frame
        raise ValueError("CSV must contain a 'comment' column.")
    except Exception as err:
        print(f"Failed to load CSV: {err}")
    return None

def clean_text(text):
    """Normalize a comment to lowercase ASCII letters, digits and whitespace.

    Strips, in order: URLs (``http...``/``www...``), ``@mentions`` and ``#``
    signs, then every character that is not an ASCII letter, digit or
    whitespace. The result is lowercased and trimmed at both ends; interior
    whitespace is left as-is.
    """
    removal_patterns = (
        r"http\S+|www\S+|https\S+",  # links
        r'@\w+|#',                   # mentions and hash signs
        r'[^A-Za-z0-9\s]',           # anything but ASCII alphanumerics/space
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower().strip()

def translate_text(text, target_lang="en", translator=None):
    """Translate ``text`` to ``target_lang`` unless it is already in it.

    Args:
        text: The text to translate.
        target_lang: Language code to translate into (default ``"en"``).
        translator: Optional ``googletrans.Translator`` to reuse across
            calls; a new one is created when omitted.

    Returns:
        The translated text, or ``text`` unchanged when it is blank or not a
        string, is detected as already being in ``target_lang``, or when
        detection/translation fails (the error is printed).
    """
    # Nothing to detect or translate: skip the network round-trips.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        if translator is None:
            translator = Translator()
        detected = translator.detect(text).lang
        if detected != target_lang:
            return translator.translate(text, dest=target_lang).text
        return text
    except Exception as e:
        # Best effort: keep the original text rather than lose the comment.
        print(f"Translation error: {e}")
        return text


### Sentiment Analysis and Suggestion Extraction

In [None]:
def analyze_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns ``"Positive"`` for a score above zero, ``"Negative"`` for a score
    below zero and ``"Neutral"`` otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

def extract_suggestions(df):
    """Return the comments that look like requests or suggestions.

    A comment qualifies when it contains any of the suggestion phrases below,
    compared case-insensitively as plain substrings. Non-string cells (for
    example missing values) are skipped.

    Args:
        df: Object whose ``'translated_comment'`` entry yields the comments.

    Returns:
        A list of the matching comments, unmodified and in original order.
    """
    # "do a video on" is not listed separately: "video on" already matches it.
    keywords = ("should", "make", "video on", "talk about", "cover")

    suggestions = []
    for comment in df['translated_comment']:
        if not isinstance(comment, str):
            continue
        lowered = comment.lower()
        if any(keyword in lowered for keyword in keywords):
            suggestions.append(comment)
    return suggestions


### Text Summarization

In [None]:
def summarize_text(text, num_sentences=5):
    """Summarize ``text`` with sumy's ``TextRankSummarizer``.

    Args:
        text: Plain text to summarize, tokenized with the "english" tokenizer.
        num_sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    text_rank = TextRankSummarizer()
    selected = text_rank(parsed.document, num_sentences)
    return " ".join(map(str, selected))


### Generate Full Comment Summary Report

In [None]:
def generate_summary(df):
    """Build a text report about the comments in ``df``.

    Adds three columns to ``df`` in place - ``cleaned_comment``,
    ``translated_comment`` and ``sentiment`` - and then reports the sentiment
    counts, an extractive summary, up to five suggestion-like comments and
    the ten most common non-stop-words.

    Args:
        df: DataFrame with a ``comment`` column, or ``None``.

    Returns:
        The report as a string, or ``"No data to summarize."`` when ``df``
        is ``None``.
    """
    if df is None:
        return "No data to summarize."

    # Missing comments become empty strings rather than the literal "nan".
    raw_comments = df['comment'].fillna("").astype(str)

    df['cleaned_comment'] = raw_comments.apply(clean_text)
    # Translate the raw text BEFORE cleaning: clean_text() drops every
    # non-ASCII character, so cleaning first would erase non-English comments
    # before the translator ever sees them.
    df['translated_comment'] = raw_comments.apply(translate_text).apply(clean_text)
    df['sentiment'] = df['translated_comment'].apply(analyze_sentiment)

    sentiment_counts = df['sentiment'].value_counts()
    suggestions = extract_suggestions(df)

    stop_words = set(stopwords.words('english'))
    words = " ".join(df['translated_comment']).split()
    common_words = Counter(w for w in words if w not in stop_words).most_common(10)

    text = ". ".join(df['translated_comment'])
    summary_text = summarize_text(text) if len(text) > 200 else "Not enough text to summarize."

    return f"""
    **YouTube Comments Summary**

    - **Total Comments:** {len(df)}
    - **Positive:** {sentiment_counts.get("Positive", 0)}
    - **Negative:** {sentiment_counts.get("Negative", 0)}
    - **Neutral:** {sentiment_counts.get("Neutral", 0)}

    **Audience Summary:**
    {summary_text}

    **Viewer Suggestions:**
    {suggestions[:5] if suggestions else "No major suggestions found."}

    **Common Words:**
    {', '.join(word for word, _ in common_words)}
    """


### Run Summary Analysis

In [None]:
# Path of the comments CSV to analyze. Note that get_comments() writes its
# output to "<video_id>_user_comments.csv" (a relative path), so point this at
# that file to analyze the comments fetched above.
csv_file = "/content/comments.csv"  # Adjust path as needed
# load_data() returns None when the file cannot be read or lacks a 'comment'
# column; generate_summary() then returns a "No data" message instead of failing.
df = load_data(csv_file)
summary_result = generate_summary(df)
print(summary_result)
