# Import Libraries and Load Data
Import necessary libraries such as pandas, matplotlib, seaborn, and the Gemini API client. Load the processed_comments DataFrame from a CSV file.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google import genai
from google.genai import types
import os

# Read the pre-processed comments from disk into a DataFrame
COMMENTS_CSV_PATH = 'processed_comments.csv'
processed_comments = pd.read_csv(COMMENTS_CSV_PATH)

# Preview the first few rows as a quick check that the load worked
processed_comments.head()

FileNotFoundError: [Errno 2] No such file or directory: 'processed_comments.csv'

# Stratified Sampling of Comments
Draw a stratified sample of comments, treating each of the five subreddits as its own stratum: up to 200 comments are sampled from each subreddit (all of them if it has fewer), using a fixed random seed so the sample is reproducible.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Subreddits to compare; each one is treated as its own stratum
subreddits_of_interest = ['worldnews', 'news', 'politics', 'science', 'technology']

# Upper bound on the number of comments drawn from each subreddit
MAX_COMMENTS_PER_SUBREDDIT = 200

# Keep only the comments that belong to one of the selected subreddits
filtered_df = processed_comments[processed_comments['subreddit'].isin(subreddits_of_interest)]

# Sample every stratum separately and concatenate the pieces. Iterating over
# the groups (instead of groupby().apply()) keeps the 'subreddit' column in the
# result on every pandas version: newer releases deprecate, and then stop,
# passing the grouping column to the function given to apply(), which would
# silently drop the column that later cells filter on.
per_subreddit_samples = [
    group.sample(min(len(group), MAX_COMMENTS_PER_SUBREDDIT), random_state=42)
    for _, group in filtered_df.groupby('subreddit')
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when none of the selected subreddits has any comments
stratified_sample = (
    pd.concat(per_subreddit_samples)
    if per_subreddit_samples
    else filtered_df.iloc[0:0].copy()
)

# Display the stratified sample
stratified_sample.head()

# Function to generate keywords using Gemini API
def generate_keywords(text):
    """Ask the Gemini API for the salient keywords in ``text``.

    Args:
        text (str): The text to extract keywords from, e.g. several comments
            joined into one string.

    Returns:
        str: The model's reply with surrounding whitespace stripped. The
            prompt asks for single-word keywords separated by spaces, so the
            reply can be split on whitespace. An empty string is returned,
            without calling the API, when ``text`` is not a non-blank string.
    """
    # Nothing to analyse: blank text, or a None/NaN standing in for missing data
    if not isinstance(text, str) or not text.strip():
        return ""

    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    model = "gemini-2.0-flash"
    contents = [types.Content(role="user", parts=[types.Part.from_text(text=text)])]
    generate_content_config = types.GenerateContentConfig(
        # Without an instruction the model merely replies to the comments
        # instead of extracting keywords from them.
        system_instruction=(
            "You extract keywords from Reddit comments. Identify the most "
            "salient keywords that differentiate the given comments from a "
            "general corpus. Reply with only the keywords, as single words "
            "separated by spaces, with no punctuation, numbering, or "
            "explanation."
        ),
        # Greedy decoding so repeated runs give the same keywords
        temperature=0,
        top_p=0,
        top_k=1,
        max_output_tokens=100,
        response_mime_type="text/plain",
    )
    stream = client.models.generate_content_stream(
        model=model, contents=contents, config=generate_content_config
    )
    # chunk.text is None for chunks that carry no text part; treat those as
    # empty rather than failing on str + None
    return "".join(chunk.text or "" for chunk in stream).strip()

# Build a mapping of subreddit name -> list of keywords
keywords_by_subreddit = {}
for name in subreddits_of_interest:
    # Select this subreddit's comment bodies from the sample
    in_subreddit = stratified_sample['subreddit'] == name
    bodies = stratified_sample.loc[in_subreddit, 'comment_body'].tolist()
    # Send all comments as one text and split the reply on whitespace
    keywords_by_subreddit[name] = generate_keywords(" ".join(bodies)).split()

# Show the resulting mapping
keywords_by_subreddit

# Define a Function to Extract Keywords Using the Gemini API
Define a function that takes a list of comments as input and uses the Gemini API to extract keywords unique to that set of comments. The prompt should instruct the model to identify the most salient keywords that differentiate the comments from a general corpus.

In [None]:
# Define a function to extract keywords using the Gemini API
def extract_keywords(comments):
    """
    Extracts keywords for a set of comments using the Gemini API.

    The usable comments are joined into one text, passed to
    ``generate_keywords``, and the reply is split on whitespace.

    Args:
        comments (list): A list of comment strings. Entries that are not
            strings (e.g. None or NaN for a missing comment) or that are
            blank are ignored.

    Returns:
        list: The whitespace-separated tokens of the model's reply. An empty
            list is returned, without calling the API, when no usable
            comment is left.
    """
    # Drop missing/blank entries: " ".join would raise on a non-string item
    usable_comments = [
        comment for comment in comments
        if isinstance(comment, str) and comment.strip()
    ]

    # No text to analyse, so skip the API round trip entirely
    if not usable_comments:
        return []

    # Combine all comments into a single text
    combined_text = " ".join(usable_comments)

    # Generate keywords using the Gemini API and split the reply into a list
    return generate_keywords(combined_text).split()

# Demonstration: extract keywords from the sampled 'worldnews' comments
worldnews_rows = stratified_sample['subreddit'] == 'worldnews'
example_comments = stratified_sample.loc[worldnews_rows, 'comment_body'].tolist()
example_keywords = extract_keywords(example_comments)
example_keywords

# Apply Keyword Extraction to Each Subreddit
Apply the keyword extraction function to the comments from each subreddit. Store the keywords for each subreddit.

In [None]:
# Build the subreddit -> keyword-list mapping by running extract_keywords on
# the sampled comment bodies of each subreddit
keywords_by_subreddit = {
    name: extract_keywords(
        stratified_sample.loc[
            stratified_sample['subreddit'] == name, 'comment_body'
        ].tolist()
    )
    for name in subreddits_of_interest
}

# Show the resulting mapping
keywords_by_subreddit

# Sentiment Analysis using Gemini API
Define a function to perform sentiment analysis on the comments using the Gemini API. Apply this function to each comment in the stratified sample and store the sentiment scores.

In [None]:
# Define a function to perform sentiment analysis using the Gemini API
def perform_sentiment_analysis(text):
    """Ask the Gemini API to score the sentiment of a single comment.

    Args:
        text (str): The comment text to score.

    Returns:
        str: The model's reply with surrounding whitespace stripped. The
            prompt asks for a single number between -1.0 (very negative) and
            1.0 (very positive), so the reply is expected to be convertible
            to a float. An empty string is returned, without calling the
            API, when ``text`` is not a non-blank string.
    """
    # Missing comments show up as None/NaN in a DataFrame column; skip them
    # and blank text instead of sending them to the API
    if not isinstance(text, str) or not text.strip():
        return ""

    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    model = "gemini-2.0-flash"
    contents = [types.Content(role="user", parts=[types.Part.from_text(text=text)])]
    generate_content_config = types.GenerateContentConfig(
        # Without an instruction the model replies to the comment in free
        # text, which cannot be used as a sentiment score.
        system_instruction=(
            "You are a sentiment analysis tool. Rate the sentiment of the "
            "given Reddit comment on a scale from -1.0 (very negative) to "
            "1.0 (very positive). Reply with only the number and nothing "
            "else."
        ),
        # Greedy decoding so the same comment always gets the same score
        temperature=0,
        top_p=0,
        top_k=1,
        max_output_tokens=100,
        response_mime_type="text/plain",
    )
    stream = client.models.generate_content_stream(
        model=model, contents=contents, config=generate_content_config
    )
    # chunk.text is None for chunks that carry no text part; treat those as
    # empty rather than failing on str + None
    return "".join(chunk.text or "" for chunk in stream).strip()

# Score every sampled comment and store the replies in a 'sentiment' column
comment_bodies = stratified_sample['comment_body']
stratified_sample['sentiment'] = comment_bodies.map(perform_sentiment_analysis)

# Preview the sample, now including the sentiment column
stratified_sample.head()

# Create Visualizations
Create visualizations to compare the sentiment across different subreddits. Use bar plots, box plots, or other appropriate charts to display the distribution of sentiment scores for each subreddit. Create word clouds for each subreddit using the extracted keywords to visualize the most prominent topics in each community.

In [None]:
import wordcloud

# Create visualizations to compare the sentiment across different subreddits

# The 'sentiment' column holds the model's text replies, so convert it to
# numbers before plotting. Replies that are not a plain number become NaN and
# are left out of the plots; the original DataFrame is not modified.
sentiment_scores = pd.to_numeric(
    stratified_sample['sentiment'].astype(str).str.strip(), errors='coerce'
)
plot_df = stratified_sample.assign(sentiment=sentiment_scores).dropna(subset=['sentiment'])

if plot_df.empty:
    print('No numeric sentiment scores available; skipping sentiment plots.')
else:
    # 1. Bar plot for average sentiment score by subreddit
    plt.figure(figsize=(10, 6))
    sns.barplot(x='subreddit', y='sentiment', data=plot_df, estimator=np.mean, ci=None)
    plt.title('Average Sentiment Score by Subreddit')
    plt.xlabel('Subreddit')
    plt.ylabel('Average Sentiment Score')
    plt.show()

    # 2. Box plot for sentiment score distribution by subreddit
    plt.figure(figsize=(12, 8))
    sns.boxplot(x='subreddit', y='sentiment', data=plot_df)
    plt.title('Sentiment Score Distribution by Subreddit')
    plt.xlabel('Subreddit')
    plt.ylabel('Sentiment Score')
    plt.show()

# 3. Word clouds for each subreddit using the extracted keywords
for subreddit, keywords in keywords_by_subreddit.items():
    wordcloud_text = " ".join(keywords)
    try:
        wc = wordcloud.WordCloud(width=800, height=400, background_color='white').generate(wordcloud_text)
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to draw
        print(f'No keywords to plot for /r/{subreddit}; skipping word cloud.')
        continue

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.title(f'Word Cloud for /r/{subreddit}')
    plt.axis('off')
    plt.show()