# Reddit Data Analysis with Pandas

This notebook provides a template for analyzing Reddit data exported from the pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
PLOT_STYLE, COLOR_PALETTE = 'seaborn-v0_8', 'husl'
plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

In [None]:
# Read the three CSV exports (paths are relative to the working directory)
posts_df, comments_df, all_data_df = (
    pd.read_csv(csv_name)
    for csv_name in ('reddit_posts.csv', 'reddit_comments.csv', 'reddit_all_data.csv')
)

# Quick sanity check on how many rows each file contributed
n_posts, n_comments = len(posts_df), len(comments_df)
print(f'Loaded {n_posts} posts and {n_comments} comments')
print(f'Total items: {len(all_data_df)}')

In [None]:
# Basic data exploration: column/dtype summary followed by a preview of the rows
print('Data Overview:')
# DataFrame.info() writes its summary to stdout and returns None, so it is called
# bare; wrapping it in print() would emit a stray "None" line after the summary.
all_data_df.info()
# The leading newline must be written as an escape sequence: a single-quoted
# string cannot span two physical lines.
print('\nFirst few rows:')
print(all_data_df.head())

In [None]:
# Subreddit analysis: count how many rows each subreddit contributes
subreddit_counts = all_data_df['subreddit'].value_counts()
print('Posts/Comments by Subreddit:')
print(subreddit_counts)

# Bar chart of the same counts, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
subreddit_counts.plot(kind='bar', ax=ax)
ax.set(title='Posts/Comments by Subreddit', xlabel='Subreddit', ylabel='Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Score analysis: three side-by-side panels on a single figure.
# The axes are created up front and passed explicitly to every plotting call.
# DataFrame.boxplot(..., by=...) opens a brand-new figure when no `ax` is given,
# which would leave the middle panel empty and send the following title and
# third panel to the wrong figure.
fig, (score_ax, box_ax, words_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: histogram of scores
all_data_df['score'].hist(bins=30, ax=score_ax)
score_ax.set_title('Score Distribution')
score_ax.set_xlabel('Score')
score_ax.set_ylabel('Frequency')

# Panel 2: score spread per subreddit
all_data_df.boxplot(column='score', by='subreddit', ax=box_ax)
box_ax.set_title('Score by Subreddit')
box_ax.tick_params(axis='x', labelrotation=45)
# Grouped boxplots add a figure-level "Boxplot grouped by ..." suptitle; clear it
# so it does not sit on top of the three panel titles.
fig.suptitle('')

# Panel 3: histogram of word counts
all_data_df['word_count'].hist(bins=30, ax=words_ax)
words_ax.set_title('Word Count Distribution')
words_ax.set_xlabel('Word Count')
words_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Cluster analysis: runs only when the loaded data has a 'cluster_id' column
has_clusters = 'cluster_id' in all_data_df.columns
if has_clusters:
    # Tally rows per cluster, ordered by cluster id rather than by frequency
    cluster_counts = all_data_df['cluster_id'].value_counts().sort_index()
    print('Cluster Distribution:')
    print(cluster_counts)

    fig, ax = plt.subplots(figsize=(12, 6))
    cluster_counts.plot(kind='bar', ax=ax)
    ax.set(title='Items per Cluster', xlabel='Cluster ID', ylabel='Count')
    plt.show()

In [None]:
# Time series analysis: parse timestamps, derive a calendar date, plot rows per day
created_raw = all_data_df['created_utc']
if created_raw.dtype.kind in 'iuf':
    # Numeric timestamps are assumed to be Unix epoch *seconds* (as in Reddit's
    # API) -- TODO confirm against the pipeline's export format. Without
    # unit='s' pandas reads plain numbers as nanoseconds, which would collapse
    # every row onto 1970-01-01.
    all_data_df['created_utc'] = pd.to_datetime(created_raw, unit='s')
else:
    # Strings (or already-parsed datetimes, on a re-run of this cell) go through
    # the default parser.
    all_data_df['created_utc'] = pd.to_datetime(created_raw)
all_data_df['date'] = all_data_df['created_utc'].dt.date

# Number of rows (posts + comments) per calendar day
daily_counts = all_data_df.groupby('date').size()
plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line')
plt.title('Daily Activity')
plt.xlabel('Date')
plt.ylabel('Number of Posts/Comments')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()