In [None]:
import pandas as pd
import plotly.express as px

# Read the processed CSV export (comma-separated); malformed lines are
# skipped instead of raising. 'published_at' is then parsed to datetimes,
# with values that cannot be parsed becoming NaT.
df = pd.read_csv(
    "../data/processed/part-00000-55078bd3-eca0-4078-8064-721c27ee24fc-c000.csv.csv",
    sep=",",
    on_bad_lines="skip",
).assign(
    published_at=lambda frame: pd.to_datetime(frame["published_at"], errors="coerce")
)

# Bar chart of the ten rows with the highest 'likes' values, one bar per
# row keyed by author, each bar annotated with the comment text.
top_liked = df.sort_values('likes', ascending=False).iloc[:10]
fig = px.bar(
    top_liked,
    x='author',
    y='likes',
    text='comment',
    labels={'likes': 'Likes', 'author': 'Author'},
    title="Top 10 Most Liked Comments",
)
fig.show()

# Create a histogram for comment length distribution.
# Length is measured in characters of the comment's string form. Missing
# comments are counted as length 0: converting NaN with astype(str) yields
# the literal text "nan", which would otherwise be reported as length 3.
comment_present = df['comment'].notna()
df['comment_length'] = (
    df['comment']
    .astype(str)
    .str.len()
    .where(comment_present, 0)  # overwrite the bogus length of missing rows
    .astype(int)
)
fig = px.histogram(df, x="comment_length", title="Comment Length Distribution", nbins=30)
fig.show()

# Scatter plot of like count against comment length; points are coloured
# by their like count as well.
fig = px.scatter(
    data_frame=df,
    x='comment_length',
    y='likes',
    color='likes',
    labels={'likes': 'Likes', 'comment_length': 'Comment Length'},
    title="Likes vs. Comment Length",
)
fig.show()

# Line chart of how many comments fall on each calendar date.
# The date part of 'published_at' is stored in a new 'date' column, then
# rows are counted per date.
df['date'] = df['published_at'].dt.date
comments_per_day = df.groupby('date').size().reset_index(name='count')
fig = px.line(
    comments_per_day,
    x='date',
    y='count',
    labels={'count': 'Number of Comments', 'date': 'Date'},
    title="Number of Comments Over Time",
)
fig.show()

print("✅ Visualizations completed!")


✅ Visualizations completed!
