# setup

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, mannwhitneyu, kruskal
from datetime import datetime

# Load your data
HN_STORIES_GH_CSV = '../../data/sampled_full_hn_comments.csv' # after converting raw json to raw csv
df = pd.read_csv(HN_STORIES_GH_CSV)

# Convert dates to datetime objects
df['discussion_date'] = pd.to_datetime(df['discussion_date'])
df['comment_date'] = pd.to_datetime(df['comment_date'])

# Add a "sentiment_period" column to differentiate before and after ChatGPT release
chatgpt_release = datetime(2022, 11, 30)
df['sentiment_period'] = np.where(df['comment_date'] < chatgpt_release, 'Before ChatGPT', 'After ChatGPT')

# Check the data structure
df.head()


Unnamed: 0,discussion_id,title,url,discussion_date,comment_id,parent_id,depth,comment_text,comment_date,comment_author,sentiment_period
0,31533180,Codeball – AI-powered code review,https://codeball.ai/,2022-05-28 01:38:48,31534424,,0,Explanation of results for non-ML folks (resul...,2022-05-28 03:19:40,apugoneappu,Before ChatGPT
1,31533180,Codeball – AI-powered code review,https://codeball.ai/,2022-05-28 01:38:48,31535661,,0,I'm a bit skeptical here. We should ask the qu...,2022-05-28 05:30:51,gombosg,Before ChatGPT
2,31533180,Codeball – AI-powered code review,https://codeball.ai/,2022-05-28 01:38:48,31535295,,0,I would never use something like this. Seems t...,2022-05-28 04:44:55,donkarma,Before ChatGPT
3,31533180,Codeball – AI-powered code review,https://codeball.ai/,2022-05-28 01:38:48,31535829,,0,I think I like this better expressed as a lint...,2022-05-28 05:49:39,mchusma,Before ChatGPT
4,31533180,Codeball – AI-powered code review,https://codeball.ai/,2022-05-28 01:38:48,31533864,,0,"Creator of Codeball here, somebody beat us to ...",2022-05-28 02:32:12,videlov,Before ChatGPT


descriptive statistics and visualization

In [3]:
# Count plot for sentiment distribution
plt.figure(figsize=(10, 5))
sns.countplot(x='model_prediction', data=df, palette='coolwarm')
plt.title('Overall Sentiment Distribution')
plt.savefig('sentiment_distribution.png')

# Sentiment distribution before and after ChatGPT
plt.figure(figsize=(10, 5))
sns.countplot(x='model_prediction', hue='sentiment_period', data=df, palette='coolwarm')
plt.title('Sentiment Distribution Before and After ChatGPT Release')
plt.savefig('sentiment_distribution_before_after.png')


ValueError: Could not interpret value `model_prediction` for `x`. An entry with this name does not appear in `data`.

<Figure size 1000x500 with 0 Axes>

time series plot of average sentiment over time

In [None]:
# Create a column for average sentiment per week
df['week'] = df['comment_date'].dt.to_period('W')
weekly_sentiment = df.groupby('week')['model_prediction'].mean().reset_index()
weekly_sentiment['week'] = weekly_sentiment['week'].dt.start_time

# Plot average sentiment over time
plt.figure(figsize=(12, 6))
sns.lineplot(x='week', y='model_prediction', data=weekly_sentiment, marker='o')
plt.axvline(chatgpt_release, color='red', linestyle='--', label='ChatGPT Release')
plt.title('Weekly Average Sentiment Over Time')
plt.xlabel('Date')
plt.ylabel('Average Sentiment')
plt.legend()
plt.savefig('weekly_avg_sentiment.png')


normality test

In [None]:
# Shapiro-Wilk Test for normality
before_chatgpt = df[df['sentiment_period'] == 'Before ChatGPT']['model_prediction']
after_chatgpt = df[df['sentiment_period'] == 'After ChatGPT']['model_prediction']

shapiro_before = shapiro(before_chatgpt)
shapiro_after = shapiro(after_chatgpt)

print("Shapiro-Wilk Test - Before ChatGPT:", shapiro_before)
print("Shapiro-Wilk Test - After ChatGPT:", shapiro_after)


comparing sentiment before and after chatgpt release

In [None]:
# Mann-Whitney U Test for sentiment difference before/after ChatGPT release
u_stat, p_value = mannwhitneyu(before_chatgpt, after_chatgpt, alternative='two-sided')
print(f"Mann-Whitney U Test:\n U statistic = {u_stat}, p-value = {p_value}")


yearly sentiment comparison (kruskal-wallis test)

In [None]:
# Create a year column for yearly analysis
df['year'] = df['comment_date'].dt.year
yearly_sentiment = df.groupby('year')['model_prediction'].mean()

# Kruskal-Wallis test across years
years = [2022, 2023, 2024]
sentiments_by_year = [df[df['year'] == year]['model_prediction'] for year in years]
kruskal_stat, kruskal_p = kruskal(*sentiments_by_year)
print(f"Kruskal-Wallis Test across years:\n Statistic = {kruskal_stat}, p-value = {kruskal_p}")

# Bar plot for yearly average sentiment
plt.figure(figsize=(8, 5))
yearly_sentiment.plot(kind='bar', color='skyblue')
plt.title('Yearly Average Sentiment')
plt.ylabel('Average Sentiment')
plt.savefig('yearly_avg_sentiment.png')


rolling mean for sentiment

In [None]:
# Calculate and plot rolling mean for sentiment
weekly_sentiment['rolling_mean'] = weekly_sentiment['model_prediction'].rolling(window=4).mean()

plt.figure(figsize=(12, 6))
sns.lineplot(x='week', y='rolling_mean', data=weekly_sentiment, marker='o', color='purple')
plt.axvline(chatgpt_release, color='red', linestyle='--', label='ChatGPT Release')
plt.title('Weekly Rolling Mean Sentiment (4-week window)')
plt.xlabel('Date')
plt.ylabel('Rolling Mean Sentiment')
plt.legend()
plt.savefig('rolling_mean_sentiment.png')
