# setup

In [None]:
# Notebook-wide configuration constants.
NUM_KIDS = 100 # per story (presumably a cap on child comments handled per story; TODO confirm against the fetching code)

# JSON progress files; not referenced in the visible cells, so their exact
# contents should be confirmed against the code that writes them.
CHATGPT_PROGRESS_FILENAME = './chatgpt_progress.json'
CHATGPT_COMMENTS_PROGRESS_FILENAME = './chatgpt_comments_progress.json'
# CSV loaded with pd.read_csv by the sentiment-analysis cells in this notebook;
# those cells read the columns 'negative', 'neutral', 'positive', 'post_text',
# 'title' and 'discussion_id' from it.
CHATGPT_COMMENTS_SENTIMENT_FILENAME = './test_sentiment.csv'

# Keyword file path (not read in the visible cells).
KEYWORDS_FILENAME = '../ai_keywords.txt'

# chatgpt_gh_filename = 'github_links_chatgpt.json'

# Define the base URL for the Hacker News API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# Item-id bounds kept for reference (currently disabled):
# CHATGPT_RELEASE_ID = 33804874
# START_ID = 31300000 # may 8th 2022
# END_ID = 40300000 # may 9th 2024

DEPTH = 2 # comments depth (presumably how many reply levels are traversed; TODO confirm)

# aggregating comments into discussions' sentiments

## methods for aggregation
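- classify each comment as positive/negative/neutral by its highest score, count the labels within each discussion, then choose the most frequent label

- average the positive/negative/neutral scores over each discussion's comments, then choose the label with the highest average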

## from chatgpt

In [None]:
import pandas as pd
import numpy as np

# Read the per-comment sentiment CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=CHATGPT_COMMENTS_SENTIMENT_FILENAME)

# Predicate: does this row carry all three sentiment scores?
def has_sentiment(row):
    """Return True only if 'negative', 'neutral' and 'positive' are all non-missing in ``row``."""
    for label in ('negative', 'neutral', 'positive'):
        if pd.isna(row[label]):
            # First missing score decides the answer; later labels are not looked up
            return False
    return True

# Method 1: Highest count of majority class
def method_1(group):
    """Label a discussion by majority vote over its comments.

    Every comment that has all three sentiment scores votes for its
    highest-scoring class; the class with the most votes is returned.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows (comments) of one discussion. Must contain the columns
        'negative', 'neutral' and 'positive'.

    Returns
    -------
    str
        'negative', 'neutral' or 'positive', or 'insufficient_data' when no
        row has all three scores.
    """
    sentiment_cols = ['negative', 'neutral', 'positive']
    # Vectorised NaN filter: keeps exactly the rows where all three scores are
    # present, without a Python-level callback per row.
    valid_scores = group[sentiment_cols].dropna()
    if valid_scores.empty:
        return 'insufficient_data'
    votes = valid_scores.idxmax(axis=1)
    # value_counts() sorts by descending frequency, so the first index entry
    # is the most common label.
    return votes.value_counts().index[0]

# Method 2: Highest average percentage
def method_2(group):
    """Label a discussion by the class with the highest mean score.

    The 'negative', 'neutral' and 'positive' scores are averaged over all
    comments that have all three scores; the class with the largest mean
    is returned.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows (comments) of one discussion. Must contain the columns
        'negative', 'neutral' and 'positive'.

    Returns
    -------
    str
        'negative', 'neutral' or 'positive', or 'insufficient_data' when no
        row has all three scores.
    """
    sentiment_cols = ['negative', 'neutral', 'positive']
    # Drop incomplete rows first so a partially-scored comment cannot
    # contribute to some class means but not others.
    valid_scores = group[sentiment_cols].dropna()
    if valid_scores.empty:
        return 'insufficient_data'
    return valid_scores.mean().idxmax()

# Method 3: Weighted average
def method_3(group):
    """Label a discussion by the class with the highest length-weighted mean score.

    Each comment's scores are weighted by the character length of its
    'post_text', so longer comments count for more.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows (comments) of one discussion. Must contain the columns
        'negative', 'neutral', 'positive' and 'post_text'.

    Returns
    -------
    str
        'negative', 'neutral' or 'positive', or 'insufficient_data' when no
        row has all three scores or when the usable rows have no text at all
        (total weight of zero).
    """
    sentiment_cols = ['negative', 'neutral', 'positive']
    valid_rows = group.dropna(subset=sentiment_cols)
    if valid_rows.empty:
        return 'insufficient_data'
    # Missing / non-string text gets weight 0 instead of NaN, and this also
    # works when the whole column is NaN (where the .str accessor would raise).
    weights = valid_rows['post_text'].map(
        lambda text: len(text) if isinstance(text, str) else 0
    )
    total_weight = weights.sum()
    if total_weight == 0:
        # Nothing to weight by; avoid a 0/0 division that yields all-NaN averages
        return 'insufficient_data'
    # Row-wise multiplication aligned on the index; indexing a Series with
    # [:, np.newaxis] is not supported by current pandas.
    weighted_avg = valid_rows[sentiment_cols].mul(weights, axis=0).sum() / total_weight
    return weighted_avg.idxmax()

# Apply methods
# Each method needs the whole per-discussion DataFrame (all three score columns,
# plus 'post_text' for method 3). A dict passed to .agg() hands every function a
# single column as a Series instead, which made the methods fail with
# "has_sentiment() got an unexpected keyword argument 'axis'". Running them
# through groupby.apply gives each call the full group.
grouped = df.groupby('discussion_id')
# Selecting the input columns explicitly keeps the grouping key out of the
# frames passed to the methods.
method_inputs = grouped[['negative', 'neutral', 'positive', 'post_text']]
results = pd.DataFrame({
    'title': grouped['title'].first(),
    'sentiment_method_1': method_inputs.apply(method_1),
    'sentiment_method_2': method_inputs.apply(method_2),
    'sentiment_method_3': method_inputs.apply(method_3),
})

print(results.head())

# Optional: Count discussions with insufficient data
# Only the method columns are compared; 'title' is free text, not a label.
method_columns = ['sentiment_method_1', 'sentiment_method_2', 'sentiment_method_3']
insufficient_data_count = (results[method_columns] == 'insufficient_data').sum()
print("\nDiscussions with insufficient data:")
print(insufficient_data_count)

TypeError: has_sentiment() got an unexpected keyword argument 'axis'

# checking comments sentiment distribution significance

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, chisquare

# Read the per-comment sentiment CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=CHATGPT_COMMENTS_SENTIMENT_FILENAME)

# Predicate: does this row carry all three sentiment scores?
def has_sentiment(row):
    """Return True only if 'negative', 'neutral' and 'positive' are all non-missing in ``row``."""
    score_labels = ('negative', 'neutral', 'positive')
    # any() stops at the first missing score, checking labels in the order above
    return not any(pd.isna(row[label]) for label in score_labels)

# Filter out rows with missing sentiment data
# dropna(subset=...) keeps exactly the rows where all three scores are present;
# it is vectorised and also behaves predictably on an empty frame.
df_valid = df.dropna(subset=['negative', 'neutral', 'positive'])

# Pick the sentiment label with the largest score in a row
def get_majority_sentiment(row):
    """Return whichever of 'negative', 'neutral', 'positive' has the highest value in ``row``.

    On ties the label that comes first in that order is returned.
    """
    labels = iter(('negative', 'neutral', 'positive'))
    winner = next(labels)
    top_score = row[winner]
    for label in labels:
        score = row[label]
        # Strictly greater, so an earlier label keeps the win on a tie
        if score > top_score:
            winner, top_score = label, score
    return winner

# Calculate observed frequencies
# Each comment is labelled with its highest-scoring class. Reindexing fixes the
# category order (value_counts sorts by frequency) and keeps a class that never
# occurs as an explicit 0 so observed and expected always line up.
sentiment_labels = ['negative', 'neutral', 'positive']
observed = (
    df_valid[sentiment_labels]
    .idxmax(axis=1)
    .value_counts()
    .reindex(sentiment_labels, fill_value=0)
)

# Calculate total number of comments
total_comments = observed.sum()
if total_comments == 0:
    raise ValueError("No comments with complete sentiment scores; cannot run the chi-square test")

# Calculate expected frequencies (assuming equal distribution)
expected = pd.Series(total_comments / len(sentiment_labels), index=sentiment_labels)

# Perform chi-square test
# This is a one-sample goodness-of-fit question (observed counts vs. a
# hypothesised uniform distribution), so scipy.stats.chisquare is the right
# test. chi2_contingency would treat the hypothesised counts as a second data
# sample and test independence of a 2x3 table instead.
chi2, p_value = chisquare(f_obs=observed.to_numpy(), f_exp=expected.to_numpy())
dof = len(sentiment_labels) - 1

print("Observed frequencies:")
print(observed)
print("\nExpected frequencies:")
print(expected)
print(f"\nChi-square statistic: {chi2}")
print(f"p-value: {p_value}")

# Interpret the results
alpha = 0.05  # significance level
if p_value < alpha:
    print("\nThe sentiment distribution is significantly different from random (reject null hypothesis)")
else:
    print("\nThe sentiment distribution is not significantly different from random (fail to reject null hypothesis)")

# Optional: Calculate effect size (Cramer's V)
# Goodness-of-fit form: V = sqrt(chi2 / (n * (k - 1))) with k categories.
n = total_comments
min_dim = dof  # k - 1
cramer_v = np.sqrt(chi2 / (n * min_dim))
print(f"\nEffect size (Cramer's V): {cramer_v}")

Observed frequencies:
neutral     5823
negative    4082
positive    2076
Name: count, dtype: int64

Expected frequencies:
[[4908.33333333 4037.83333333 3034.83333333]
 [4908.33333333 4037.83333333 3034.83333333]]

Chi-square statistic: 947.7347279114595
p-value: 1.592277213018004e-206

The sentiment distribution is significantly different from random (reject null hypothesis)

Effect size (Cramer's V): 0.28125280555392784
