In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from scikit_posthocs import posthoc_dunn
from statannotations.Annotator import Annotator
import matplotlib.pyplot as plt
import seaborn as sns

Find average sentiment of comments in each HN story

In [None]:
def average_comments_sentiment(df_comments,df_stories):
    """Return the mean comment sentiment of every story.

    Parameters
    ----------
    df_comments : DataFrame
        Must contain the columns 'story_id' and 'comment_sentiment'.
    df_stories : DataFrame
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    Series
        Mean of 'comment_sentiment', indexed by 'story_id'.
    """
    sentiment_by_story = df_comments.groupby('story_id')['comment_sentiment']
    return sentiment_by_story.mean()

# Load the per-comment and per-story sentiment tables and average the comment
# sentiment of each story.
comments = pd.read_csv('hn_gh_ai_comment_sentiment.csv')
stories = pd.read_csv('hn_gh_ai_story_sentiment.csv')
average = average_comments_sentiment(comments,stories)

# Count distinct ids (not rows) so the number matches what the label claims.
print('unique story ids in hn_gh_ai_story_sentiment.csv: ' + str(stories['discussion_id'].nunique()))
print('unique story ids in hn_gh_ai_comment_sentiment.csv: ' + str(len(average)))


# Stories that have no entry in `average` (no comments) get the sentinel -2,
# which later cells use to filter them out.
average_sentiments = []
for story_id in stories['discussion_id']:
    mean_sentiment = average.get(story_id)  # None when the story has no comments
    average_sentiments.append(-2 if mean_sentiment is None else mean_sentiment)

stories['average_comments_sentiment'] = average_sentiments
# Bucket the average: -2 = no comments (sentinel), -1 = negative (< -0.5),
# 1 = positive (> 0.5), 0 = everything else.
stories['overall_comments_sentiment'] = stories['average_comments_sentiment'].apply(
    lambda x: -2 if x < -1 else (-1 if x < -0.5 else (1 if x > 0.5 else 0))
)
stories.to_csv('sentiments_v3.csv', index=False)

Functions for loading and categorizing CSV files

In [None]:
def load_csv_file(filePath):
    """Read a CSV file and keep one row per story.

    Rows that repeat a 'discussion_id' already seen are dropped; the first
    occurrence is kept (pandas' default for ``drop_duplicates``).

    Parameters
    ----------
    filePath : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    DataFrame
        The de-duplicated table (original row index labels are preserved).
    """
    return pd.read_csv(filePath).drop_duplicates(subset=['discussion_id'])

#Categorize sentiment groups and save them into separate files

def categorizer(df, save_to_file = True):
    """Split stories by 'overall_comments_sentiment', report counts, optionally save.

    Rows with the value -1 / 0 / 1 form the negative / neutral / positive
    groups. Rows with any other value are counted only in the total.

    Parameters
    ----------
    df : DataFrame
        Must contain the column 'overall_comments_sentiment'.
    save_to_file : bool
        When true, each group is written to '<Group>_stories.csv' in the
        current working directory.

    Returns
    -------
    None
    """
    overall = df['overall_comments_sentiment']
    neg, neu, pos = (df[overall == code] for code in (-1, 0, 1))

    print('Categorize result:')
    for label, group in (('Negative', neg), ('Neutral', neu), ('Positive', pos)):
        print(f"{label}: {len(group)}")
    print(f"Total: {len(df)}")

    if not save_to_file:
        return

    pos.to_csv('Positive_stories.csv', index=False)
    neu.to_csv('Neutral_stories.csv', index=False)
    neg.to_csv('Negative_stories.csv', index=False)


Join sentiment file and GitHub metrics file and categorize them based on sentiment

In [None]:
# --- Sentiment table -------------------------------------------------------
senti_df = load_csv_file('sentiments_v3.csv')
senti_df = senti_df[['discussion_id','average_comments_sentiment','overall_comments_sentiment','url']]
print('sentiment_df stories: ' + str(len(senti_df)))
# A row "has comments" when none of its cells equals the -2 sentinel.
print('sentiment_df stories with comment: ' + str(len(senti_df[(senti_df != -2).all(axis=1)])))

# Keep one story per URL: the one with the highest average comment sentiment.
# Computed once and reused for both the report line and the filtered table.
senti_df = senti_df.sort_values('average_comments_sentiment', ascending=False).drop_duplicates(subset=['url'], keep='first')
print('sentiment_df stories (no dupe): ' + str(len(senti_df)))

senti_df = senti_df[(senti_df != -2).all(axis=1)]
print('sentiment_df stories with comment (no dupe): ' + str(len(senti_df)))

# --- GitHub metrics table --------------------------------------------------
metric_df = load_csv_file('hn-stories-gh-ai-metrics-5months-[no-dupes]-v4-monthly-new.csv')
print('metric_df stories: ' + str(len(metric_df)))

# --- Join on repository URL ------------------------------------------------
join_df = metric_df.merge(senti_df.set_index('url'), on='url', how='inner')
print('After join: ' + str(len(join_df)))

# Drop every row that still contains the -2 sentinel in any column.
# `.copy()` makes join_df an independent frame: later cells add columns to it
# in place, which would otherwise trigger SettingWithCopyWarning.
join_df = join_df[(join_df != -2).all(axis=1)].copy()
print('Remove stories with no comment: ' + str(len(join_df)))

join_df.to_csv('join_result-v4.csv', index=False)
#join_df


Calculating functions

In [None]:
def calculate_dist(df, value1, value2):
    """Return the change of a metric between two periods.

    Parameters
    ----------
    df : DataFrame
    value1 : str
        Column holding the earlier value.
    value2 : str
        Column holding the later value.

    Returns
    -------
    Series
        Element-wise ``df[value2] - df[value1]``.
    """
    earlier, later = df[value1], df[value2]
    return later - earlier

def create_dist(df, value1, value2, result_name):
    """Store the distance between two metric columns in a new column.

    Adds (or overwrites) ``df[result_name]`` in place with the result of
    ``calculate_dist(df, value1, value2)``. Returns None.
    """
    distance = calculate_dist(df, value1, value2)
    df[result_name] = distance

def calculate_percentage(df,value1,value2):
    """Return the percent change of a metric between two periods.

    Computes ``(df[value2] / df[value1]) * 100 - 100`` element-wise, so a
    value of 50 means the metric grew by 50 %. No special handling of a zero
    denominator is done here; the result is whatever pandas division yields.
    """
    ratio = df[value2] / df[value1]
    return ratio * 100 - 100

def create_percentage(df,value1,value2, result_name):
    """Store the percent change between two metric columns in a new column.

    Adds (or overwrites) ``df[result_name]`` in place. Undefined results of
    the division (``inf``, ``-inf`` and ``NaN``, e.g. when the earlier value
    is 0) are stored as 0.

    Parameters
    ----------
    df : DataFrame
    value1 : str
        Column holding the earlier value (the denominator).
    value2 : str
        Column holding the later value.
    result_name : str
        Name of the column to create.

    Returns
    -------
    None
    """
    percent_result = calculate_percentage(df,value1,value2)
    # Map +/-inf to NaN first so a single fillna covers every undefined result.
    # (A per-element `x == np.nan` check is not needed and would never match:
    # NaN compares unequal to everything, including itself.)
    df[result_name] = percent_result.replace([np.inf, -np.inf], np.nan).fillna(0)

def create_dist_percent(df,period1, period2):
    """Add distance and percent-change columns for every metric between two periods.

    For each metric in commits, pull_requests, stars, forks and contributors,
    reads the columns '<metric>_<period1>' and '<metric>_<period2>' and adds
    'dist_<metric>(<period1>-<period2>)' and
    'percent_<metric>(<period1>-<period2>)' to ``df`` in place.
    All distance columns are created first, then all percent columns.
    """
    metrics = ('commits', 'pull_requests', 'stars', 'forks', 'contributors')
    span = '(' + period1 + '-' + period2 + ')'

    for metric in metrics:
        create_dist(df, f'{metric}_{period1}', f'{metric}_{period2}', f'dist_{metric}{span}')

    for metric in metrics:
        create_percentage(df, f'{metric}_{period1}', f'{metric}_{period2}', f'percent_{metric}{span}')

def accumulate_metrics(df,metric_of_interest, amount_of_months):
    """Turn per-month values of one metric into running totals, in place.

    After the call, '<metric>_month_<i>' holds
    '<metric>_at_submission' + the original month 1..i values, for every row.
    The '<metric>_at_submission' column itself is left unchanged.

    The running total is computed column-wise for all rows at once, so every
    row is accumulated exactly once even when the same URL appears on several
    rows.

    Note: the function is not idempotent - calling it twice on the same frame
    accumulates the already-accumulated columns again.

    Parameters
    ----------
    df : DataFrame
        Must contain '<metric>_at_submission' and '<metric>_month_1' ..
        '<metric>_month_<amount_of_months>'.
    metric_of_interest : str
        Metric name prefix, e.g. 'stars'.
    amount_of_months : int
        Number of monthly columns to accumulate.

    Side effects
    ------------
    Writes the updated frame to 'join_result-v4.csv' in the working directory.
    """
    running_total = df[metric_of_interest + '_at_submission']
    for month in range(1, amount_of_months + 1):
        column = metric_of_interest + '_month_' + str(month)
        # Plain addition (not cumsum) so a missing value propagates to all
        # later months, as it did with the row-by-row implementation.
        running_total = running_total + df[column]
        df[column] = running_total
    df.to_csv('join_result-v4.csv', index=False)

def accumulate_all_metrics(df, amount_of_months):
    """Run ``accumulate_metrics`` for every tracked metric.

    Metrics are processed in the order stars, forks, commits, pull_requests,
    contributors; ``df`` is modified in place by each call.
    """
    for metric in ('stars', 'forks', 'commits', 'pull_requests', 'contributors'):
        accumulate_metrics(df, metric, amount_of_months)

# Distance / percent change between each pair of consecutive periods.
# These are computed on the raw monthly columns, i.e. before the monthly
# columns are replaced by running totals below.
periods = ['at_submission', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5']
for earlier_period, later_period in zip(periods, periods[1:]):
    create_dist_percent(join_df, earlier_period, later_period)

# Replace the five monthly columns of every metric with running totals.
accumulate_all_metrics(join_df, 5)

Add a `sentiment` column that classifies each story as 'Positive', 'Neutral', or 'Negative' (the original `overall_comments_sentiment` values are 1, 0, and -1, respectively)

In [None]:
# Human-readable sentiment label: 1 -> 'Positive', 0 -> 'Neutral',
# any other value -> 'Negative'.
join_df['sentiment'] = join_df['overall_comments_sentiment'].apply(
        lambda x: 'Positive' if x == 1 else ('Neutral' if x == 0 else 'Negative')
    )

# categorizer() writes one CSV per sentiment group; read them back so the
# following cells can work on each group separately.
categorizer(join_df)
pos = pd.read_csv('Positive_stories.csv')
neu = pd.read_csv('Neutral_stories.csv')
neg = pd.read_csv('Negative_stories.csv')

Calculate and display means of metric values

In [None]:
def calculate_metric_mean(df,metric_of_interest):
    """Return the column means of one metric for all six periods.

    Returns
    -------
    list
        Means of '<metric>_at_submission' followed by '<metric>_month_1' ..
        '<metric>_month_5', in that order.
    """
    periods = ['at_submission'] + ['month_' + str(month) for month in range(1, 6)]
    return [df[metric_of_interest + '_' + period].mean() for period in periods]

def display_metric_mean(df,metric_of_interest):
    """Print the mean of one metric for months 1 to 5, three decimals each.

    The value at submission is not printed.
    """
    print('Mean of ' + metric_of_interest + ' each month:')
    for month in range(1, 6):
        monthly_mean = df[f'{metric_of_interest}_month_{month}'].mean()
        print(f'  Month {month}: {monthly_mean:.3f}')

def display_all_metric_mean(df):
    """Print the monthly means of every tracked metric for one group.

    Metrics are printed in the order commits, contributors, stars, forks,
    pull_requests.
    """
    for metric in ('commits', 'contributors', 'stars', 'forks', 'pull_requests'):
        display_metric_mean(df, metric)

# Print the monthly metric means of each sentiment group, one group after
# another (a blank line separates the groups).
for heading, group_df in (
    ('Positive Group', pos),
    ('\nNeutral Group', neu),
    ('\nNegative Group', neg),
):
    print(heading)
    display_all_metric_mean(group_df)


Display seaborn plot

In [None]:
def sns_box_plot(df, metric_of_interest, save = False):
    """Draw a box plot of one metric for months 1 to 5, split by sentiment.

    The five '<metric>_month_<i>' columns are reshaped to long form, the
    months are labelled 'm 1' .. 'm 5' on the x axis, and boxes are coloured
    by the 'sentiment' column. A point plot is drawn on the same axes. Axis
    labels, legend and title are blanked.

    Parameters
    ----------
    df : DataFrame
        Must contain 'url', 'sentiment' and the five monthly metric columns.
    metric_of_interest : str
        Metric name prefix, e.g. 'stars'.
    save : bool
        When true, the figure is also written to '<metric>.pdf'.
    """
    month_columns = [metric_of_interest + '_month_' + str(month) for month in range(1, 6)]
    short_labels = {
        column: 'm ' + str(month)
        for month, column in enumerate(month_columns, start=1)
    }

    # Long form: one row per (url, month) with the metric value in 'distance'.
    mdf = pd.melt(df, id_vars=['url'], value_vars=month_columns, value_name='distance')
    mdf['variable'] = mdf['variable'].map(short_labels)

    # Attach each story's sentiment label, then drop the join key.
    sentiment_by_url = df[['url', 'sentiment']].set_index('url')
    mdf = mdf.join(sentiment_by_url, on='url').drop('url', axis=1)

    ax = sns.boxplot(
        x=mdf['variable'],
        y=mdf['distance'],
        hue=mdf['sentiment'],
        native_scale=True,
        palette=['skyblue', 'lightgreen', 'tomato'],
        showfliers=False,
        showmeans=True,
    )
    ax = sns.pointplot(
        x=mdf['variable'],
        y=mdf['distance'],
        hue=mdf['sentiment'],
        dodge=.55,
        errorbar=None,
        palette=['skyblue', 'lightgreen', 'tomato'],
        ax=ax,
    )

    ax.set_xlabel(None)
    ax.set_ylabel(None)
    ax.legend([], [], frameon=False)
    plt.title(None)

    if save:
        plt.savefig(metric_of_interest+'.pdf')
    plt.show()

def display_plot_all(save = False):
    """Draw the monthly box plot of every tracked metric from ``join_df``.

    Metrics are plotted in the order commits, pull_requests, stars, forks,
    contributors. ``save`` is forwarded to ``sns_box_plot``.
    """
    for metric in ('commits', 'pull_requests', 'stars', 'forks', 'contributors'):
        sns_box_plot(join_df, metric, save=save)

display_plot_all(save=True)

## Unused code

Kruskal-Wallis test v2

In [None]:
def Kruskal_test(pos_group, neu_group, neg_group, metric_of_interest, print_result = True, return_value=False):
    """Run a Kruskal-Wallis H test on one metric across the three groups.

    Parameters
    ----------
    pos_group, neu_group, neg_group : DataFrame
        Each must contain the column ``metric_of_interest``.
    metric_of_interest : str
        Column to compare.
    print_result : bool
        When true, print the statistic, the p-value and whether the p-value
        is below alpha = 0.05.
    return_value : bool
        When true, return the result as a dict; otherwise return None.

    Returns
    -------
    dict or None
        ``{'statistic': ..., 'p_value': ...}`` when ``return_value`` is true.
    """
    samples = [group[metric_of_interest] for group in (pos_group, neu_group, neg_group)]
    statistic, p_value = stats.kruskal(*samples)

    if print_result:
        below_alpha = 'True' if p_value < 0.05 else 'False'
        print('Result of statistical on ' + metric_of_interest + ":")
        print(f"Statistic result: {statistic}")
        print(f"p-value: {p_value}")
        print(f"< alpha(0.05): {below_alpha}\n")

    if not return_value:
        return None
    return {
        'statistic': statistic,
        'p_value': p_value
    }


In [None]:
def get_molten(df,metric_of_interest,group):
    """Return pairwise Dunn post-hoc p-values in long form.

    Runs ``posthoc_dunn`` on ``df`` (values in ``metric_of_interest``, groups
    in ``group``, p-values adjusted with 'fdr_bh'), blanks the lower triangle
    and the diagonal of the resulting matrix, and melts what is left so each
    remaining row is one pair of groups: columns 'index', 'variable', 'value'.
    """
    dunn_df = posthoc_dunn(
        df, val_col=metric_of_interest, group_col=group, p_adjust='fdr_bh'
    )

    # True on and below the diagonal: those cells are self-comparisons or
    # mirror images of a cell above the diagonal.
    lower_triangle = np.tril(np.ones(dunn_df.shape), k=0).astype("bool")
    upper_only = dunn_df.mask(lower_triangle)

    return upper_only.melt(ignore_index=False).reset_index().dropna()

Apply the statistical test to the plot

In [None]:
def sns_box_plot_statistical(df, metric_of_interest, month_amount = ' (1 month)'):
    """Box plot of one metric per sentiment group, annotated with Dunn p-values.

    Parameters
    ----------
    df : DataFrame
        Must contain 'sentiment' and ``metric_of_interest``.
    metric_of_interest : str
        Column to plot on the y axis.
    month_amount : str
        Suffix appended to the plot title.
    """
    ax = sns.boxplot(
        x=df["sentiment"],
        y=df[metric_of_interest],
        native_scale=True,
        palette=['lightgreen', 'skyblue', 'tomato'],
        showfliers=False,
    )
    plt.title(metric_of_interest + ' after HN submission' + month_amount)

    # One row per pair of sentiment groups with its adjusted p-value.
    molten_df = get_molten(df,metric_of_interest,'sentiment')
    pairs = []
    p_values = []
    for _, row in molten_df.iterrows():
        pairs.append((row["index"], row["variable"]))
        p_values.append(row["value"])

    annotator = Annotator(ax, pairs, data=df, x="sentiment", y=metric_of_interest)
    annotator.configure(text_format="star",loc="inside")
    annotator.set_pvalues_and_annotate(p_values)

    plt.tight_layout()
    plt.show()

# NOTE(review): the percent columns created by create_dist_percent are named
# like 'percent_commits(at_submission-month_1)'; the bare names used here are
# presumably expected to come from the metrics CSV - confirm before running.
for percent_column in ("percent_commits", "percent_forks"):
    sns_box_plot_statistical(join_df, percent_column)

Display plot (Non-sns)

In [None]:
def box_plot(pos_,neu_,neg_, metric_of_interest):
    """Draw a matplotlib box plot of one metric for the three sentiment groups.

    Boxes are ordered and labelled Positive, Neutral, Negative and filled
    with light green, sky blue and tomato respectively.
    """
    groups = (
        ('Positive', 'lightgreen', pos_),
        ('Neutral', 'skyblue', neu_),
        ('Negative', 'tomato', neg_),
    )
    labels = [label for label, _, _ in groups]
    colors = [color for _, color, _ in groups]
    senti_groups = [frame[metric_of_interest] for _, _, frame in groups]

    fig, ax = plt.subplots()
    fig.suptitle( metric_of_interest + ' after HN submission')
    ax.set_ylabel(metric_of_interest)

    # patch_artist=True makes the boxes fillable; tick_labels names the x ticks.
    bplot = ax.boxplot(senti_groups, patch_artist=True, tick_labels=labels)

    for box, color in zip(bplot['boxes'], colors):
        box.set_facecolor(color)

    plt.show()

Normality Test (Shapiro-Wilk)

In [None]:
def test_normality(data):
    statistic, p_value = stats.shapiro(data)
    #print("Normality Test Results:")
    print(f"Is normal distribution? {'Yes' if p_value > 0.05 else 'No'}")
    print(f"P-value: {p_value}\n")
    return {
        'statistic': statistic,
        'p_value': p_value,
        'is_normal': p_value > 0.05
    }

#rng = np.random.default_rng()
#x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)

# Shapiro-Wilk test on every raw metric column of join_df.
# `normality_result` ends up holding the result of the last metric tested.
for metric in (
    'stars',
    'forks',
    'total_commits',
    'total_issues',
    'total_contributors',
    'total_prs',
    'pr_contributors',
):
    print("Normality test for " + metric)
    normality_result = test_normality(join_df[metric])



Old pyplot code to display frequency on raw metric values

In [None]:
def plot_distribution(metric_of_interest):
    """Show a 100-bin histogram of one column of the global ``join_df``.

    The column name is used as the figure title.
    """
    figure, axis = plt.subplots(sharey=True, tight_layout=True)
    axis.hist(join_df[metric_of_interest], bins=100)
    figure.suptitle(metric_of_interest)
    plt.show()

# Histogram of every raw metric column.
for metric in (
    'stars',
    'forks',
    'total_commits',
    'total_issues',
    'total_contributors',
    'total_prs',
    'pr_contributors',
):
    plot_distribution(metric)

Statistical test function (Kruskal Wallis H Test) V1

Statistical tests between 3 sentiment groups (positive, neutral, negative) on each GitHub metric

In [None]:
# Reload the three sentiment groups written by categorizer().
pos = pd.read_csv('Positive_stories.csv')
neu = pd.read_csv('Neutral_stories.csv')
neg = pd.read_csv('Negative_stories.csv')

# Kruskal-Wallis H test between the three groups, one raw metric at a time.
for metric in (
    'stars',
    'forks',
    'total_commits',
    'total_issues',
    'total_prs',
    'total_contributors',
    'pr_contributors',
):
    Kruskal_test(pos, neu, neg, metric)

Display plot of repos frequency on raw metric values with sentiment grouping v1

In [None]:
def plot_metric(pos, neu, neg, metric_of_interest):
    """Show one histogram of the metric per sentiment group on a 2x2 grid.

    Positive (green) and Neutral (blue) share the top row and Negative (red)
    is bottom-left; the bottom-right axes is left empty.

    Note: a later cell in this notebook defines another ``plot_metric`` with
    the same signature, which replaces this one once that cell has run.
    """
    fig, axs = plt.subplots(nrows=2, ncols=2)

    panels = (
        ((0, 0), pos, 'g', 'Positive'),
        ((0, 1), neu, 'b', 'Neutral'),
        ((1, 0), neg, 'r', 'Negative'),
    )
    for position, group, face_color, title in panels:
        axis = axs[position]
        axis.hist(group[metric_of_interest], facecolor=face_color, alpha=0.75)
        axis.set_title(title)

    fig.suptitle(metric_of_interest)
    fig.tight_layout(pad=1.1)

    plt.show()

Display plot of repos frequency on raw metric values with sentiment grouping v2

In [None]:
def plot_metric(pos, neu, neg, metric_of_interest):
    """Draw a box plot of one metric for the three sentiment groups.

    Boxes are ordered and labelled Positive, Neutral, Negative and filled
    with light green, sky blue and coral respectively. The y axis is
    labelled with the metric name.
    """
    groups = (
        ('Positive', 'lightgreen', pos),
        ('Neutral', 'skyblue', neu),
        ('Negative', 'coral', neg),
    )
    labels = [label for label, _, _ in groups]
    colors = [color for _, color, _ in groups]
    metric = [frame[metric_of_interest] for _, _, frame in groups]

    fig, ax = plt.subplots()
    ax.set_ylabel(metric_of_interest)

    bplot = ax.boxplot(metric, patch_artist=True, tick_labels=labels)

    # Fill each box with its group colour.
    for box, color in zip(bplot['boxes'], colors):
        box.set_facecolor(color)
    plt.show()

In [None]:
# Per-group plot of every raw metric column.
for metric in (
    'stars',
    'forks',
    'total_commits',
    'total_issues',
    'total_contributors',
    'total_prs',
    'pr_contributors',
):
    plot_metric(pos, neu, neg, metric)

In [None]:
# ANOSIM test example (toy data, independent of the analysis above)

import numpy as np
from skbio.stats.distance import anosim
from skbio.stats.distance import DistanceMatrix
from skbio import io

# Four samples; the first two belong to Group1, the last two to Group2.
sample_ids = ['Sample' + str(number) for number in range(1, 5)]
labels = ['Group1', 'Group1', 'Group2', 'Group2']

# Square, symmetric distance matrix: element [i, j] is the distance between
# sample i and sample j, so the diagonal is zero.
distance_data = np.array([
    [0.0, 1.0, 0.5, 1.5],
    [1.0, 0.0, 1.2, 1.8],
    [0.5, 1.2, 0.0, 1.3],
    [1.5, 1.8, 1.3, 0.0],
])

# Wrap the raw matrix so scikit-bio knows which id each row/column refers to.
dist_matrix = DistanceMatrix(distance_data, ids=sample_ids)

# Run the ANOSIM test of the grouping against the distances and show the result.
anosim_result = anosim(dist_matrix, labels)
print(anosim_result)
#print("ANOSIM Statistic (R):", anosim_result.statistic)
#print("p-value:", anosim_result.p_value)
