In [None]:
# Placeholders for externally supplied values (presumably injected when the notebook is
# executed -- TODO confirm). Later cells read input_files["cross_period_root"] and the optional
# "categorization_settings" / "cross_period_settings" entries of distribution_parameters.
input_files = {}
distribution_parameters = {}
# WARNING: When re-running the notebook for an audit, change the injected path below to
# "./output_praiseDist_test.ipynb", then go to "Cell > Run all". This only works for the notebook in
# "distribution_results/round ?/results/analysis_outputs/output_general_RAD_report.ipynb"

In [None]:
# Keyword patterns (one regular-expression alternation per category) used to tag praise reasons.
# The built-in defaults are used only when the settings are missing from
# distribution_parameters; any other error is no longer silently swallowed.
try:
    categ_keywords = distribution_parameters['categorization_settings']['type_keywords']
except (KeyError, TypeError):
    categ_keywords = {
        'attendance': 'join|attend|show up|coming to',
        'discussion': 'question|ask|discuss|discussion|insight|participat|feedback|observations|forum|comment|share',
        'work': 'work|writing|hack|edit|studying|research|drafting|mediat|recording|taking notes|note taking|develop|analysis',
        'lead': 'host|lead|initiat|organizing|organizer|steward|forming|facilitate|managing',
        'share': 'share|spread',
        'social media': 'twitter|tweet|retweet|socials|social media',
        'hardskills': 'design|bug fixes|deploy|prototype|coding|python|javascript|solidity|data science',
        'self-care': 'vacation|time off|time-off|holiday|sabbatical|day off',
        'IRL': 'trip|DAOist|ETHcc|barcelona|denver|paris|amsterdam|colombia',
    }


In [None]:
# Reporting window: how many weeks to cover and how many weeks each period spans.
# Defaults apply only when the cross-period settings are missing; other errors surface.
try:
    cross_period_settings = distribution_parameters["cross_period_settings"]
    NUMBER_OF_WEEKS = cross_period_settings["cross_period_week_num"]
    STEP_SIZE = cross_period_settings["cross_period_step_size"]
except (KeyError, TypeError):
    NUMBER_OF_WEEKS = 4
    STEP_SIZE = 1
    print(f'Using default time period: the most recent {NUMBER_OF_WEEKS} weeks, looking into every {STEP_SIZE} week.')

In [None]:

import os
import sys

import pandas as pd 
import numpy as np 
from collections import OrderedDict
from natsort import natsorted
from datetime import datetime, timedelta

import holoviews as hv
from holoviews import opts
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt

import base64
from IPython.display import HTML
from IPython.display import Markdown as md

import scrapbook as sb

from re import search

#adding directories for the analysis tool. this is mainly for when we re-run the notebook 
dir2 = os.path.abspath('../../../../../rad/analysis_tools')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path: sys.path.append(dir1)
from analysis_tools.module_libraries import praise_analysis_module as praise_tools


In [None]:
def split_into_weeks(data, num_weeks, step_size):
    """Slice praise data into consecutive periods counted back from the newest entry.

    The covered range ends at the latest value of the 'DATE' column and starts
    ``num_weeks`` weeks earlier. It is cut into periods of ``step_size`` weeks.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'DATE' column that ``pd.to_datetime`` can parse. The
        caller's frame is not modified.
    num_weeks : int
        Total number of weeks to cover.
    step_size : int
        Length of each period in weeks.

    Returns
    -------
    (dict, list)
        A dict mapping a period label ("Week 1", "Week 1 + step_size", ...) to the
        rows that fall inside that period, and the list of labels in chronological
        order. As before, each frame carries the pre-sort index in a leading
        "index" column created by ``reset_index()``.
    """
    # Parse the dates first so the sort is chronological even for non-ISO date strings.
    data = (
        data.assign(DATE=pd.to_datetime(data["DATE"]))
        .sort_values(by="DATE", ascending=True)
        .reset_index()
    )

    # max() ignores missing dates; reading the last sorted row would yield NaT as soon
    # as a single date is missing (NaT sorts last) and leave every period empty.
    first_date = data["DATE"].max() - timedelta(weeks=num_weeks)

    roundname_list = []
    rounds_df = {}
    for week in range(0, num_weeks, step_size):
        week_id = "Week " + str(week + 1)
        roundname_list.append(week_id)
        start_date = first_date + timedelta(weeks=week)
        end_date = first_date + timedelta(weeks=week + step_size)

        # Periods are (start, end] so a row sitting exactly on a boundary is counted
        # once; only the very first period also includes its start.
        if week == 0:
            after_start = data["DATE"] >= start_date
        else:
            after_start = data["DATE"] > start_date
        rounds_df[week_id] = data.loc[after_start & (data["DATE"] <= end_date)]

    return rounds_df, roundname_list
    

In [None]:
# Load the raw CSV exports of every round found below the cross-period root folder.
datadir = input_files["cross_period_root"]

# Keep sub-directories only. A new list is built instead of removing entries from the
# list while iterating over it, which used to skip the entry following a stray file.
foldername_list = [
    entry for entry in natsorted(os.listdir(datadir))
    if os.path.isdir(f'{datadir}/{entry}')
]
rounds = len(foldername_list)

round_frames = []
dist_frames = []
for round_name in foldername_list:
    round_df = pd.read_csv(f'{datadir}/{round_name}/distribution_results/raw_csv_exports/praise_outliers.csv')
    dist_df = pd.read_csv(f'{datadir}/{round_name}/distribution_results/raw_csv_exports/extended_praise_data.csv')

    # Tag every row with the round it came from (note the two different column
    # spellings, kept as they were).
    round_frames.append(round_df.assign(**{'QUANT ROUND': round_name}))
    dist_frames.append(dist_df.assign(QUANT_ROUND=round_name))

# One concatenation per table instead of accumulating row by row.
allrounds_df = pd.concat(round_frames) if round_frames else pd.DataFrame()
allrounds_finaldist = pd.concat(dist_frames) if dist_frames else pd.DataFrame()

In [None]:

# Remove the "index" column when present (errors="ignore" keeps the cell from
# crashing when it is absent) and mark a missing fourth quantifier as "None".
allrounds_df = allrounds_df.drop(columns="index", errors="ignore")
allrounds_df = allrounds_df.fillna(value={"QUANTIFIER 4 USERNAME": "None"})

# Keep an unsplit copy for the per-round and categorization analyses below.
master_allrounds = allrounds_df.copy()

# From here on allrounds_df is a dict: period label -> praise rows of that period.
week_df, roundname_list = split_into_weeks(allrounds_df, NUMBER_OF_WEEKS, STEP_SIZE)
allrounds_df = week_df



In [None]:

# Total rewards per receiver for every period, sorted from highest to lowest.
finaldist_weekly, roundname_list = split_into_weeks(allrounds_finaldist, NUMBER_OF_WEEKS, STEP_SIZE)
for period in roundname_list:
    # Only the token column is summed. Including the datetime 'DATE' column in a
    # groupby sum raises a TypeError on pandas >= 2.0.
    period_dist = (
        finaldist_weekly[period]
        .groupby(['TO USER ACCOUNT ID', 'TO USER ACCOUNT'])['TOKEN TO RECEIVE']
        .sum()
        .reset_index()
        .rename(columns={'TOKEN TO RECEIVE': 'PRAISE REWARDS'})
        .sort_values(by='PRAISE REWARDS', ascending=False)
        .reset_index(drop=True)
    )
    finaldist_weekly[period] = period_dist

# From here on allrounds_finaldist is a dict: period label -> reward table.
allrounds_finaldist = finaldist_weekly


# Cross-Period Analysis Report
This report aims to offer a perspective on the activity inside the praise system over several rounds.

In [None]:
# Render the configured reporting window as part of the report text.
coverage_note = (
    f"This report will cover <b>{NUMBER_OF_WEEKS}</b> weeks, "
    f"divided into blocks of <b>{STEP_SIZE}</b> weeks each."
)
md(coverage_note)


# General Statistics
The full range will be subdivided into the following periods:

In [None]:
# One row per period label; the statistics columns are added by the following cells.
round_stats = pd.DataFrame(index=list(allrounds_df))

In [None]:
# First and last praise date of every period, truncated to the first 10 characters
# of its string form.
period_frames = [allrounds_df[round_name] for round_name in roundname_list]
round_stats['period_start_time'] = [str(frame['DATE'].min())[:10] for frame in period_frames]
round_stats['period_end_time'] = [str(frame['DATE'].max())[:10] for frame in period_frames]

In [None]:
# Display the table of periods with their start and end dates.
round_stats

## Praise Involvement

### How much praise? 
This graph shows the trend of total number of praise instances across time.

In [None]:
# Number of praise rows in each period, plotted against the period start date.
round_stats['total_praise'] = [allrounds_df[round_name].shape[0] for round_name in roundname_list]
px.line(round_stats, x='period_start_time', y='total_praise', markers=True)


### How many people give and receive praise?
Counting the unique IDs of praise givers and receivers, we can visualize the change across time. In the figure, the blue line represents the number of praise receivers and the red line the number of givers.

In [None]:
# Count distinct receiver and giver accounts per period.
receiver_counts = []
giver_counts = []
for round_name in roundname_list:
    period_praise = allrounds_df[round_name]
    receiver_counts.append(len(np.unique(period_praise['TO USER ACCOUNT'])))
    giver_counts.append(len(np.unique(period_praise['FROM USER ACCOUNT'])))
round_stats['total_praise_receiver'] = receiver_counts
round_stats['total_praise_giver'] = giver_counts

participant_columns = ['total_praise_receiver', 'total_praise_giver']
px.line(round_stats, x='period_start_time', y=participant_columns, markers=True, title='total praise giver and receiver')


## Quantifier Involvement
Showing how many quantifiers are involved in each round.

In [None]:
# This metric has to be kept on a per-round basis, since quantifiers are assigned in that rhythm.
try:
    quant_stats = pd.DataFrame()
    quant_stats["quant_round"] = foldername_list
    # Distinct values found across all columns whose name contains 'QUANTIFIER', per round.
    quant_stats['total_quantifier'] = [len(np.unique(master_allrounds[master_allrounds['QUANT ROUND'] == round_name].filter(like='QUANTIFIER'))) for round_name in foldername_list]
    # A figure created inside a try block is not the cell's last expression, so it is
    # never rendered automatically; show it explicitly.
    px.line(quant_stats,x="quant_round",y=['total_quantifier'],markers=True,title='total quantifiers').show()
except Exception as quant_error:
    # Best effort: keep the report running, but say what went wrong.
    print(f"Exception encountered while processing quantifiers: {quant_error!r}")

### Quantifier trend

In [None]:
# Build the quantifier analysis helper from the unsplit praise table (best effort).
try:
    pr=praise_tools.praise_quantifier(master_allrounds)
except Exception as quant_error:
    # Keep the report running, but report the cause instead of hiding it.
    print(f"Exception encountered while processing quantifiers: {quant_error!r}")

### average score displacement: tendency to under/over-estimate?

In [None]:
# Plot the mean score displacement (best effort; `pr` is undefined when the
# cell that builds it failed, which is reported here as well).
try:
    pr.plot_mean_displacement()
except Exception as quant_error:
    print(f"Exception encountered while processing quantifiers: {quant_error!r}")

### average score correlation coefficient: how much do I agree with other people?

In [None]:
# Plot the score correlation coefficient (best effort; `pr` is undefined when the
# cell that builds it failed, which is reported here as well).
try:
    pr.plot_coefficient()
except Exception as quant_error:
    print(f"Exception encountered while processing quantifiers: {quant_error!r}")

# System Health Evaluation


## Number of new Giveth members involved in praise (either giving or receiving)
Counting the round-by-round change of unique IDs being either praise giver or praise receiver.

In [None]:
# Set of every identifier seen in a period: all values of the columns whose name
# contains 'ACCOUNT', plus those of the columns whose name contains 'QUANTIFIER'.
try:
    round_stats['round_user_list'] = [set(np.unique(allrounds_df[round_name].filter(like='ACCOUNT')))
            .union(set(np.unique(allrounds_df[round_name].filter(like='QUANTIFIER')))) for round_name in roundname_list]
except Exception as quant_error:
    # Fall back to the account columns only, and report why the full set failed.
    print(f"Error in quantifiers, skipping them for analysis: {quant_error!r}")
    round_stats['round_user_list'] = [set(np.unique(allrounds_df[round_name].filter(like='ACCOUNT'))) for round_name in roundname_list]
    

In [None]:
# Compare each period's user set with the previous one. The first period has no
# predecessor, hence the leading NaN.
user_sets = [round_stats.loc[name, 'round_user_list'] for name in roundname_list]
consecutive_periods = list(zip(user_sets[:-1], user_sets[1:]))

# Users present now but not in the previous period.
round_stats['round_user_new'] = [np.nan] + [len(current - previous) for previous, current in consecutive_periods]

# Users present in the previous period but not now.
round_stats['round_user_left'] = [np.nan] + [len(previous - current) for previous, current in consecutive_periods]

In [None]:
# Net change in participants: newcomers minus leavers.
round_stats['round_net_user_diff'] = round_stats['round_user_new'].sub(round_stats['round_user_left'])

The blue line represents new IDs in this round, the red line represents IDs that are absent in this round but were present in the last round. The green line shows the net difference, with above 0 meaning more people joined praise than people left and below 0 meaning the opposite.

In [None]:
# New, departed and net user counts per period.
user_flow_columns = ['round_user_new', 'round_user_left', 'round_net_user_diff']
px.line(round_stats, x='period_start_time', y=user_flow_columns)

## Distribution Equality

### Nakamoto Coefficient

The Nakamoto Coefficient is defined as the smallest number of accounts who control at least 50% of the resource. Although its significance relates to the prospect of a 51% attack on a network, which may not be relevant in our context, we can still use it as an intuitive measure of how many individuals received the majority of rewards.

Bigger coefficient means more distributed (i.e. needs more people to pass 50%), smaller means more concentrated power. The number should always be an integer.

In [None]:
def nakamoto_coeff(x, key):
    """Return the smallest number of rows whose combined share of ``x[key]`` exceeds 50%.

    Rows are ranked from the largest to the smallest share and the shares are
    accumulated until the running total is strictly greater than 0.5.

    Parameters
    ----------
    x : pandas.DataFrame
        Table holding the values to analyse. It is left untouched (no helper
        'PERCENTAGE' column is added to the caller's frame any more).
    key : str
        Name of the column with the values.

    Returns
    -------
    int
        The coefficient, or -1 when no prefix of the ranking exceeds 50%
        (for example an empty table or a total of zero).
    """
    values = x[key]
    shares = (values / values.sum()).sort_values(ascending=False)
    cumulative_share = shares.cumsum().to_numpy()

    # Positions (0-based) at which the running share has passed one half.
    past_half = np.flatnonzero(cumulative_share > 0.5)
    if past_half.size == 0:
        return -1
    return int(past_half[0]) + 1
def nakamoto_coeff_ratio(x, key):
    """Return the Nakamoto coefficient of ``x[key]`` divided by the number of rows in ``x``.

    Returns NaN when the coefficient is undefined (``nakamoto_coeff`` returned its
    -1 sentinel) or when ``x`` has no rows, instead of a negative ratio or a
    ZeroDivisionError.
    """
    winner = nakamoto_coeff(x, key)
    if winner < 0 or len(x) == 0:
        return np.nan
    return winner / len(x)

In [None]:
# Nakamoto coefficient and its ratio to the number of receivers, per period.
period_rewards = [allrounds_finaldist[round_name] for round_name in roundname_list]
round_stats['nakamoto'] = [nakamoto_coeff(rewards, 'PRAISE REWARDS') for rewards in period_rewards]
round_stats['nakamoto_ratio'] = [nakamoto_coeff_ratio(rewards, 'PRAISE REWARDS') for rewards in period_rewards]

nakamoto_title = 'Minimum number of people receiving 50% of total rewards'
px.line(round_stats, x='period_start_time', y='nakamoto', markers=True, title=nakamoto_title)


In [None]:
# Same coefficient, normalised by the number of receivers in the period.
nakamoto_ratio_title = 'Ratio of people accumulating 50% of total rewards in relation to total number of receivers in that round'
px.line(round_stats, x='period_start_time', y='nakamoto_ratio', markers=True, title=nakamoto_ratio_title)


# Categorizing praise based on the praise reason

In [None]:
def categorize_praise(master_df,categ_keywords,save_csv=False):
    """Tag praise by keyword category and group the scored praise per category.

    Only rows with an 'AVG SCORE' above zero and a non-null 'REASON' are used.
    A praise that matches several categories is counted once in each of them.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Must contain the columns 'REASON', 'AVG SCORE', 'TO USER ACCOUNT' and 'DATE'.
    categ_keywords : dict
        Maps a category name to a regular expression (typically an alternation of
        keywords). Matching is case-insensitive.
    save_csv : bool, default False
        When True, write 'uncateogrized.csv' and 'categorized_praise.csv' to the
        working directory and print how many praises stayed uncategorized.

    Returns
    -------
    (dict, pandas.DataFrame)
        A dict mapping every category to a frame with the columns 'praise',
        'avg_score', 'receiver' and 'date' (present even when the category is
        empty), and the cleaned praise table with an added 'category' column
        holding the list of matched categories, or NaN when nothing matched.
    """
    # Local import: the notebook only imports `search` from `re`.
    import re

    # clean the data
    allpraise_df = master_df[['REASON','AVG SCORE','TO USER ACCOUNT','DATE']]
    usable_rows = (allpraise_df['AVG SCORE'] > 0) & allpraise_df['REASON'].notnull()
    # Work on a copy so that inserting a column never touches a view of master_df.
    nonzerodf = allpraise_df.loc[usable_rows].copy()
    nonzerodf.insert(0,'CLEANED REASON',nonzerodf['REASON'].apply(praise_tools.clean_praise))

    # Compile each pattern once. IGNORECASE is required because the text is
    # lower-cased below while keywords may contain capitals (e.g. 'ETHcc').
    compiled_keywords = {praise_type: re.compile(keywords, re.IGNORECASE)
                         for praise_type, keywords in categ_keywords.items()}

    # do categorization
    allcategs = []
    for cleaned_reason in nonzerodf['CLEANED REASON']:
        praise = cleaned_reason.lower()
        category = [praise_type for praise_type, pattern in compiled_keywords.items()
                    if pattern.search(praise)]
        allcategs.append(category if category else np.nan)
    category_df = pd.concat([nonzerodf.reset_index(), pd.DataFrame({"category":allcategs})],axis=1)

    if save_csv:
        # save the categorization into csv; there's a file including only uncategorized praise
        category_df.loc[category_df['category'].isnull()].to_csv('uncateogrized.csv')
        print(f"{sum(category_df['category'].isnull())} out of {len(category_df)} praises uncategorized")
        category_df.to_csv('categorized_praise.csv')

    # When there's a praise matching more than one category, they will be counted multiple times
    # organize the data for easier analysis
    categ_praise_scores = {k:[] for k in categ_keywords.keys()}
    for _, row in category_df.iterrows():
        if isinstance(row['category'], list):
            for key in row['category']:
                categ_praise_scores[key].append({'praise':row['REASON'],'avg_score':row['AVG SCORE'],'receiver':row['TO USER ACCOUNT'],'date':row['DATE']})

    # Explicit columns keep empty categories sortable/selectable by column name.
    record_columns = ['praise', 'avg_score', 'receiver', 'date']
    categ_praise_scores_df = {key: pd.DataFrame(records, columns=record_columns)
                              for key, records in categ_praise_scores.items()}
    return categ_praise_scores_df,category_df
def get_categ_stats(df,keywords):
    """Compute the mean, max and min 'avg_score' of every category.

    Parameters
    ----------
    df : dict
        Maps a category name to a frame with an 'avg_score' column (an empty
        frame for a category without praise).
    keywords : dict
        Its keys are the category names to report.

    Returns
    -------
    pandas.DataFrame
        One row per category with the float columns 'mean', 'max' and 'min',
        sorted by 'mean'. Categories without praise keep their row with NaN
        values, so the result is well-formed even when every category is empty.
    """
    stat_names = ['mean', 'max', 'min']
    categ_stats = {}
    for categ in keywords.keys():
        if len(df[categ])==0:
            # Empty category: keep the row but leave its statistics undefined.
            categ_stats[categ] = dict.fromkeys(stat_names, np.nan)
            continue
        scores = df[categ]['avg_score']
        categ_stats[categ] = {'mean':np.mean(scores),
                              'max':np.max(scores),
                              'min':np.min(scores)}

    # The explicit index fixes the statistic names even when there is no category at all.
    categ_stats_df = pd.DataFrame(categ_stats, index=stat_names).transpose().astype(float)
    return categ_stats_df.sort_values(by='mean')

In [None]:
# Categorize the praise of all rounds and export the categorized/uncategorized CSV files.
categ_praise_df, category_df = categorize_praise(
    master_df=master_allrounds,
    categ_keywords=categ_keywords,
    save_csv=True,
)

##  the average, min, max score of each category


In [None]:
# Per-category score statistics over all rounds, shown as a table.
categ_stats = get_categ_stats(df=categ_praise_df, keywords=categ_keywords)
categ_stats

In [None]:
# plot it out
categ_stats = get_categ_stats(categ_praise_df,categ_keywords)
categ_stats['max-mean'] = categ_stats['max'] - categ_stats['mean']
categ_stats['mean-min'] = categ_stats['mean'] - categ_stats['min']

fig=px.bar(categ_stats,y='mean',error_y='max-mean',error_y_minus='mean-min',title='average score of each category')
fig.show()
md('errorbars mark the maximum average score for this category')

## Top 3 highest scored praise in each category
A convenient way to check if the categorization keywords are reasonable.

In [None]:
# Build one markdown table per category listing its three highest-scored praises.
mdtext = ''
for categ in categ_praise_df.keys():
    categ_name = '# '+categ + '\n'
    categ_praise = categ_praise_df[categ]
    if len(categ_praise):
        toppraise = categ_praise.sort_values(by='avg_score',ascending=False).iloc[:3]
    else:
        # Nothing matched this category: emit the heading and an empty table
        # instead of failing on the missing 'avg_score' column.
        toppraise = categ_praise
    top3_table = "| Avg. score | To | Reason | Date |\n |:-----------|----|--------|-----:|\n"
    for kr,row in toppraise.iterrows():
        to_user = row['receiver']
        # Pipes and line breaks inside the free-text reason would break the table row.
        reason = str(row['praise']).replace('|', '\\|').replace('\r', ' ').replace('\n', ' ')
        score = row['avg_score']
        # str() keeps the slice working for both date strings and Timestamp values.
        date = str(row['date'])[:10]

        top3_table += (f"| {score} | {to_user} | {reason} | {date} |\n")
    mdtext += categ_name + top3_table
md(mdtext)

## trend across time

In [None]:
# Per period: mean score and number of praises of every category.
mean_score_dict = {k:[] for k in categ_keywords.keys()}
praise_num_dict = {k:[] for k in categ_keywords.keys()}
for round_name in roundname_list:
    # The second return value is not needed here; a named variable avoids
    # overwriting IPython's `_` (last output).
    round_categ_praise_score_df, round_category_df = categorize_praise(allrounds_df[round_name],categ_keywords)
    round_categ_stats = get_categ_stats(round_categ_praise_score_df,categ_keywords)
    for key in mean_score_dict.keys():
        try:
            mean_score = float(round_categ_stats['mean'].loc[key])
        except (KeyError, TypeError, ValueError):
            # No usable mean for this category in this period.
            mean_score = 0.0
        if np.isnan(mean_score):
            # An undefined mean is reported as 0.0, like a missing one.
            mean_score = 0.0
        mean_score_dict[key].append(mean_score)
        praise_num_dict[key].append(len(round_categ_praise_score_df[key]))
for key in mean_score_dict.keys():
    round_stats[key+'_avg_score']=mean_score_dict[key]
    round_stats[key+'_praise_num']=praise_num_dict[key]

In [None]:
# Select the per-category count columns by their full '_praise_num' suffix; matching on
# 'num' alone would also pick up any other column (e.g. '<category>_avg_score') whose
# category name happens to contain 'num'.
praise_num_trend = round_stats.filter(like='_praise_num')
px.line(praise_num_trend, title='number of praise in each category, across time')

In [None]:
# Mean score of every category per period.
avg_score_trend = round_stats.filter(like='_avg_score')
px.line(avg_score_trend, title='mean score of each category, across time')