In [4]:
import pandas as pd
import numpy as np
import plotly.express as px

In [5]:
datadir = '../giveth-rewards/distribution_rounds/'
rounds=np.arange(1,20)
# load all the praise
allpraise_df = pd.DataFrame()
for kr in rounds:
    df =pd.read_csv(f'{datadir}/round-{kr}/distribution_results/raw_csv_exports/extended_praise_data.csv')
    allpraise_df=pd.concat([allpraise_df,df[['REASON','AVG SCORE','TO USER ACCOUNT','DATE']]],axis=0)


In [6]:
allpraise_df

Unnamed: 0,REASON,AVG SCORE,TO USER ACCOUNT,DATE
0,for making edits in the welcome text,50.00,Suga#8514,2021-07-11T22:00:00.000Z
1,for making edits in the welcome text.,4.67,Vyvy-vi#5040,2021-07-11T22:00:00.000Z
2,for offering to help us improve some designs f...,27.00,acidlazzer#5796,2021-07-11T22:00:00.000Z
3,for invite me to play some music,7.67,chuygarcia.eth#6692,2021-07-11T22:00:00.000Z
4,for sharing material about TEC simulator and c...,21.00,markop#2007,2021-07-11T22:00:00.000Z
...,...,...,...,...
136,for the initiative he is taking in so many fro...,6.97,GideonRo#3175,2022-10-07T14:10:31.607Z
137,for pulling together a list of influencial Twi...,22.25,bdegraf#7201,2022-10-07T19:03:57.358Z
138,for the session on TE Consilience Library from...,9.25,GideonRo#3175,2022-10-09T10:17:31.329Z
139,for continuing to focus design for next phase ...,15.00,stef#9877,2022-10-09T10:25:20.779Z


In [7]:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

# cleaning master function
def clean_praise(praise):
    # code adapted from: https://ourcodingclub.github.io/tutorials/topic-modelling-python/
    my_stopwords = nltk.corpus.stopwords.words('english')
    word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem # clean words to the "stem" (e.g. words->word, talked->talk)
    my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

    praise = praise.lower() # lower case
    praise = re.sub('['+my_punctuation + ']+', ' ', praise) # strip punctuation
    praise = re.sub('\s+', ' ', praise) #remove double spacing
    praise = re.sub('([0-9]+)', '', praise) # remove numbers
    praise_token_list = [word for word in praise.split(' ')
                            if word not in my_stopwords] # remove stopwords

    praise_token_list = [word_rooter(word) if '#' not in word else word
                        for word in praise_token_list] # apply word rooter

    praise = ' '.join(praise_token_list)
    return praise

[nltk_data] Downloading package stopwords to /home/mitch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# topic modeling

## clean the language data

In [8]:
cleaned_praise = allpraise_df['REASON'].apply(clean_praise)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=5, token_pattern='\w+|\$[\d\.]+|\S+') # remove words appear less than 5 times or more than 90%

# apply transformation
tf = vectorizer.fit_transform(cleaned_praise).toarray() # term frequency

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

ModuleNotFoundError: No module named 'sklearn'

In [None]:
len(tf_feature_names) # total number of features

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [None]:
model.fit(tf)


In [None]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [None]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

In [None]:
# TODO: get for each praise which  topic does it belong to
# Then, roughly check if the score relates to the praise

# Finally, compare using just some keywords for that.

# just categorize by keywords

In [10]:
nonzerodf = allpraise_df.loc[allpraise_df['AVG SCORE']>0]
nonzerodf.insert(0,'CLEANED REASON',nonzerodf['REASON'].apply(clean_praise))

print(f'among {len(allpraise_df)} praises, {len(nonzerodf)} have scores more than 0. Only include them. Next, clean them up.')

among 15322 praises, 15043 have scores more than 0. Only include them. Next, clean them up.


In [11]:
from re import search # for searching sub strings
type_keywords = {'attendance':'join|attend|show up|coming to','discussion':'question|ask|discuss|discussion|insight|participat|feedback|observations|forum|comment|share','work':'work|writing|hack|edit|studying|research|drafting|mediat|recording|taking notes|note taking|develop|analysis','lead':'host|lead|initiat|organizing|organizer|steward|forming|facilitate|managing','share':'share|spread','social media':'twitter|tweet|retweet|socials|social media','hardskills':'design|bug fixes|deploy|prototype|coding|python|javascript|solidity|data science','self-care':'vacation|time off|time-off|holiday|sabbatical|day off','IRL':'trip|DAOist|ETHcc|barcelona|denver|paris|amsterdam|colombia'}
allcategs = []
for kr,row in nonzerodf.iterrows():
    category = []
    praise = row['CLEANED REASON'].lower()
    for praise_type,keywords in type_keywords.items():
        if search(keywords,praise):
            category.append(praise_type)
    if len(category):
        allcategs.append(category)
    else:
        allcategs.append(np.nan)
category_df = pd.concat([nonzerodf.reset_index(), pd.DataFrame({"category":allcategs})],axis=1)


In [12]:
category_df.loc[category_df['category'].isnull()].to_csv('uncateogrized.csv')
print(f"{sum(category_df['category'].isnull())} out of {len(category_df)} praises uncategorized")
category_df.to_csv('categorized_praise.csv')

5683 out of 15043 praises uncategorized


In [13]:
pd.Series(type_keywords)

attendance                          join|attend|show up|coming to
discussion      question|ask|discuss|discussion|insight|partic...
work            work|writing|hack|edit|studying|research|draft...
lead            host|lead|initiat|organizing|organizer|steward...
share                                                share|spread
social media           twitter|tweet|retweet|socials|social media
hardskills      design|bug fixes|deploy|prototype|coding|pytho...
self-care       vacation|time off|time-off|holiday|sabbatical|...
IRL             trip|DAOist|ETHcc|barcelona|denver|paris|amste...
dtype: object

# analysis based on categorization
When there's a praise matching more than one category, they will be counted multiple times

In [14]:
categ_praise_scores = {k:[] for k in type_keywords.keys()}

for kr,row in category_df.iterrows():
    if type(row['category']) is list:
        for key in row['category']:
            categ_praise_scores[key] += [{'praise':row['REASON'],'avg_score':row['AVG SCORE'],'receiver':row['TO USER ACCOUNT'],'date':row['DATE']}]
categ_praise_scores_df = dict.fromkeys(type_keywords.keys())
for key, item in categ_praise_scores.items():
    categ_praise_scores_df[key]= pd.DataFrame(item)

##  the average, min, max score of each categorization

In [15]:
categ_stats = dict.fromkeys(type_keywords.keys())
for categ in categ_praise_scores_df.keys():
    categ_stats[categ] = {'mean':np.mean(categ_praise_scores_df[categ]['avg_score']),
                            'max':np.max(categ_praise_scores_df[categ]['avg_score']),
                            'min':np.min(categ_praise_scores_df[categ]['avg_score'])}
categ_stats_df = pd.DataFrame(categ_stats)
categ_stats_df.transpose().sort_values(by='mean')

Unnamed: 0,mean,max,min
attendance,3.181968,73.33,0.03
share,4.737731,73.33,0.1
social media,4.753523,47.75,0.13
discussion,7.699953,84.67,0.1
self-care,8.753333,25.33,2.5
lead,10.267938,125.67,0.13
work,13.020339,125.67,0.2
hardskills,14.17328,75.25,1.3
IRL,18.201429,84.67,1.5


## Top 3 highest scored praise in each category

In [16]:
from IPython.display import Markdown as md

mdtext = ''
for categ in categ_praise_scores_df.keys():
    categ_name = '# '+categ + '\n'
    toppraise = categ_praise_scores_df[categ].sort_values(by='avg_score',ascending=False).iloc[:3]
    top3_table= (f"\
    | Avg. score | To | Reason | Date |\n \
    |:-----------|----|:-------|\n")
    for kr,row in toppraise.iterrows():
        to_user = row['receiver']
        reason = row['praise']
        score = row['avg_score']
        date = row['date'][:10]
                    
        top3_table += (f"| {score} | {to_user} | {reason} | {date}\n")
        #print(f'Praise score average: {score}\nFROM {from_user} TO {to_user},reason:\n{reason}\n')
    mdtext += categ_name + top3_table    
md(mdtext)

# attendance
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 73.33 | karmaticacid#1218 | for attending ETHDenver and doing presentations & panels about our work within the TEC | 2022-02-19
| 55.0 | iviangita#3204 | for accepting the repsonisbility of acting Steward for the TEC and keeping tht WG afloat while we put out a call for new legal minds to join. | 2021-11-03
| 27.5 | mattyjee#8621 | for having improved the reward system so much since he joined | 2022-03-29
# discussion
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 84.67 | iviangita#3204 | for transcribing several months of praise given verbally during community calls -- a tedious and thankless task that brought a lot more praise into the system than would otherwise be ingested. I think that volume of praise input has helped us to better understand the system and will help to iterate it. | 2022-05-27
| 84.67 | natesuits#4789 | for helping me write and rewrite the forum post on the power of defaults: https://forum.tecommons.org/t/the-power-of-defaults-in-the-commons-configuration-dashboard/511/2 | 2021-07-25
| 80.33 | Nuggan#5183 | for helping me write and rewrite the forum post on the power of defaults: https://forum.tecommons.org/t/the-power-of-defaults-in-the-commons-configuration-dashboard/511/2 | 2021-07-25
# work
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 125.67 | sem(🌸,🐝)#0161 | for developing and testing the smart contracts. They are actually hosting a demo of the augmented bonding curve and all the commons upgrade tooling. Much admiration and respect for that | 2021-11-28
| 102.75 | wslyvh#1059 | for single handedly developing Tokenlog which the TEC has relied on for so many of our decisioning | 2022-05-19
| 89.0 | kristofer#1475 | for their front-end and back-end development work on the praise dashboard | 2022-04-21
# lead
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 125.67 | sem(🌸,🐝)#0161 | for developing and testing the smart contracts. They are actually hosting a demo of the augmented bonding curve and all the commons upgrade tooling. Much admiration and respect for that | 2021-11-28
| 68.0 | mZ#3472 | for thought leadership in web3 and for keeping engineering ethics as our community’s North Star | 2021-12-25
| 63.5 | elessar.eth#7945 | for developing and testing the smart contracts. They are actually hosting a demo of the augmented bonding curve and all the commons upgrade tooling. Much admiration and respect for that | 2021-11-28
# share
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 73.33 | sem(🌸,🐝)#0161 | for the incredible work with the demos, making improvements, managing the tech team and for being such a good teacher across the space and sharing his knowledge | 2021-07-15
| 68.0 | sem(🌸,🐝)#0161 | for engaging in discussion on the TE Commons Forum (https://forum.tecommons.org) the past week. Thank you for helping our Token Engineering Commons community share and learn! | 2021-07-15
| 59.33 | sem(🌸,🐝)#0161 | for the Real Time Launch action! Great sharing the war room with you!!! | 2022-01-24
# social media
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 47.75 | iviangita#3204 | for finding out our twitter account as block and fix it super quick!!⚡ (We need to be carefull with bots) | 2021-09-08
| 42.0 | iviangita#3204 | for all the incredible behind-the-scenes work and all the little things she’s constantly doing in the back office helping with Twitter, the board and work agreements | 2021-09-02
| 33.33 | innov8tor3#3988 | for mentioning or retweeting TE Commons on the socials the past week! Thank you for helping us grow the Token Engineering Commons community and spreading the message! 🙏🏼☺️ | 2021-07-15
# hardskills
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 75.25 | Mert Ozd#6679 | for product managing the proposal inverter, including faciliating the weekly meetings, working through UX and designs issues and making sure payments are organized | 2022-05-25
| 63.67 | VitorNunes#0090 | for all the work on the CCD, the designs, comments, user testing and making things understandable for people | 2021-08-19
| 57.67 | sem(🌸,🐝)#0161 | for working overtime seeming all the time to get all the commons upgrade pieces deployed | 2022-01-20
# self-care
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 25.33 | Suga#8514 | for getting out the newsletter when there's been a huge fire in the area, she had nowhere to stay, and she's on holiday with her family, and for making sure the AMA was covered. Above and beyond! | 2021-08-03
| 15.75 | markop#2007 | for being so supportive across the commons. I've witnessed in my travels many occurrences of resources and time offered to community members in a handful of channels. <a:culture:770033606080856094> | 2021-11-26
| 12.75 | griff (💜, 💜)#8888 | for showing up to so many meetings even though they’re on holiday | 2021-11-25
# IRL
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 84.67 | Tam2140#9361 | Great 18 minute talk at ETH Denver (https://www.youtube.com/watch?v=7UvPCDbBKcQ). You must've prepped for hours for the talk and the slides you prepared. | 2022-04-22
| 82.0 | Tam2140#9361 | for supporting to accomodate people going to Amsterdam for Devconnect. Your clarity really helped us feel more calm regarding our staying plans. | 2022-04-11
| 76.5 | chuygarcia.eth#6692 | for being the TEC fam who went to Amsterdam and represented our Commons during all those events | 2022-05-04


# TODO
- maybe further adjust keyword to make the top scores look normal
- how to make keywords a manipulable setting in json?
- incorporate this into cross-period analysis