In [1]:
import re
from collections import Counter

import os.path
import matplotlib.pyplot as plt
import operator
import pandas as pd
import praw
import os as os
import pickle 
pd.options.display.max_columns = 999

from os import listdir
from os.path import isfile, join

import math

#Visualization imports
from bokeh.plotting import figure, show
from bokeh.core.properties import value
from bokeh.models.widgets import Panel, Tabs
from bokeh.io import show, output_notebook
output_notebook()
import numpy as np
from bokeh.models import ColumnDataSource
from bokeh.transform import dodge

#LIWC categories that we chose to analyze (the analysis functions below look these names up
#in the category counts produced by the LIWC parser)
keys = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']


In [2]:
#https://github.com/chbrown/liwc-python
def tokenize(text):
    """Lazily yield every maximal run of word characters (``\\w+``) found in ``text``.

    Punctuation and whitespace act as separators and are never yielded.
    A smarter tokenizer could be substituted here.
    """
    word_pattern = re.compile(r'\w+', re.UNICODE)
    yield from (found.group(0) for found in word_pattern.finditer(text))

# NOTE(review): this import sits below the main import cell; a fresh run needs it before any analysis call.
import liwc
# `parse(token)` is iterated by the analysis functions below to obtain the categories of a token.
# The dictionary path is relative to the current working directory.
parse, category_names = liwc.load_token_parser('LIWC2007_English080730.dic')

## Analyze all the comments together

In [2]:
#All the comment bodies of the given dataframe are joined and analyzed as if they were one single text
def sentiment_analysis_all_comments(df, date=True):
    """Return the normalized LIWC category frequencies of all comments taken together.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``body`` column holding the comment texts.
    date : bool
        Unused; kept so that existing callers keep working.

    Returns
    -------
    list of float
        One value per category, in the order past, present, future, swear,
        affect, posemo, negemo, anx, anger, sad, discrep. Each value is the
        number of tokens tagged with that category divided by the number of
        whitespace-separated words of the joined text. All zeros when the
        joined text has two characters or fewer.
    """
    keys = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']
    comments = df['body'].str.cat(sep=',')
    if len(comments) <= 2:
        # Nothing meaningful to analyze (e.g. no comments for this keyword/month).
        return [0.0 for _ in keys]

    # Count how many tokens fall in each LIWC category.
    counter = Counter(category for token in tokenize(comments) for category in parse(token))
    # Word count uses whitespace splitting, which differs from the regex tokenizer on purpose:
    # it is the denominator the rest of the pipeline has always used.
    total = len(comments.split())
    # Only the requested categories are normalized; categories never seen count as 0.0.
    return [counter[k] / total if k in counter else 0.0 for k in keys]

## Analyze by separate comments

In [3]:
#Every entry of df['body'] is analyzed on its own and the per-entry results are averaged
def sentiment_analysis_separate_comments(df, date=True):
    """Return the LIWC category frequencies averaged over the individual comments.

    Each entry of ``df['body']`` is tokenized separately; its category counts
    are divided by its own whitespace word count, and the resulting
    frequencies are averaged over the number of entries. ``date`` is unused.
    The result is a list ordered as: past, present, future, swear, affect,
    posemo, negemo, anx, anger, sad, discrep. An empty ``body`` column yields
    all zeros.
    """
    keys = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']

    bodies = df['body']
    n_comments = len(bodies)
    if not n_comments:
        return [0.0 for _ in keys]

    running = dict.fromkeys(keys, 0)
    for body in bodies:
        word_count = len(body.split())
        hits = Counter(category for token in tokenize(body) for category in parse(token))
        # Accumulate this comment's normalized frequency for every category it contains.
        for category, count in hits.items():
            running[category] = running.get(category, 0) + count / word_count
    return [running[name] / n_comments for name in keys]

## Analyze and save comments

In [4]:
# Disabled cell: the code below is wrapped in a string literal, so nothing is executed and the
# notebook only echoes the string.
# NOTE(review): as written it passes file paths (folderpath+file) to functions that index df['body'],
# and `folderpath` / `folder_to` are not defined anywhere in the visible notebook.
'''
#We do the sentiment analysis for separate comments, and all the comments together. We save the results
import pickle
keys = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']

folder = os.listdir(folderpath)
for file in folder:
    print (file)
    list_all = sentiment_analysis_all_comments(folderpath+file)
    list_separate = sentiment_analysis_separate_comments(folderpath+file)
    with open(folder_to+'sentiment_tog_'+file, 'wb') as fp:
        pickle.dump(list_all, fp)
    with open(folder_to+'sentiment_sep_'+file, 'wb') as fp:
        pickle.dump(list_separate, fp)
'''

"\n#We do the sentiment analysis for separate comments, and all the comments together. We save the results\nimport pickle\nkeys = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']\n\nfolder = os.listdir(folderpath)\nfor file in folder:\n    print (file)\n    list_all = sentiment_analysis_all_comments(folderpath+file)\n    list_separate = sentiment_analysis_separate_comments(folderpath+file)\n    with open(folder_to+'sentiment_tog_'+file, 'wb') as fp:\n        pickle.dump(list_all, fp)\n    with open(folder_to+'sentiment_sep_'+file, 'wb') as fp:\n        pickle.dump(list_separate, fp)\n"

In [5]:
# Disabled cell: the code below is wrapped in a string literal, so nothing is executed and the
# notebook only echoes the string.
# NOTE(review): `folderpath` / `folder_to` are not defined anywhere in the visible notebook.
'''
#save dictionary of number of posts collected from each article
folder = os.listdir(folderpath)
dictionary = {}
for file in folder:
    df = pd.read_csv(folderpath+file, index_col=0)
    dictionary[file] = len(df)
with open(folder_to+'num_posts', 'wb') as fp:
        pickle.dump(dictionary, fp)
'''

"\n#save dictionary of number of posts collected from each article\nfolder = os.listdir(folderpath)\ndictionary = {}\nfor file in folder:\n    df = pd.read_csv(folderpath+file, index_col=0)\n    dictionary[file] = len(df)\nwith open(folder_to+'num_posts', 'wb') as fp:\n        pickle.dump(dictionary, fp)\n"

## Full Reddit Dataset. Preprocessing Data

In [6]:
#From the reddit dataset save a .csv file with the monthly information of the sentiments, and the number of comments, number of deleted and removed comments.
def preprocessing_data_and_sentiment_analysis():
    """Build ``data/sentiment_analysis_data.csv`` from the monthly comment files.

    Inputs (read from disk):
      * every regular file in ``./data/comments`` -- one CSV per month, expected
        to have at least the columns ``body`` and ``keyword``;
      * ``data/reddit_keywords.txt`` -- one keyword per line, no header row.

    Output: a CSV with the columns ``sentiment``, ``keyword`` and one column per
    monthly file. For every keyword there is one row per label, in this order:
    ``num_comments``, ``num_deleted``, ``num_removed``, the eleven LIWC
    categories, and ``comments_analyzed`` (comments left after discarding
    ``[removed]`` / ``[deleted]`` bodies).

    The file name of each month is printed while it is being processed.
    """
    comments_dir = './data/comments'
    # Every regular file of the folder counts as one month of data, in name order.
    onlyfiles = sorted(f for f in listdir(comments_dir) if isfile(join(comments_dir, f)))

    # The keyword file has no header row. `header=None` is the supported way to say so;
    # the previous `header=-1` is rejected by current pandas releases.
    keywords = list(pd.read_table('data/reddit_keywords.txt', header=None)[0])
    sentiments = ['past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep']
    count_labels = ['num_comments', 'num_deleted', 'num_removed']

    def _blank_block(label, fill):
        # One row per keyword for the given label, every monthly column set to `fill`.
        block = pd.DataFrame(fill, columns=['sentiment', 'keyword']+onlyfiles, index=np.arange(len(keywords)))
        block['sentiment'] = label
        block['keyword'] = keywords
        return block

    # Build the whole result frame once instead of growing it with repeated concats.
    blocks = [_blank_block(label, 0) for label in count_labels]
    blocks += [_blank_block(label, 0.0) for label in sentiments+['comments_analyzed']]
    df_total = pd.concat(blocks, axis=0, ignore_index=True)

    # (label, keyword) -> row index; the first occurrence wins, as with the former `.index[0]` lookups.
    row_of = {}
    for idx, pair in enumerate(zip(df_total['sentiment'], df_total['keyword'])):
        row_of.setdefault(pair, idx)

    for file in onlyfiles:
        print()
        print('------------------------------------------------------------------------------------------')
        print(file)
        print('------------------------------------------------------------------------------------------')

        # Each monthly file is loaded once and used for both the counts and the sentiment analysis.
        df = pd.read_csv('data/comments/'+file)
        is_removed = df.body=='[removed]'
        is_deleted = df.body=='[deleted]'
        #We are not analyzing for the sentiment analysis the removed or deleted comments
        df_text = df[(df.body!='[removed]') & (df.body!='[deleted]')]

        for keyword in keywords:
            has_keyword = df['keyword']==keyword
            df_total.at[row_of[('num_comments', keyword)], file] = int(has_keyword.sum())
            df_total.at[row_of[('num_deleted', keyword)], file] = int((has_keyword & is_deleted).sum())
            df_total.at[row_of[('num_removed', keyword)], file] = int((has_keyword & is_removed).sum())

            df_k = df_text[df_text['keyword']==keyword]
            #Doing the sentiment analysis
            sent_list = sentiment_analysis_all_comments(df_k)
            #Saving results of the sentiment analysis
            for sentiment, score in zip(sentiments, sent_list):
                df_total.at[row_of[(sentiment, keyword)], file] = score
            # Written once per keyword/month (it used to be rewritten for every sentiment).
            df_total.at[row_of[('comments_analyzed', keyword)], file] = len(df_k)

    df_total.to_csv('data/sentiment_analysis_data.csv', index=False)

In [7]:
#From the list of monthly sentiments creates a global table, not monthly
def create_summarized_table_sentiments():
    """Collapse the monthly sentiment table into one row per keyword.

    Reads ``data/sentiment_analysis_data.csv`` and writes:
      * ``data/summarized_sentiments.csv`` -- index = keyword, one column per
        sentiment. Each value is the average of the monthly values weighted by
        the number of comments analyzed in that month;
      * ``data/list_num_comments_analyzed`` -- pickled dict mapping each keyword
        to its total number of analyzed comments over the same months.

    Keywords whose ``present`` value could not be computed (no analyzed
    comments) are left out of the summarized CSV.

    Changes with respect to the previous version:
      * keywords and sentiments are taken from the table itself instead of the
        undefined globals ``keywords`` / ``sentiments``;
      * the weights now come from the ``comments_analyzed`` row of the keyword;
        the previous code selected the sentiment row itself, i.e. it computed
        sum(x*x)/sum(x).
    """
    count_labels = ('num_comments', 'num_deleted', 'num_removed', 'comments_analyzed')
    # Columns 0-1 are 'sentiment' and 'keyword'; starting at 5 also skips the first three
    # monthly columns. NOTE(review): presumably on purpose, matching the three months that
    # plot_monthly_sentiments drops -- confirm.
    first_month_col = 5

    df_total = pd.read_csv('data/sentiment_analysis_data.csv')
    keywords = list(df_total['keyword'].drop_duplicates())
    sentiments = [s for s in df_total['sentiment'].drop_duplicates() if s not in count_labels]
    df = pd.DataFrame(index=keywords, columns=sentiments)

    # Monthly number of analyzed comments per keyword (first matching row wins).
    monthly_counts = {}
    for _, row in df_total[df_total.sentiment == 'comments_analyzed'].iterrows():
        monthly_counts.setdefault(row.keyword, [float(x) for x in row.iloc[first_month_col:]])
    list_comments = {keyword: sum(counts) for keyword, counts in monthly_counts.items()}

    for _, row in df_total.iterrows():
        if row.sentiment in count_labels:
            continue
        counts = monthly_counts.get(row.keyword)
        tot = sum(counts) if counts else 0
        if not tot:
            # No analyzed comments: the cell stays NaN and the keyword is dropped below.
            continue
        scores = [float(x) for x in row.iloc[first_month_col:]]
        df.loc[row.keyword, row.sentiment] = sum(s * c for s, c in zip(scores, counts)) / tot

    df = df[df['present'].notnull()]
    df.to_csv('data/summarized_sentiments.csv')
    with open('data/list_num_comments_analyzed', 'wb') as fp:
        pickle.dump(list_comments, fp)


## Plots

In [8]:
# Plot the monthly sentiments of the given articles to compare different keywords in a given sentiment
# Palette used by the plotting functions: the i-th plotted series takes color[i].
# The literal list is repeated five times (`*5`) so that long series lists do not run out of entries.
# NOTE(review): the list contains "#FFFFFF", which is presumably invisible on a white plot background.
color =["#e6194b","#3cb44b","#ffe119","#0082c8","#f58231","#911eb4","#46f0f0","#f032e6","#d2f53c","#fabebe","#008080","#e6beff","#aa6e28","#fffac8","#800000","#aaffc3","#808000","#ffd8b1","#000080","#808080","#FFFFFF","#000000","#1F77B4", "#B85A0D", '#878787', "#E377C2", '#66ccff', '#cc0066', '#cccc00', '#98DF8A', "#82853B","#92a8d1","#034f84","#f7cac9","#f7786b","#d5f4e6","#80ced6","#fefbd8","#618685","#ffef96","#50394c","#b2b2b2","#f4e1d2","#deeaee","#b1cbbb","#eea29a","#c94c4c","#3e4444","#82b74b","#405d27","#c1946a","#b9936c","#dac292",
"#e6e2d3","#c4b7a6","#6b5b95","#feb236","#d64161","#ff7b25","#eaece5","#b2c2bf","#c0ded9","#3b3a30","#c8c3cc","#563f46","#8ca3a3","#484f4f","#686256",
"#c1502e","#587e76","#a96e5b","#bccad6","#8d9db6","#667292","#f1e3dd","#cfe0e8","#b7d7e8","#87bdd8","#daebe8","#fbefcc","#f9ccac","#f4a688","#e0876a",
"#f9d5e5","#eeac99","#e06377","#c83349","#5b9aa0","#d6d4e0","#b8a9c9","#622569","#96ceb4","#ffeead","#ffcc5c","#ff6f69","#588c7e","#f2e394","#f2ae72","#d96459"]*5

def plot_monthly_sentiments(keywords,log=False, sentiment='num_comments'):
    """Build a tab with one line per keyword showing the monthly values of ``sentiment``.

    Parameters
    ----------
    keywords : iterable of str
        Keywords to plot, in legend order. Keywords absent from the table are skipped.
    log : bool
        Use a logarithmic y axis instead of a linear one.
    sentiment : str
        Row label to plot: one of the values of the ``sentiment`` column of
        ``data/sentiment_analysis_data.csv``.

    Returns
    -------
    bokeh Panel
        A panel titled ``sentiment`` wrapping the figure (it is not shown here).

    Uses the module-level palette ``color``.
    """
    df = pd.read_csv('data/sentiment_analysis_data.csv')
    # The first three months are left out of the plots; tolerate datasets that do not contain them.
    df = df.drop(columns=['RC_2017-01.csv', 'RC_2017-02.csv', 'RC_2017-03.csv'], errors='ignore')
    axis_type = 'log' if log else 'linear'

    # Pick the rows of the requested sentiment, keeping the order in which the keywords were given.
    of_sentiment = df[df.sentiment==sentiment]
    selected = [of_sentiment[of_sentiment.keyword==keyword] for keyword in keywords]
    df = pd.concat(selected) if selected else of_sentiment.iloc[0:0]
    # Monthly columns are file names; strip the 4-character extension for the axis labels.
    df = df.rename(columns={col: col[0:-4] for col in df.columns[2:]})

    months = list(df.columns[2:])
    p = figure(x_range=months, plot_width = 970, title= sentiment,
               y_axis_type=axis_type)
    for i, (_, row) in enumerate(df.iterrows()):
        # Each row is plotted from its own values; the modulo keeps the index inside the palette.
        p.line(months, list(row.values[2:]), color=color[i % len(color)], legend=value(row.keyword), line_width=1.5)
    p.xaxis.major_label_orientation = math.pi/3
    p.grid.grid_line_alpha=1
    p.x_range.range_padding = 0.01
    p.legend.location = "top_left"
    p.legend.click_policy="hide"

    tab = Panel(child=p, title=sentiment)
    return tab

def plot_monthly_multiple_sentiments(keywords, log=False):
    """Show a tabbed view with one monthly plot per row label for the given keywords.

    One tab is built with ``plot_monthly_sentiments`` for each of the three
    comment counts, the eleven LIWC categories and ``comments_analyzed``;
    ``log`` is forwarded to every plot.
    """
    sentiments = ['num_comments', 'num_deleted', 'num_removed', 'past', 'present', 'future', 'swear', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'discrep', 'comments_analyzed']
    panels = [plot_monthly_sentiments(keywords, log=log, sentiment=label) for label in sentiments]
    show(Tabs(tabs=panels))


#### Method to plot the sentiment analysis in a summarized way (not monthly) 

In [9]:
#Plot the Sentiment analysis of the given articles for all the months together in a multibar visualization

# NOTE(review): this appears to be the same palette literal that is already assigned to `color`
# before plot_monthly_sentiments; running this cell simply rebinds the name.
color =["#e6194b","#3cb44b","#ffe119","#0082c8","#f58231","#911eb4","#46f0f0","#f032e6","#d2f53c","#fabebe","#008080","#e6beff","#aa6e28","#fffac8","#800000","#aaffc3","#808000","#ffd8b1","#000080","#808080","#FFFFFF","#000000","#1F77B4", "#B85A0D", '#878787', "#E377C2", '#66ccff', '#cc0066', '#cccc00', '#98DF8A', "#82853B","#92a8d1","#034f84","#f7cac9","#f7786b","#d5f4e6","#80ced6","#fefbd8","#618685","#ffef96","#50394c","#b2b2b2","#f4e1d2","#deeaee","#b1cbbb","#eea29a","#c94c4c","#3e4444","#82b74b","#405d27","#c1946a","#b9936c","#dac292",
"#e6e2d3","#c4b7a6","#6b5b95","#feb236","#d64161","#ff7b25","#eaece5","#b2c2bf","#c0ded9","#3b3a30","#c8c3cc","#563f46","#8ca3a3","#484f4f","#686256",
"#c1502e","#587e76","#a96e5b","#bccad6","#8d9db6","#667292","#f1e3dd","#cfe0e8","#b7d7e8","#87bdd8","#daebe8","#fbefcc","#f9ccac","#f4a688","#e0876a",
"#f9d5e5","#eeac99","#e06377","#c83349","#5b9aa0","#d6d4e0","#b8a9c9","#622569","#96ceb4","#ffeead","#ffcc5c","#ff6f69","#588c7e","#f2e394","#f2ae72","#d96459"]*5

def plot_multibar_sentiment(articles):
    """Show a grouped bar chart comparing the summarized sentiments of several articles.

    Parameters
    ----------
    articles : iterable of str
        Article (keyword) names; each must be a key of the pickled dict
        ``data/list_num_comments_analyzed`` (``KeyError`` otherwise). The
        argument itself is not modified.

    Only articles with at least 50 analyzed comments are plotted. The number
    of analyzed comments of every plotted article is printed. When no article
    qualifies, a message is printed and nothing is drawn.

    Reads ``data/summarized_sentiments.csv`` and uses the module-level
    palette ``color``.
    """
    keys = ['past', 'present', 'future', 'swear', 'affect', 'positive', 'negative', 'anxiety', 'anger', 'sadness', 'discrepancy']

    df = pd.read_csv('data/summarized_sentiments.csv', index_col=0)
    df = df.rename(index=str, columns={"posemo": "positive", "negemo": "negative", "anx":"anxiety", "sad":"sadness", "discrep": "discrepancy"})
    # The pickle is produced by this notebook; do not point this at files from untrusted sources.
    with open('data/list_num_comments_analyzed', 'rb') as fp:
        dict_num_posts = pickle.load(fp)

    #We only plot the article with more than 50 comments
    # Filter into a new list so that the caller's list is left untouched.
    shown = [article for article in articles if not dict_num_posts[article] < 50]
    if not shown:
        print('No article has at least 50 analyzed comments; nothing to plot.')
        return

    data = {'LIWC' : keys}
    for article in shown:
        # Select by column name so that the values always line up with `keys`.
        data[article] = list(df.loc[article, keys])
    for article in shown:
        print('Number of posts '+ article +': '+ str(dict_num_posts[article]))
    source = ColumnDataSource(data=data)
    p = figure(x_range=keys, plot_height=500, plot_width = 970, title="Sentiment Analysis",
               toolbar_location=None, tools="")

    # The bars of one category share 0.8 of its width and are centred on the category tick
    # for any number of articles (the former offsets were only centred for three articles).
    base = 0.8/len(shown)
    bar_width = max(base-0.05, base/2)  # keeps a gap between bars without ever going negative
    for i, article in enumerate(shown):
        offset = (i - (len(shown)-1)/2) * base
        p.vbar(x=dodge('LIWC', offset, range=p.x_range), top=article, width=bar_width, source=source,
           color=color[i % len(color)], legend=value(article))

    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.legend.location = "top_right"
    p.legend.click_policy="hide"

    show(p)

## Main Visualization: Sentiment Analysis for different articles

Visualization of the values of the sentiment analysis of the given articles.

In [10]:
#Select the articles to visualize; articles with fewer than 50 analyzed comments are not plotted.
articles = ['gdpr', 'fake news', 'intellectual property']
plot_multibar_sentiment(articles)

Number of posts gdpr: 35332.0
Number of posts fake news: 510531.0
Number of posts intellectual property: 21335.0


## Ranking Sentiments

In [11]:
def ranking_pos(sentiment, top_x=10):
    """Return the ``top_x`` rows of the summarized table with the highest ``sentiment`` value.

    Articles listed in the pickled comment counts with fewer than 150 analyzed
    comments are discarded, as are rows containing any missing value. The
    result is a DataFrame sorted by ``sentiment`` in descending order.
    """
    table = pd.read_csv('data/summarized_sentiments.csv', index_col=0)
    with open('data/list_num_comments_analyzed', 'rb') as fp:
        comment_counts = pickle.load(fp)

    too_small = [name for name, count in comment_counts.items() if (count<150 and name in table.index)]
    ranked = (
        table
        .drop(too_small)
        .dropna(axis=0, how='any')
        .sort_values([sentiment], ascending=False)
    )
    return ranked[0:top_x]

In [12]:
def plot_ranking(sentiment, top_x=10, max_values=True):
    """Plot the ``top_x`` articles ranked highest for ``sentiment`` as a grouped bar chart.

    ``max_values`` is currently unused; it is kept so existing calls keep working.
    """
    # ranking_pos returns a DataFrame indexed by article; the plot function expects the
    # article names (iterating the DataFrame itself would yield its column names).
    articles = list(ranking_pos(sentiment, top_x).index)
    plot_multibar_sentiment(articles)

Plot the `top_x` articles with the highest values in the sentiment analysis for the given sentiment.

In [13]:
# Grouped bar chart for three articles given as a literal list.
plot_multibar_sentiment(['gdpr', 'fake news','intellectual property' ])

Number of posts gdpr: 35332.0
Number of posts fake news: 510531.0
Number of posts intellectual property: 21335.0


In [14]:
# Plot the articles ranked highest for the chosen sentiment (ranking_pos default: top_x=10).
sentiment = 'negemo'
plot_multibar_sentiment(list(ranking_pos(sentiment).index))

Number of posts deletefacebook: 14994.0
Number of posts brain-computer interface: 549.0
Number of posts autonomous weapon: 712.0
Number of posts competition law: 508.0
Number of posts hate speech: 92381.0
Number of posts killer robot: 17593.0
Number of posts metoo: 159680.0
Number of posts digital surveillance: 557.0
Number of posts altright: 37450.0
Number of posts freedom of speech: 53294.0


- Set the variable `sentiment` to the sentiment that you want to analyze. Choose one of those listed in the variable `keys`.
- Set `top_x` to the number of articles that you want to plot. For example, `top_x=8` plots the 8 articles with the highest values in the given sentiment.

In [17]:
# Load the per-keyword totals of analyzed comments (the file written by
# create_summarized_table_sentiments) and display the whole dict.
# Only unpickle files produced by this notebook: unpickling untrusted data can run arbitrary code.
with open('data/list_num_comments_analyzed', 'rb') as fp:
        dict_num_posts = pickle.load(fp)
dict_num_posts

{'3gpp': 46.0,
 '5g': 164891.0,
 '5g standard': 54.0,
 'ai algorithm': 929.0,
 'ai assistant': 948.0,
 'ai chip': 1345.0,
 'ai startup': 674.0,
 'ai system': 2685.0,
 'aidriven': 0.0,
 'algorithmic bias': 14.0,
 'algorithmic discrimination': 9.0,
 'algorithmic regulation': 0.0,
 'algorithms': 45393.0,
 'altright': 37450.0,
 'amazon': 1935638.0,
 'anticipatory governance': 0.0,
 'apple inc': 1861.0,
 'artificial general intelligence': 790.0,
 'artificial intelligence': 73491.0,
 'artificial neural network': 853.0,
 'augmented reality': 24077.0,
 'automated reasoning': 18.0,
 'autonomous car': 17214.0,
 'autonomous vehicle': 10669.0,
 'autonomous weapon': 712.0,
 'backward compatibility': 18104.0,
 'bci': 8353.0,
 'big data': 7842.0,
 'bitcoin': 2296257.0,
 'black box': 15483.0,
 'blockchain': 320144.0,
 'blockchain platform': 4486.0,
 'brain-computer interface': 549.0,
 'braveheart effect': 0.0,
 'cambridge analytica': 73406.0,
 'censorship': 189965.0,
 'chinese tech': 2722.0,
 'circula

In [15]:
#PLOT MONTHLY SENTIMENTS
# One tab per count / sentiment label, one line per keyword listed below.
plot_monthly_multiple_sentiments(['deletefacebook', 'brain-computer interface', 'autonomous weapon', 'competition law', 'hate speech', 'killer robot', 'metoo', 'digital surveillance', 'altright', 'freedom of speech'])

In [16]:
#PLOT MONTHLY SENTIMENTS
# One tab per count / sentiment label, one line per keyword listed below.
plot_monthly_multiple_sentiments(['twitter', 'google', 'facebook', 'amazon', 'zte', 'tencent', 'internet'])

In [17]:
# Display the monthly table written by preprocessing_data_and_sentiment_analysis.
pd.read_csv('data/sentiment_analysis_data.csv')

Unnamed: 0,sentiment,keyword,RC_2017-01.csv,RC_2017-02.csv,RC_2017-03.csv,RC_2017-04.csv,RC_2017-05.csv,RC_2017-06.csv,RC_2017-07.csv,RC_2017-08.csv,RC_2017-09.csv,RC_2017-10.csv,RC_2017-11.csv,RC_2017-12.csv,RC_2018-01.csv,RC_2018-02.csv,RC_2018-03.csv,RC_2018-04.csv,RC_2018-05.csv,RC_2018-06.csv,RC_2018-07.csv,RC_2018-08.csv
0,num_comments,human-computer interaction,2.0,2.0,0.0,5.0,0.0,6.0,0.0,4.0,5.0,1.0,0.0,40.0,7.0,5.0,9.0,12.0,3.0,15.0,1.0,0.0
1,num_comments,open platform,92.0,3.0,0.0,18.0,20.0,6.0,1.0,13.0,7.0,1.0,116.0,8.0,61.0,174.0,117.0,48.0,73.0,172.0,59.0,126.0
2,num_comments,open-source software,9.0,31.0,0.0,254.0,93.0,16.0,5.0,148.0,131.0,23.0,1.0,18.0,37.0,20.0,65.0,382.0,76.0,90.0,13.0,48.0
3,num_comments,digital commons,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,num_comments,temporary work,40.0,124.0,3.0,114.0,31.0,38.0,61.0,91.0,75.0,202.0,108.0,108.0,118.0,49.0,80.0,214.0,71.0,33.0,305.0,50.0
5,num_comments,digital identity,2.0,33.0,0.0,5.0,55.0,30.0,46.0,1.0,21.0,23.0,59.0,12.0,19.0,63.0,38.0,308.0,6.0,21.0,15.0,45.0
6,num_comments,smart city,33.0,59.0,0.0,73.0,33.0,129.0,97.0,40.0,102.0,150.0,4793.0,123.0,84.0,104.0,66.0,102.0,79.0,119.0,24.0,67.0
7,num_comments,robot tax,22.0,126.0,0.0,230.0,90.0,28.0,3.0,99.0,338.0,1018.0,0.0,64.0,14.0,7.0,12.0,11.0,1.0,0.0,2.0,0.0
8,num_comments,blockchain,3501.0,3867.0,84.0,3795.0,5089.0,8199.0,7698.0,11017.0,10176.0,15551.0,13150.0,23515.0,29859.0,27100.0,34645.0,37744.0,38446.0,40684.0,28432.0,29484.0
9,num_comments,sustainability,299.0,351.0,4.0,592.0,686.0,4673.0,611.0,400.0,285.0,617.0,609.0,574.0,620.0,602.0,527.0,1000.0,1416.0,718.0,721.0,2240.0


In [18]:
# Distinct labels found in the `sentiment` column of the monthly table.
set(pd.read_csv('data/sentiment_analysis_data.csv').sentiment)

{'affect',
 'anger',
 'anx',
 'comments_analyzed',
 'discrep',
 'future',
 'negemo',
 'num_comments',
 'num_deleted',
 'num_removed',
 'past',
 'posemo',
 'present',
 'sad',
 'swear'}

In [9]:
# First five rows of the summarized (non-monthly) table, indexed by keyword.
pd.read_csv('data/summarized_sentiments.csv', index_col=0).head()

Unnamed: 0,past,present,future,swear,affect,posemo,negemo,anx,anger,sad,discrep
human-computer interaction,0.104476,0.069073,0.013627,0.011799,0.068891,0.065284,0.015819,0.005227,0.013402,0.002917,0.064798
open platform,0.025649,0.137604,0.019203,0.008909,0.068433,0.051659,0.025525,0.005797,0.025469,0.006276,0.025265
open-source software,0.02701,0.087632,0.01323,0.002933,0.059512,0.048287,0.018546,0.003263,0.008498,0.003537,0.024404
temporary work,0.02331,0.08126,0.01068,0.005202,0.046297,0.030836,0.016434,0.0025,0.009245,0.002658,0.020701
digital identity,0.01682,0.099356,0.012082,0.003806,0.068799,0.055561,0.017415,0.005032,0.010696,0.003135,0.022286
