# Initialization

Import all relevant libraries and set display options.

In [None]:
import warnings; warnings.simplefilter('ignore')
from IPython.display import Image, HTML, display
import os
import pyLDAvis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
from datetime import datetime, date
from typing import Dict, DefaultDict, Set, List
import seaborn as sns
import plotly
import yaml


# Notebook-wide display options: show every DataFrame row, give pandas and
# numpy the same output width, and cap the number of displayed columns.
pd.set_option("display.max_rows", None)  # None switches row truncation off
desired_width =200
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 10)

In [None]:
import so_ana_doc_worker.so_ana_reporting as so_ana_reporting
from so_ana_sqlalchemy_models.db_deps import prod_db_deps_container, dict_to_es_key
import so_ana_management.management_utils as so_ana_mu
from so_ana_util import data_access
import so_ana_util

# Get db connection data

In [None]:
# Dependency container; later cells read `deps.conn`, `deps.session` and `deps.d2es` from it.
deps = prod_db_deps_container()

# Load relevant analysis objects

To analyse a job other than the latest one, browse the data frame `all_jobs` and replace `flow_run_id` with the desired value.

In [None]:
# Load the overview of all jobs and pick the flow run this notebook analyses.
all_jobs = so_ana_reporting.ReportingDataAccess.load_all_job_data(deps.conn)
# Default choice: the most recently started flow run ...
flow_run_id = all_jobs.sort_values('started_at_timest', ascending=False).iloc[0, :]['flow_run_id']
# ... which is overridden by this pinned id; drop the line to analyse the latest run instead.
flow_run_id='a9507a44-d173-4de4-99f4-5e8a9c181b96'

display(HTML(f'<h1>Last flow-run-id is: "{flow_run_id}"</h1>'))
display(HTML('<hr width="85%" align="left">'))
print()

all_steps = so_ana_reporting.ReportingDataAccess.load_all_step_data_for_flow_run_id(  connection=deps.conn,
                                                                                      flow_run_id=flow_run_id)

# Options the selected flow run was started with.
flow_opts=all_jobs[all_jobs['flow_run_id']==flow_run_id]['flow_opts'].iloc[0]

TAG_LABEL=flow_opts['topic']
NR_TOPICS=flow_opts['ml_opts']['num_topics']
# Output folder of this notebook below the flow run's output directory.
BASE_OUTPUT=os.path.join(so_ana_util.PROJ_OUTP_PATH, flow_run_id, 'jupyter_LDA')
# makedirs also creates missing parent folders and tolerates an existing target.
os.makedirs(BASE_OUTPUT, exist_ok=True)

display(HTML(f'<h1>Steps for flow-run-id="{flow_run_id}"</h1>'))
display(all_steps.sort_index())
print()
display(HTML('<h1>Step results</h1>'))
step_res_dict = {idx: item['result'] for idx, item in all_steps.iterrows()}
print(yaml.dump(step_res_dict))
display(HTML('<hr width="85%" align="left">'))
display(HTML('<h1>All jobs available</h1>'))
display(all_jobs)
print()
display(HTML('<h1>flow config</h1>'))
print(yaml.dump(flow_opts))

# Load relevant artefacts for flow run (LDA visualisation model, gensim dictionary [...]

In [None]:
# Load the step results and the stored artefacts of the selected flow run.
rep_data_access = so_ana_reporting.ReportingDataAccess(flow_run_id)
step_result_dict = rep_data_access.load_job_step_data(sqla_session=deps.session)
artefacts = rep_data_access.load_artefacts(deps.session)
# Word-cloud reporter bound to step '#3' and its step label.
wc_report_obj = so_ana_reporting.WCReports( deps_obj=deps,
                                            step='#3',
                                            step_label=step_result_dict['#3'].step_label
                                          )

# Index the artefacts by (step, artefact_key) for direct lookup.
artefact_dict = {(item['step'], item['artefact_key']): item for item in artefacts}

rep_wordcloud_artefact = artefact_dict[('#7', 'wordcloud')]
rep_LDAvis_artefact = artefact_dict[('#7', 'LDAvis')] 
lda_model_artefact = artefact_dict[('#6', 'LDA_model_data')] 
dict_artefact = artefact_dict[('#5', 'dictionary')] 

# Rebuild the LDA model and the dictionary from their artefacts.
lda_model = so_ana_reporting.LDA_from_artefact(lda_model_artefact)
dictionary = so_ana_reporting.dictionary_from_artefact(dict_artefact)

display(HTML('<h1>Artefacts</h1>'))
print(yaml.dump(artefact_dict))

# Word clouds

## Full Corpus

In [None]:
# Show the word-cloud image file referenced by the step '#7' wordcloud artefact.
Image(os.path.join(rep_wordcloud_artefact['artefact_value']['base_path'], 
                                   rep_wordcloud_artefact['artefact_value']['file_name']),
     width=1000)

## Topics

In [None]:
   
    # One word cloud per topic of the LDA model, built from the 100 top words.
    display(HTML('<h1>Wordclouds per topic</h1>'))
    display(HTML('<hr width="85%" align="left">'))
    for topic in range(NR_TOPICS):
        # Headings are 1-based, the model's topic numbers are 0-based.
        display(HTML(f'<h2>Topic {topic+1}</h2>'))
        fig= plt.figure(figsize=(20,10))
        img = so_ana_reporting.WCReports.wc_for_topics_of_topic_model( lda_model_obj=lda_model,
                                                                       topic_nr=topic,
                                                                       file_name=None,
                                                                       topn=100)
        img.show()
        display(HTML('<hr width="85%" align="left">'))
        print()

In [None]:
# Word cloud for a single document, selected by its ord_key.
ord_key  =0
    
display(HTML('<h1>Wordclouds for first document</h1>'))
print()

# Reporter bound to step '#5' and its step label.
wc_report = so_ana_reporting.WCReports(deps_obj=deps,
                                       step='#5',
                                       step_label=all_steps.loc['#5']['step_label'])

fig = plt.figure(figsize=(20,10))
img = wc_report.wc_for_doc(ord_key=ord_key, file_name=None)
img.show()
    
display(HTML('<hr width="85%" align="left">'))

# LDAvis Analysis

In [None]:
# Rank the topics by their total weight sum (largest first) and print
# "<topic key> -> <1-based rank>" for each of them.
display(HTML('<h1>Mapping Gensim-Topics -> LDAVis-Topics</h1>'))
topic_weight_sums = artefact_dict[('#7', 'doc_topic_weights')]['artefact_value']['total_weight_sum']
ranked_topics = sorted(topic_weight_sums.items(), key=lambda entry: entry[1], reverse=True)
for rank, (topic_key, weight_sum) in enumerate(ranked_topics, start=1):
    print(f' {topic_key} (weight="{weight_sum}") -> {rank}')

In [None]:
# Hand the path of the stored LDAvis HTML file (step '#7' artefact) to IPython's HTML display object.
display(HTML('<h1>LDAvis-Darstellung</h1>'))
display(HTML(os.path.join(rep_LDAvis_artefact['artefact_value']['base_path'], 
                          rep_LDAvis_artefact['artefact_value']['file_name_html']
                         )
            )
       )

In [None]:
# Print the number of entries of the dictionary (the German header reads "length of the vocabulary").
display(HTML('<h1>Länge des Vokabulars</h1>'))
print(len(dictionary))

# Load Post information

Create data frame with document meta data

In [None]:
import math

def get_ext_post_meta_data(step_1_label, step_7_label, nr_topics):
    """Load the post meta data and attach one weight column per topic.

    Reads all ``page_meta_info`` rows of step '#1' with label ``step_1_label``
    through ``deps.conn``, indexes them by ``ord_key`` and adds the columns
    ``w_topic_0`` .. ``w_topic_<nr_topics-1>`` from the ``topic_distribution``
    rows of step '#7' with label ``step_7_label`` (one query per topic).

    Returns the resulting frame with every missing value replaced by 0.
    """
    meta_query = 'select * from so_ana_doc_worker.page_meta_info where step=%(step)s and step_label=%(step_label)s'
    weight_query = 'select ord_key, topic_id, topic_weight from so_ana_analysis.topic_distribution where step=%(step)s and step_label=%(step_label)s and topic_id=%(topic_id)s'

    post_frame = pd.read_sql(sql=meta_query,
                             con=deps.conn,
                             params={'step': '#1', 'step_label': step_1_label}).set_index('ord_key')

    for topic_id in range(nr_topics):
        weight_params = {'step': '#7', 'step_label': step_7_label, 'topic_id': topic_id}
        topic_weights = pd.read_sql(sql=weight_query, con=deps.conn, params=weight_params).set_index('ord_key')
        # Assignment aligns on the ord_key index; posts without a row for this topic stay NaN here.
        post_frame[f'w_topic_{topic_id}'] = topic_weights['topic_weight']

    return post_frame.fillna(0)


def x_log_x(x, thrs=1E-10):
    """Return ``-x * log2(x)``, with the logarithm's argument floored at ``thrs``.

    For ``x`` below ``thrs`` the logarithm is taken of ``thrs`` instead of
    ``x``, which keeps the call defined for ``x == 0``.
    """
    log_argument = thrs if x < thrs else x
    return -x*math.log2(log_argument)

class DFEntropyMapper:
    """Row-wise callable that sums ``x_log_x`` over a family of columns.

    ``label_template`` is a format string with an ``{nr}`` placeholder; the
    columns read are ``label_template.format(nr=0)`` up to
    ``nr=topic_number-1``. Intended for ``DataFrame.apply(..., axis=1)``.
    """

    def __init__(self, label_template, topic_number):
        self.label_template, self.topic_number = label_template, topic_number

    def __call__(self, row):
        column_labels = [self.label_template.format(nr=topic) for topic in range(self.topic_number)]
        entropy = 0.0
        for label in column_labels:
            entropy += x_log_x(row[label])
        return entropy


# Post-level frame: meta data plus one weight column per topic of the model.
# The topic count comes from the flow configuration (NR_TOPICS) so that it
# always matches the columns the entropy mapper reads below.
base = get_ext_post_meta_data(step_1_label=all_steps.loc['#1']['step_label'],
                              step_7_label=all_steps.loc['#7']['step_label'],
                              nr_topics=NR_TOPICS)

# Per-post sum of -w*log2(w) over the topic weights.
base['topic_entropy'] = base.apply(DFEntropyMapper('w_topic_{nr}', NR_TOPICS), axis=1)

base.head()

In [None]:
# Plain, cumulative and topic-weighted histograms of the quantitative post meta data.
hist_label_list = ['votes', 'answers', 'answer_status', 'views']

display(HTML('<h1>Histograms for quantitative post meta data</h1>'))

for plt_data in hist_label_list:
    rel_data = base[plt_data]
    display(HTML(f'<h2>Histogram ({plt_data})</h2>'))
    sns.histplot(rel_data, bins=100, cumulative=False, log_scale=False)
    plt.show()
    display(HTML(f'<h2>Histogram - cumulative ({plt_data})</h2>'))
    sns.histplot(rel_data, bins=100, cumulative=True,  log_scale=False)
    plt.show()
    display(HTML(f'<h2>Histogram - topic weighted ({plt_data})</h2>'))
    # 'answer_status' is excluded from the topic-weighted variant.
    if plt_data != 'answer_status':
        for topic in range(NR_TOPICS):
            # Value of the post scaled by the post's weight for this topic.
            rel_data_2 = base[f'w_topic_{topic}']*rel_data.astype('float')
            display(HTML(f'<h3>Histogram topic [{plt_data}, topic={topic}]</h3>'))
            sns.histplot(rel_data_2, bins=100, cumulative=False, log_scale=False)
            plt.show()
            display(HTML(f'<h3>Histogram topic - cumulative [{plt_data}, topic={topic}]</h3>'))
            sns.histplot(rel_data_2, bins=100, cumulative=True, log_scale=False)
            plt.show()

In [None]:
# Pairwise scatter plots of the per-post topic weights.
sns.pairplot(base.loc[:, [f'w_topic_{topic}' for topic in range(NR_TOPICS)]])

## Create tag-level information




In [None]:
class NormTopMapper:
    """Row-wise callable computing the normed weight of one topic for a tag.

    Every ``total_weight_topic_<i>`` value of the row is divided by the
    matching value of ``comp_series``; the quotient of topic ``topic_nr`` is
    then divided by the sum of these quotients over all ``max_topics`` topics.
    """

    def __init__(self, comp_series, topic_nr, max_topics):
        self.comp_series, self.topic_nr, self.max_topics = comp_series, topic_nr, max_topics

    def __call__(self, row):
        weight_labels = [f'total_weight_topic_{idx}' for idx in range(self.max_topics)]
        # Sum of the row's weights relative to the comparison weights.
        norm = sum(row[label]/self.comp_series[label] for label in weight_labels)
        own_label = f'total_weight_topic_{self.topic_nr}'
        return row[own_label]/(self.comp_series[own_label]*norm)
    

@dataclass
class BaseFrameData:
    """Tag-level data handed between the tag analysis helpers of this notebook."""
    # Per-tag statistics for every tag except the main tag.
    tags_data: pd.DataFrame
    # Statistics of the main tag.
    # NOTE(review): get_tags_frame_base passes a filtered DataFrame here, not a Series — confirm the annotation.
    main_tag_data: pd.Series
    # Label of the main tag that was split off from tags_data.
    main_tag_label: str
    # Number of topics covered by the per-topic columns.
    topic_nr: int
    
    
def get_tags_frame_base(topic_nr, tag_label):
    """Aggregate the global post frame ``base`` to tag level.

    For every tag found in the ``tags`` column of ``base`` this counts the
    posts carrying the tag (``total_cnt``) and sums their topic weights
    (``total_weight_topic_<i>`` for ``i < topic_nr``).

    Returns a ``BaseFrameData`` whose ``tags_data`` holds all tags except
    ``tag_label`` and whose ``main_tag_data`` holds the row(s) of ``tag_label``.
    Both are independent copies, so columns can be added to them later
    without writing into a slice of a temporary frame.
    """
    label_cnt_dict=defaultdict(lambda: {'total_cnt': 0, **{f'total_weight_topic_{i}': 0 for i in range(topic_nr)}})

    for _, row in base.iterrows():
        for lbl in row['tags']:
            tag_stats = label_cnt_dict[lbl]
            tag_stats['total_cnt']+=1
            for i in range(topic_nr):
                tag_stats[f'total_weight_topic_{i}']+=row[f'w_topic_{i}']

    # The tag itself becomes the last column of each record.
    tags_frame=pd.DataFrame.from_records([{**stats, 'tag': tag} for tag, stats in label_cnt_dict.items()])
    is_main_tag = tags_frame['tag']==tag_label

    return BaseFrameData(tags_data=tags_frame[~is_main_tag].copy(),
                         main_tag_data=tags_frame[is_main_tag].copy(),
                         main_tag_label=tag_label,
                         topic_nr=topic_nr
                         )

def extend_base_frame(base_data: BaseFrameData):
    """Add the normed topic weights and their summary columns to the tag frame.

    Works on ``base_data.tags_data`` in place and adds, per topic, the column
    ``normed_weight_<i>`` (via ``NormTopMapper``), followed by
    ``weight_entropy``, ``max_normed_topic_weight`` and
    ``dominant_normed_topic`` (position of the largest normed weight).

    Returns a new ``BaseFrameData`` wrapping the same, extended frame.
    """
    frame = base_data.tags_data
    topic_count = base_data.topic_nr
    normed_labels = [f'normed_weight_{topic}' for topic in range(topic_count)]

    for topic, label in enumerate(normed_labels):
        mapper = NormTopMapper(base_data.main_tag_data, topic, topic_count)
        frame[label] = frame.apply(mapper, axis=1)

    frame['weight_entropy'] = frame.apply(DFEntropyMapper('normed_weight_{nr}', topic_count), axis=1)
    frame['max_normed_topic_weight'] = frame[normed_labels].max(axis=1)
    frame['dominant_normed_topic'] = frame[normed_labels].apply(lambda weights: weights.argmax(), axis=1)

    return BaseFrameData(tags_data=frame,
                         main_tag_data=base_data.main_tag_data,
                         main_tag_label=base_data.main_tag_label,
                         topic_nr=topic_count)


    
# Aggregate the posts to tag level and extend the result by the normed weights in one go.
tag_level_data = extend_base_frame(get_tags_frame_base(topic_nr=NR_TOPICS, tag_label=TAG_LABEL))

In [None]:
# Share of the normed topic weight in the tag score computed below; the
# remaining (1 - LAMBDA_VAL) weights the tag's relative frequency.
LAMBDA_VAL=0.3
display(HTML('<h1>Most relevant tags for topics</h1>'))

def extend_by_scores(tag_level_data, lambda_val):
    """Add one ``score_<i>`` column per topic to the tag frame and return it.

    The score mixes the tag's normed topic weight (factor ``lambda_val``) with
    the tag count relative to the main tag's count (factor ``1 - lambda_val``).
    The frame ``tag_level_data.tags_data`` is modified in place.
    """
    frame = tag_level_data.tags_data
    main_tag_count = tag_level_data.main_tag_data['total_cnt'].values[0]
    for topic in range(tag_level_data.topic_nr):
        weight_part = lambda_val*frame[f'normed_weight_{topic}']
        frequency_part = (1-lambda_val)*frame['total_cnt']/main_tag_count
        frame[f'score_{topic}'] = weight_part + frequency_part
    return frame

# Score every tag per topic and add the tag frequency relative to the main tag.
tags_frame = extend_by_scores(tag_level_data, lambda_val=LAMBDA_VAL)
doc_cnt = tag_level_data.main_tag_data['total_cnt'].values[0]
tags_frame['total_count_rel'] = tags_frame['total_cnt']/doc_cnt

# Per topic: table and scatter plot of the ten best-scoring tags.
for i in range(NR_TOPICS):
    score_col = f'score_{i}'
    prt_data = tags_frame.sort_values(score_col, ascending=False).head(10)
    display(HTML(f'<h2>Topic: {i}</h2>'))
    display(prt_data[['tag', 'weight_entropy', 'max_normed_topic_weight', 'dominant_normed_topic', 'total_cnt', score_col]])

    sns.scatterplot(data=prt_data, x=f'normed_weight_{i}', y='total_count_rel', hue='tag')
    plt.show()

In [None]:
# Plain and cumulative histograms of the tag-level columns: first the
# topic-independent ones, then the per-topic columns for every topic.
hist_labels=['total_cnt', 'weight_entropy', 'max_normed_topic_weight', 'dominant_normed_topic']
hist_labels_templates=['total_weight_topic_{nr}', 'normed_weight_{nr}']
tags_frame=tag_level_data.tags_data

display(HTML('<h1>Histograms for tag level data data</h1>'))

for plt_data in hist_labels:
    rel_data = tags_frame[plt_data]
    display(HTML(f'<h2>Histogram ({plt_data})</h2>'))
    sns.histplot(rel_data, bins=100, cumulative=False, log_scale=False)
    plt.show()
    display(HTML(f'<h2>Histogram - cumulative ({plt_data})</h2>'))
    sns.histplot(rel_data, bins=100, cumulative=True,  log_scale=False)
    plt.show()

for topic_nr in range(NR_TOPICS):
    display(HTML(f'<h2>Topic: ({topic_nr})</h2>'))
    for plt_data_template in hist_labels_templates:
        plt_data=plt_data_template.format(nr=topic_nr)
        rel_data = tags_frame[plt_data]
        display(HTML(f'<h2>Histogram (topic "{topic_nr}", {plt_data})</h2>'))
        sns.histplot(rel_data, bins=100, cumulative=False, log_scale=False)
        plt.show()
        display(HTML(f'<h2>Histogram - cumulative (topic "{topic_nr}", {plt_data})</h2>'))
        sns.histplot(rel_data, bins=100, cumulative=True,  log_scale=False)
        plt.show()

In [None]:
# Pairwise scatter plots of the normed topic weights on tag level.
tags_frame = tag_level_data.tags_data
sns.pairplot(tags_frame.loc[:, [f'normed_weight_{topic}' for topic in range(NR_TOPICS)]])

In [None]:
# Show which columns the tag-level frame carries at this point.
tags_frame=tag_level_data.tags_data
tags_frame.columns

In [None]:
@dataclass
class Link:
    """One anchor tag found by extr_links."""
    # Text of the anchor (result of get_text()).
    content: str
    # Value of the anchor's href attribute; extr_links stores None when the attribute is missing.
    address: str
        

@dataclass
class LinkAnalysisResult:
    """Result container returned by get_link_report."""
    # Link address -> statistic name -> accumulated value; unknown keys start at 0.0.
    link_dict: DefaultDict[str, DefaultDict[str, float]]  = field(default_factory=lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    # Link address -> set of post ids of the documents containing the link.
    link_to_hit_docs: DefaultDict[str, Set] = field(default_factory=lambda: defaultdict(set))
    # Link address -> link type ('img', 'internal' or 'std' in get_link_report).
    link_to_linktype: Dict[str, str] = field(default_factory=dict)
    # Tabular form of the statistics, one row per link address.
    link_data_frame: pd.DataFrame = field(default_factory=pd.DataFrame)


def extr_links(txt):
    """Return one ``Link`` per ``<a>`` tag found in the HTML fragment ``txt``.

    ``Link.content`` is the anchor text, ``Link.address`` the ``href``
    attribute (``None`` when the tag has none).
    """
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed, so results could differ between environments.
    soup = BeautifulSoup(txt, 'html.parser')
    return [Link(anchor.get_text(), anchor.get('href', None)) for anchor in soup.find_all('a')]
    
    
def make_list(item):
    """Extract the links from ``item`` as a flat list.

    ``item`` may be a list of HTML strings (links of all entries are
    concatenated), a single HTML string, or ``None`` / ``''`` (empty result).
    """
    if isinstance(item, list):
        return [link for fragment in item for link in extr_links(fragment)]
    if item is None or item == '':
        return []
    return extr_links(item)

    
def check_endswith(what: str, lst: List[str]):
    """Return True when ``what`` ends with at least one of the strings in ``lst``."""
    return any(what.endswith(suffix) for suffix in lst)
    

def get_tuple(key, value, link_to_linktype, link_to_hit_docs, nr_topics):
    """Flatten the statistics of one link into a record tuple.

    Order of the entries: link key, the four counters, the ``nr_topics``
    topic weights, the link type and the set of documents containing the link.
    """
    counter_names = ('cnt_total', 'tot_quest_votes', 'tot_views', 'tot_answ_votes')
    counters = tuple(value[name] for name in counter_names)
    topic_weights = tuple(value[f'w_topic_{topic}'] for topic in range(nr_topics))
    return (key, *counters, *topic_weights, link_to_linktype[key], link_to_hit_docs[key])

    
def get_link_report(deps, base, step_3_label, nr_topics):
    """Collect statistics for every distinct link found in the step '#3' documents.

    Parameters:
        deps: dependency container providing ``conn`` and ``d2es``.
        base: post-level frame indexed by ``ord_key`` (as produced by
            ``get_ext_post_meta_data``) with the columns ``votes``, ``views``
            and ``w_topic_<i>``.
        step_3_label: step label used to iterate the documents.
        nr_topics: number of ``w_topic_<i>`` columns to accumulate.

    Links are keyed by their lower-cased address. Anchors without an ``href``
    have no address and are skipped. Per link the report holds the number of
    occurrences, the summed question votes, views and answer votes, the summed
    topic weights, a link type ('img', 'internal' or 'std') and the ids of the
    posts containing it.

    Returns a ``LinkAnalysisResult``; its data frame is sorted by descending count.
    """
    link_dict = defaultdict(lambda: defaultdict(lambda: 0.0))
    link_to_hit_docs = defaultdict(set)
    link_to_linktype = dict()
    IMG_END_LST = ['.gif', '.bmp', '.png', '.svg', '.jpg', '.tif', '.tiff', '.ico', '.jpeg']
    topic_labels = [f'w_topic_{i}' for i in range(nr_topics)]
    doc_iterator = data_access.get_doc_iterator(connection=deps.conn,
                                                d2es_obj=deps.d2es,
                                                step_label=step_3_label,
                                                format = 'all_#3',
                                                ml_tags=None
                    )
    for item in doc_iterator:
        links = [link
                 for link in make_list(item.question_links) + make_list(item.comment_link_lst)
                 if link.address is not None]
        for answ in item.answers:
            new_entries = [link
                           for link in make_list(answ.answer_links) + make_list(answ.comment_link_lst)
                           if link.address is not None]
            # Answer votes are credited to the links of that answer only.
            for link in new_entries:
                link_dict[link.address.lower()]['tot_answ_votes'] += answ.answer_vote or 0
            links += new_entries
        if not links:
            continue
        # `base` is indexed by ord_key, so the post is looked up by label, not by position.
        post_row = base.loc[item.ord_key]
        for link in links:
            la = link.address.lower()
            link_stats = link_dict[la]
            link_stats['cnt_total'] += 1
            link_stats['tot_quest_votes'] += post_row['votes'] or 0
            link_stats['tot_views'] += post_row['views'] or 0
            for topic_label in topic_labels:
                link_stats[topic_label] += post_row[topic_label]
            link_to_hit_docs[la].add(item.post_id)
            if check_endswith(la, IMG_END_LST):
                link_to_linktype[la] = 'img'
            elif la.startswith('/') or '/stackoverflow.com' in la:
                link_to_linktype[la] = 'internal'
            else:
                link_to_linktype[la] = 'std'

    lbl_lst = ['address', 'count', 'question_votes_total', 'views_total', 'answers_votes_total'] + topic_labels + ['link type', 'ref_post_ids']

    link_data = pd.DataFrame.from_records([get_tuple(key, value, link_to_linktype, link_to_hit_docs, nr_topics=nr_topics)
                                           for key, value
                                           in link_dict.items()], columns=lbl_lst)

    # The counters are accumulated as floats (defaultdict default 0.0); report them as ints.
    for int_lbl in ['count', 'question_votes_total', 'views_total', 'answers_votes_total']:
        link_data[int_lbl]=link_data[int_lbl].map(int)

    return LinkAnalysisResult(link_dict = link_dict,
                              link_to_hit_docs = link_to_hit_docs,
                              link_to_linktype = link_to_linktype,
                              link_data_frame = link_data.sort_values('count', ascending=False)
                              )

    

# Re-load the post meta data. This rebinds `base`, so the 'topic_entropy'
# column added in an earlier cell is not part of the frame any more.
# NOTE(review): nr_topics is hard-coded to 10 here and in the get_link_report
# call below, while the tag-level cells use NR_TOPICS — confirm whether this
# should follow NR_TOPICS (later cells also index w_topic_0..w_topic_9).
base = get_ext_post_meta_data(step_1_label=all_steps.loc['#1']['step_label'],
                              step_7_label=all_steps.loc['#7']['step_label'],
                              nr_topics=10)

link_report = get_link_report(deps, base, step_3_label=all_steps.loc['#3']['step_label'], nr_topics=10)


# The German header reads "number of distinct links".
display(HTML('<h1>Anzahl der verschiedenen Links</h1>'))
print(len(link_report.link_dict)) 

In [None]:
# Flat per-link table `link_data` (one row per distinct, lower-cased link
# address) that the export and histogram cells below consume.
# Link, extr_links and make_list come from the link-report cell above and are
# reused here instead of being declared a second time.
doc_iterator = data_access.get_doc_iterator(connection=deps.conn,
                                            d2es_obj=deps.d2es,
                                            step_label=all_steps.loc['#3']['step_label'],
                                            format = 'all_#3',
                                            ml_tags=None
)

link_dict = defaultdict(lambda: defaultdict(lambda: 0.0))
link_to_hit_docs = defaultdict(set)
link_to_linktype = dict()

# Same suffixes as IMG_END_LST in get_link_report, so both link tables classify images alike.
img_suffixes = ('.gif', '.bmp', '.png', '.svg', '.jpg', '.tif', '.tiff', '.ico', '.jpeg')
# Accumulate exactly the topic-weight columns `base` carries instead of assuming ten topics.
topic_weight_cols = [col for col in base.columns if str(col).startswith('w_topic_')]

for item in doc_iterator:
    links = make_list(item.question_links) + make_list(item.comment_link_lst)
    for answ in item.answers:
        links += make_list(answ.answer_links) + make_list(answ.comment_link_lst)
    # Anchors without an href have address None and cannot be keyed.
    links = [link for link in links if link.address is not None]
    if not links:
        continue
    # `base` is indexed by ord_key, so the post is looked up by label, not by position.
    post_row = base.loc[item.ord_key]
    for link in links:
        la = link.address.lower()
        link_dict[la]['cnt_total'] += 1
        for weight_col in topic_weight_cols:
            link_dict[la][weight_col] += post_row[weight_col]
        link_to_hit_docs[la].add(item.post_id)
        if la.endswith(img_suffixes):
            link_to_linktype[la] = 'img'
        elif la.startswith('/') or '/stackoverflow.com' in la:
            link_to_linktype[la] = 'internal'
        else:
            link_to_linktype[la] = 'std'


# Attach the descriptive fields to every record before building the frame.
for key in link_dict.keys():
    link_dict[key]['link_target']=key
    link_dict[key]['link_type']=link_to_linktype[key]
    link_dict[key]['doc_hits']=link_to_hit_docs[key]


link_data = pd.DataFrame.from_records(list(link_dict.values()))

display(HTML('<h1>Link data</h1>'))
display(HTML('<h2>Number of links:</h2>'))
print(len(link_dict))
display(HTML('<h2>Top link list:</h2>'))
display(link_data.sort_values('cnt_total', ascending=False).head())
display(HTML('<h2>Without images and internal links:</h2>'))
display(link_data[link_data['link_type']=='std'].sort_values('cnt_total', ascending=False).head())

# Export to CSV and XLSX

In [None]:
# Write the post meta data (`base`) and the per-link table (`link_data`) to
# BASE_OUTPUT, each as a semicolon-separated CSV file and as an Excel workbook.
base.to_csv(os.path.join(BASE_OUTPUT, 'meta_data.csv'), sep=';')
link_data.to_csv(os.path.join(BASE_OUTPUT, 'link_infos.csv'), sep=';')
base.to_excel(os.path.join(BASE_OUTPUT, 'meta_data.xlsx'))
link_data.to_excel(os.path.join(BASE_OUTPUT, 'link_infos.xlsx'))

# Link distribution

In [None]:
# Histograms over the 'std' links only (neither images nor internal links).
std_links = link_data[link_data['link_type']=='std']
rel_data = std_links['cnt_total']
display(HTML('<h1>Histogram</h1>'))
sns.histplot(rel_data, bins=100, cumulative=False, log_scale=False)
plt.show()
display(HTML('<h1>Histogram - cumulative</h1>'))
sns.histplot(rel_data, bins=100, cumulative=True,  log_scale=True)
plt.show()
print()
print()
display(HTML('<h1>Histogram of Number weighted by topic weights</h1>'))
# Plot every topic-weight column the link table carries instead of assuming ten topics.
topic_weight_cols = [col for col in link_data.columns if str(col).startswith('w_topic_')]
for topic in range(len(topic_weight_cols)):
    rel_data_2 = std_links[f'w_topic_{topic}']
    display(HTML(f'<h2>Histogram topic [{topic}]</h2>'))
    sns.histplot(rel_data_2, bins=100, cumulative=False, log_scale=False)
    plt.show()
    display(HTML(f'<h2>Histogram topic [{topic}] - cumulative</h2>'))
    sns.histplot(rel_data_2, bins=100, cumulative=True, log_scale=False)
    plt.show()

# Temporal post distribution

In [None]:
# Keep only the posts whose 'asked_date' is a real date object.
base_filtered = base.loc[base['asked_date'].map(lambda asked: isinstance(asked, date))]

In [None]:
# Number of equally wide time intervals the observed 'asked_date' range is split into.
bin_nr = 40

# Width of a single interval.
time_delta = (base_filtered['asked_date'].max()-base_filtered['asked_date'].min())/bin_nr

# The bin_nr + 1 interval edges, starting at the earliest date.
bins = [base_filtered['asked_date'].min()+i*time_delta for i in range(bin_nr+1)]

In [None]:
def get_bin(value):
    """Return the label of the time bin ``value`` falls into.

    Bin ``i+1`` covers the half-open interval ``(bins[i], bins[i+1]]`` of the
    global ``bins`` edges. Values outside every interval (in particular the
    earliest date itself) get the bin-0 label.

    The bin number is zero-padded to the width of ``bin_nr`` so that sorting
    the labels as strings (as ``groupby`` does) keeps them in chronological order.
    """
    width = len(str(bin_nr))
    for i in range(bin_nr):
        if bins[i] < value <= bins[i+1]:
            return f'[{i+1:0{width}d}] {bins[i]}-{bins[i+1]}'
    return f'[{0:0{width}d}] <={bins[0]}'
    
# Label every post with its time bin. `assign` builds a new frame, which avoids
# writing a column into the filtered slice of `base`.
base_filtered = base_filtered.assign(date_nr_bins=base_filtered['asked_date'].map(get_bin))

# Number of posts per time bin.
gr_data = base_filtered.groupby('date_nr_bins')['step'].count()
# Sum of the topic weights per time bin, over every topic-weight column present.
topic_weight_cols = [col for col in base_filtered.columns if str(col).startswith('w_topic_')]
gr_data_weights = base_filtered.groupby('date_nr_bins')[topic_weight_cols].sum()
# Same sums, normalised so that the topics of each bin add up to 1.
topics_wighted_by_total_weight = gr_data_weights.div(gr_data_weights.sum(axis=1),axis=0)

In [None]:
# Bar chart: number of posts per time bin.
display(HTML(f'<h1>number of post for time intervals (len: {time_delta.days} days)</h1>'))
gr_data.plot.bar(figsize=(20, 8))

In [None]:
# Stacked bar chart: summed topic weights per time bin.
display(HTML('<h1>sum of number of posts weighted by topic weights</h1>'))
gr_data_weights.plot.bar(stacked=True, figsize=(20, 8))

In [None]:
# Stacked bar chart: topic weights per time bin, normalised to a total of 1 per bin.
display(HTML('<h1>sum of number of posts weighted by topic weights / normalized</h1>'))
topics_wighted_by_total_weight.plot.bar(stacked=True, figsize=(20, 8))