Latent Dirichlet allocation (LDA) is a widely discussed topic modeling algorithm. According to David M. Blei's paper "Probabilistic Topic Models", in the domain of LDA, a topic is defined as "a distribution over a fixed vocabulary" and the key idea behind LDA is that documents exhibit multiple topics. LDA is a statistical topic model of document collections. 

My experiments for this project mostly concentrate on the LDA algorithm.

Computing library used: sklearn (interoperates with NumPy and SciPy)

In the newest version of sklearn, the class "LatentDirichletAllocation" was added; it performs Latent Dirichlet Allocation with the online variational Bayes algorithm on a given data matrix.

Below is source code for my first experiment:

In [None]:
from __future__ import print_function
from time import time
import xml.etree.ElementTree as ET
import os,sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import json

n_distributions = 100  # number of distributions (topics) to extract
n_top_words = 20  # number of top-weighted terms extracted from each distribution

# helper function: dump the vocabulary, one entry per line
def print_feature_names(feature_names):
    """Print every item of ``feature_names`` on its own line.

    Args:
        feature_names: iterable of vocabulary entries.
    """
    for name in feature_names:
        print(name)

# helper function to find term index in given vocabulary
def find_term_idx(term, feature_names):
    """Return the position of the first entry equal to ``term``.

    Args:
        term: the vocabulary entry to look for.
        feature_names: iterable of vocabulary entries.

    Returns:
        The zero-based index of the first match, or ``None`` when no entry
        equals ``term``.
    """
    matches = (position for position, name in enumerate(feature_names)
               if name == term)
    return next(matches, None)
    
# helper function to print most commonly co-occurred terms (with score) in each distribution
def print_top_words(model, feature_names, n_top_words):
    """Print the top-weighted terms of every distribution together with their scores.

    The previous version printed only the bare scores and never used
    ``feature_names``, so the output could not be mapped back to terms.

    Args:
        model: fitted model exposing ``components_``; each row must support
            ``argsort()`` and integer indexing.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: how many of the highest-scoring terms to print per
            distribution.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending; the reversed slice walks the n_top_words
        # largest weights from highest to lowest
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join("%s:%s" % (feature_names[i], topic[i])
                       for i in top_indices))
    print()
    

# helper function to find the distribution in which the target term is mostly "related"
# and get the top terms in that distribution
def find_most_relevant(model, feature_names, n_top_words, target_term_idx):
    """Return the top terms of the distribution that weights the target term highest.

    Args:
        model: fitted model exposing ``components_``; each row must support
            integer indexing and ``argsort()``.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: number of highest-weighted terms to take from the
            selected distribution.
        target_term_idx: column index of the target term.

    Returns:
        A set with the ``n_top_words`` highest-weighted terms of the selected
        distribution. The set is empty when no distribution gives the target
        term a weight greater than zero (previously the last distribution was
        silently used in that case).

    Raises:
        ValueError: if ``target_term_idx`` is ``None``, i.e. the term was not
            found in the vocabulary.
    """
    if target_term_idx is None:
        raise ValueError("target term is not part of the vocabulary")

    best_score = 0.0
    best_topic = None
    # strict '>' keeps the first distribution when several share the top weight
    for topic in model.components_:
        score = topic[target_term_idx]
        if score > best_score:
            best_score = score
            best_topic = topic

    if best_topic is None:
        return set()

    # argsort() is ascending; the reversed slice selects the largest weights
    return {feature_names[i]
            for i in best_topic.argsort()[:-n_top_words - 1:-1]}

# helper function to convert a set object to list that can be directly json encoded
def set_convert_to_json_list(terms_set):
    """Wrap every term in a ``{"name": term}`` dict.

    Args:
        terms_set: iterable of terms (a set in this script).

    Returns:
        A list with one dict per term, in the iteration order of ``terms_set``.
    """
    return [{"name": term} for term in terms_set]

# helper function to convert a dict object to list that can be directly json encoded
def dict_convert_to_json_list(terms_dict):
    """Turn a ``{term: set_of_terms}`` mapping into a list of name/children nodes.

    Args:
        terms_dict: mapping from a parent term to an iterable of child terms.

    Returns:
        A list with one ``{"name": key, "children": [...]}`` dict per key,
        where the children are produced by ``set_convert_to_json_list``.
    """
    return [
        {"name": parent, "children": set_convert_to_json_list(terms_dict[parent])}
        for parent in terms_dict
    ]
    

# exit if no user term is provided (exactly one command-line argument is expected)
if len(sys.argv) != 2:
    print("Invalid input...exit")
    # sys.exit() instead of exit(): the bare exit() helper is only injected by
    # the site module and is meant for interactive sessions
    sys.exit()

user_term = sys.argv[1]

# load the PMC dataset
print("Loading dataset...")
t0 = time()

data_samples = list()
for folder, dirs, files in os.walk('test/'):
    for f in files:
        if f.endswith('.nxml'):
            tree = ET.parse(os.path.join(folder, f))
            root = tree.getroot()
            # each document is represented by the text content of its xml file
            data_samples.append(' '.join(root.itertext()))

print("done in %0.3fs." % (time() - t0))
print("%d lines read" % len(data_samples))

# use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
# max_df=0.95: when building the vocabulary ignore terms that have a document frequency strictly > 95% of documents
# min_df=2: when building the vocabulary ignore terms that have a document frequency strictly < 2
# stop_words='english': a built-in stop word list for English is used
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tf features")
# learning_method='online': online variational Bayes update of the components;
# in general, if the data size is large, the online update will be much faster than the batch update
# learning_offset=50.: a (positive) parameter that downweights early iterations in online learning
# The topic count comes from n_distributions: this cell never defines
# n_topics, so the previous n_topics=n_topics raised NameError.
# NOTE(review): later scikit-learn releases rename the n_topics keyword to
# n_components - confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_distributions, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))


# NOTE(review): later scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()
#for debugging purposes
#print("\nFeature Names in LDA model:")
#print_feature_names(tf_feature_names)
#print("\nTopics in LDA model:")
#print_top_words(lda, tf_feature_names, n_top_words)


# to extract terms(topics) that most commonly co-occurred with the user provided term (categorized in 2 levels)

# set containing terms already shown in previous levels
shown_terms = set()
shown_terms.add(user_term)
user_term_idx = find_term_idx(user_term, tf_feature_names)
if user_term_idx is None:
    # find_term_idx returns None for a term outside the vocabulary; indexing
    # the topic rows with None would fail later with an unrelated error
    sys.exit("Term '%s' is not in the vocabulary...exit" % user_term)
# l1_relevant_terms meaning the level 1 topics that are mostly commonly co-occurred
l1_relevant_terms = find_most_relevant(lda, tf_feature_names, n_top_words, user_term_idx)
l1_relevant_terms = l1_relevant_terms.difference(shown_terms)
# update shown_terms set for l2_relevant_terms
shown_terms = shown_terms.union(l1_relevant_terms)
l2_relevant_terms = dict()
# level 2 topics are grouped by level 1 topics as keys
for term in l1_relevant_terms:
    term_idx = find_term_idx(term, tf_feature_names)
    l2_relevant_terms[term] = find_most_relevant(lda, tf_feature_names, n_top_words, term_idx)
    l2_relevant_terms[term] = l2_relevant_terms[term].difference(shown_terms)

# convert obj to json file
model = dict()
model["name"] = user_term
model["children"] = dict_convert_to_json_list(l2_relevant_terms)


# the with-block closes the file even if serialisation or writing fails
with open('flare.json', 'w') as output:
    output.write(json.dumps(model, indent=1))  # python will convert \n to os.linesep


As can be seen from the code, in the first experiment:
1. documents are represented by bag of words model
2. text corpus is preprocessed using token counts metric after removing English stop words, ignoring terms occurring in >95% of documents and only focusing on terms occurring in at least 2 documents
3. LDA model is built upon feature matrix with top n distribution (in LDA language, a distribution is a topic) extracted
4. for each target term, terms most commonly co-occurred with it are identified as those that are most frequent in the distribution in which the target term is most frequent (as compared to in other distributions in the same corpus)

![title](img/v1_10.png)

model visualization, n_distributions = 10, user provided term as "chemical"

![title](img/v1_100.png)

model visualization, n_distributions = 100

In the feature extraction (preprocessing) stage, experiment 1 uses tf (token counts). To balance the weights of terms with different frequencies, experiment 2 instead extracts the tfidf features from the document collection. Tfidf incorporates an inverse document frequency factor which diminishes the weight of terms that occur very frequently in the document set and increases the weight of terms that occur rarely. Everything else in experiment 1 remains unchanged.

In [None]:
from __future__ import print_function
from time import time
import xml.etree.ElementTree as ET
import os,sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import json

n_topics = 10  # number of distributions (topics) to extract
n_top_words = 20  # number of top-weighted terms extracted from each distribution

# helper function: dump the vocabulary, one entry per line
def print_feature_names(feature_names):
    """Print every item of ``feature_names`` on its own line.

    Args:
        feature_names: iterable of vocabulary entries.
    """
    for name in feature_names:
        print(name)

# helper function to find term index in given vocabulary
def find_term_idx(term, feature_names):
    """Return the position of the first entry equal to ``term``.

    Args:
        term: the vocabulary entry to look for.
        feature_names: iterable of vocabulary entries.

    Returns:
        The zero-based index of the first match, or ``None`` when no entry
        equals ``term``.
    """
    matches = (position for position, name in enumerate(feature_names)
               if name == term)
    return next(matches, None)
    
# helper function to print most commonly co-occurred terms (with score) in each distribution
def print_top_words(model, feature_names, n_top_words):
    """Print the top-weighted terms of every distribution together with their scores.

    The previous version printed only the bare scores and never used
    ``feature_names``, so the output could not be mapped back to terms.

    Args:
        model: fitted model exposing ``components_``; each row must support
            ``argsort()`` and integer indexing.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: how many of the highest-scoring terms to print per
            distribution.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending; the reversed slice walks the n_top_words
        # largest weights from highest to lowest
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join("%s:%s" % (feature_names[i], topic[i])
                       for i in top_indices))
    print()
    

# helper function to find the distribution in which the target term is mostly "related"
# and get the top terms in that distribution
def find_most_relevant(model, feature_names, n_top_words, target_term_idx):
    """Return the top terms of the distribution that weights the target term highest.

    Args:
        model: fitted model exposing ``components_``; each row must support
            integer indexing and ``argsort()``.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: number of highest-weighted terms to take from the
            selected distribution.
        target_term_idx: column index of the target term.

    Returns:
        A set with the ``n_top_words`` highest-weighted terms of the selected
        distribution. The set is empty when no distribution gives the target
        term a weight greater than zero (previously the last distribution was
        silently used in that case).

    Raises:
        ValueError: if ``target_term_idx`` is ``None``, i.e. the term was not
            found in the vocabulary.
    """
    if target_term_idx is None:
        raise ValueError("target term is not part of the vocabulary")

    best_score = 0.0
    best_topic = None
    # strict '>' keeps the first distribution when several share the top weight
    for topic in model.components_:
        score = topic[target_term_idx]
        if score > best_score:
            best_score = score
            best_topic = topic

    if best_topic is None:
        return set()

    # argsort() is ascending; the reversed slice selects the largest weights
    return {feature_names[i]
            for i in best_topic.argsort()[:-n_top_words - 1:-1]}

# helper function to convert a set object to list that can be directly json encoded
def set_convert_to_json_list(terms_set):
    """Wrap every term in a ``{"name": term}`` dict.

    Args:
        terms_set: iterable of terms (a set in this script).

    Returns:
        A list with one dict per term, in the iteration order of ``terms_set``.
    """
    return [{"name": term} for term in terms_set]

# helper function to convert a dict object to list that can be directly json encoded
def dict_convert_to_json_list(terms_dict):
    """Turn a ``{term: set_of_terms}`` mapping into a list of name/children nodes.

    Args:
        terms_dict: mapping from a parent term to an iterable of child terms.

    Returns:
        A list with one ``{"name": key, "children": [...]}`` dict per key,
        where the children are produced by ``set_convert_to_json_list``.
    """
    return [
        {"name": parent, "children": set_convert_to_json_list(terms_dict[parent])}
        for parent in terms_dict
    ]
    

# exit if no user term is provided (exactly one command-line argument is expected)
if len(sys.argv) != 2:
    print("Invalid input...exit")
    # sys.exit() instead of exit(): the bare exit() helper is only injected by
    # the site module and is meant for interactive sessions
    sys.exit()

user_term = sys.argv[1]

# load the PMC dataset
print("Loading dataset...")
t0 = time()

data_samples = list()
for folder, dirs, files in os.walk('../data/'):
    for f in files:
        if f.endswith('.nxml'):
            tree = ET.parse(os.path.join(folder, f))
            root = tree.getroot()
            # each document is represented by the text content of its xml file
            data_samples.append(' '.join(root.itertext()))

print("done in %0.3fs." % (time() - t0))
print("%d lines read" % len(data_samples))

# use tfidf (term frequency weighted by inverse document frequency) features for LDA.
print("Extracting tfidf features for LDA...")
# max_df=0.95: when building the vocabulary ignore terms that have a document frequency strictly > 95% of documents
# min_df=2: when building the vocabulary ignore terms that have a document frequency strictly < 2
# stop_words='english': a built-in stop word list for English is used
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tfidf features")
# learning_method='online': online variational Bayes update of the components;
# in general, if the data size is large, the online update will be much faster than the batch update
# learning_offset=50.: a (positive) parameter that downweights early iterations in online learning
# NOTE(review): later scikit-learn releases rename the n_topics keyword to
# n_components - confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tfidf)
print("done in %0.3fs." % (time() - t0))


# NOTE(review): later scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tf_feature_names = tfidf_vectorizer.get_feature_names()
#for debugging purposes
#print("\nFeature Names in LDA model:")
#print_feature_names(tf_feature_names)
print("\nTopics in LDA model:")
print_top_words(lda, tf_feature_names, n_top_words)


# to extract terms(topics) that most commonly co-occurred with the user provided term (categorized in 2 levels)

# set containing terms already shown in previous levels
shown_terms = set()
shown_terms.add(user_term)
user_term_idx = find_term_idx(user_term, tf_feature_names)
if user_term_idx is None:
    # find_term_idx returns None for a term outside the vocabulary; indexing
    # the topic rows with None would fail later with an unrelated error
    sys.exit("Term '%s' is not in the vocabulary...exit" % user_term)
# l1_relevant_terms meaning the level 1 topics that are mostly commonly co-occurred
l1_relevant_terms = find_most_relevant(lda, tf_feature_names, n_top_words, user_term_idx)
l1_relevant_terms = l1_relevant_terms.difference(shown_terms)
# update shown_terms set for l2_relevant_terms
shown_terms = shown_terms.union(l1_relevant_terms)
l2_relevant_terms = dict()
# level 2 topics are grouped by level 1 topics as keys
for term in l1_relevant_terms:
    term_idx = find_term_idx(term, tf_feature_names)
    l2_relevant_terms[term] = find_most_relevant(lda, tf_feature_names, n_top_words, term_idx)
    l2_relevant_terms[term] = l2_relevant_terms[term].difference(shown_terms)

# convert obj to json file
model = dict()
model["name"] = user_term
model["children"] = dict_convert_to_json_list(l2_relevant_terms)


# the with-block closes the file even if serialisation or writing fails
with open('../output/flare_v2_10.json', 'w') as output:
    output.write(json.dumps(model, indent=1))  # python will convert \n to os.linesep


![title](img/v2_10.png)

model visualization, n_distributions = 10

The above visualization indicates that all level 2 distributions overlap with the level 1 distribution (the distribution extracted for the user provided term). Based on this observation, I update the code to ignore the level 1 distribution when extracting level 2 distributions in experiment 3.

In [None]:
from __future__ import print_function
from time import time
import xml.etree.ElementTree as ET
import os,sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import json

n_topics = 10  # number of distributions (topics) to extract
n_top_words = 20  # number of top-weighted terms extracted from each distribution

# helper function: dump the vocabulary, one entry per line
def print_feature_names(feature_names):
    """Print every item of ``feature_names`` on its own line.

    Args:
        feature_names: iterable of vocabulary entries.
    """
    for name in feature_names:
        print(name)

# helper function to find term index in given vocabulary
def find_term_idx(term, feature_names):
    """Return the position of the first entry equal to ``term``.

    Args:
        term: the vocabulary entry to look for.
        feature_names: iterable of vocabulary entries.

    Returns:
        The zero-based index of the first match, or ``None`` when no entry
        equals ``term``.
    """
    matches = (position for position, name in enumerate(feature_names)
               if name == term)
    return next(matches, None)
    
# helper function to print most commonly co-occurred terms (with score) in each distribution
def print_top_words(model, feature_names, n_top_words):
    """Print the top-weighted terms of every distribution together with their scores.

    The previous version printed only the bare scores and never used
    ``feature_names``, so the output could not be mapped back to terms.

    Args:
        model: fitted model exposing ``components_``; each row must support
            ``argsort()`` and integer indexing.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: how many of the highest-scoring terms to print per
            distribution.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending; the reversed slice walks the n_top_words
        # largest weights from highest to lowest
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join("%s:%s" % (feature_names[i], topic[i])
                       for i in top_indices))
    print()
    

# helper function to find the distribution in which the target term is mostly "related"
def find_most_relevant(model, target_term_idx, ignore_idx):
    """Return the index of the distribution that weights the target term highest.

    Args:
        model: fitted model exposing ``components_``; each row must support
            integer indexing.
        target_term_idx: column index of the target term.
        ignore_idx: index of a distribution that must never be selected; a
            value that matches no index (such as -1) excludes nothing.

    Returns:
        The index of the winning distribution (the first one on ties), or -1
        when no eligible distribution gives the term a weight above zero.
    """
    best_idx, best_score = -1, 0.0
    for idx, weights in enumerate(model.components_):
        weight = weights[target_term_idx]
        if not weight > best_score or idx == ignore_idx:
            continue
        best_idx, best_score = idx, weight
    return best_idx


def get_top_terms(model, feature_names, n_top_words, topic_idx):
    """Collect the highest-weighted terms of one distribution.

    Args:
        model: fitted model exposing ``components_``; the selected row must
            support ``argsort()``.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: number of highest-weighted terms to collect.
        topic_idx: index of the distribution inside ``components_``.

    Returns:
        A set with the ``n_top_words`` highest-weighted terms.
    """
    weights = model.components_[topic_idx]
    # argsort() is ascending; the reversed slice selects the largest weights
    ranked = weights.argsort()[:-n_top_words - 1:-1]
    return {feature_names[column] for column in ranked}
    
# helper function to convert a set object to list that can be directly json encoded
def set_convert_to_json_list(terms_set):
    """Wrap every term in a ``{"name": term}`` dict.

    Args:
        terms_set: iterable of terms (a set in this script).

    Returns:
        A list with one dict per term, in the iteration order of ``terms_set``.
    """
    return [{"name": term} for term in terms_set]

# helper function to convert a dict object to list that can be directly json encoded
def dict_convert_to_json_list(terms_dict):
    """Turn a ``{term: set_of_terms}`` mapping into a list of name/children nodes.

    Args:
        terms_dict: mapping from a parent term to an iterable of child terms.

    Returns:
        A list with one ``{"name": key, "children": [...]}`` dict per key,
        where the children are produced by ``set_convert_to_json_list``.
    """
    return [
        {"name": parent, "children": set_convert_to_json_list(terms_dict[parent])}
        for parent in terms_dict
    ]
    

# exit if no user term is provided (exactly one command-line argument is expected)
if len(sys.argv) != 2:
    print("Invalid input...exit")
    # sys.exit() instead of exit(): the bare exit() helper is only injected by
    # the site module and is meant for interactive sessions
    sys.exit()

user_term = sys.argv[1]

# load the PMC dataset
print("Loading dataset...")
t0 = time()

data_samples = list()
for folder, dirs, files in os.walk('../data/'):
    for f in files:
        if f.endswith('.nxml'):
            tree = ET.parse(os.path.join(folder, f))
            root = tree.getroot()
            # each document is represented by the text content of its xml file
            data_samples.append(' '.join(root.itertext()))

print("done in %0.3fs." % (time() - t0))
print("%d lines read" % len(data_samples))

# use tfidf (term frequency weighted by inverse document frequency) features for LDA.
print("Extracting tfidf features for LDA...")
# max_df=0.95: when building the vocabulary ignore terms that have a document frequency strictly > 95% of documents
# min_df=2: when building the vocabulary ignore terms that have a document frequency strictly < 2
# stop_words='english': a built-in stop word list for English is used
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tfidf features")
# learning_method='online': online variational Bayes update of the components;
# in general, if the data size is large, the online update will be much faster than the batch update
# learning_offset=50.: a (positive) parameter that downweights early iterations in online learning
# NOTE(review): later scikit-learn releases rename the n_topics keyword to
# n_components - confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tfidf)
print("done in %0.3fs." % (time() - t0))


# NOTE(review): later scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tf_feature_names = tfidf_vectorizer.get_feature_names()
#for debugging purposes
#print("\nFeature Names in LDA model:")
#print_feature_names(tf_feature_names)
print("\nTopics in LDA model:")
print_top_words(lda, tf_feature_names, n_top_words)


# to extract terms(topics) that most commonly co-occurred with the user provided term (categorized in 2 levels)

# set containing terms already shown in previous levels
shown_terms = set()
shown_terms.add(user_term)
user_term_idx = find_term_idx(user_term, tf_feature_names)
if user_term_idx is None:
    # find_term_idx returns None for a term outside the vocabulary; indexing
    # the topic rows with None would fail later with an unrelated error
    sys.exit("Term '%s' is not in the vocabulary...exit" % user_term)
# level 1: the distribution in which the user term has the highest weight
# (-1 matches no distribution index, so nothing is ignored here)
l1_relevant_distribution_idx = find_most_relevant(lda, user_term_idx, -1)
if l1_relevant_distribution_idx < 0:
    # -1 means no distribution was selected; used as an index it would
    # silently pick the last distribution
    sys.exit("No distribution found for term '%s'...exit" % user_term)
l1_relevant_terms = get_top_terms(lda, tf_feature_names, n_top_words, l1_relevant_distribution_idx)
l1_relevant_terms = l1_relevant_terms.difference(shown_terms)
# update shown_terms set for l2_relevant_terms
shown_terms = shown_terms.union(l1_relevant_terms)
l2_relevant_terms = dict()
# level 2 topics are grouped by level 1 topics as keys
for term in l1_relevant_terms:
    term_idx = find_term_idx(term, tf_feature_names)
    # the level 1 distribution is ignored so level 2 comes from a different one
    tmp_distribution_idx = find_most_relevant(lda, term_idx, l1_relevant_distribution_idx)
    if tmp_distribution_idx < 0:
        # no other distribution carries this term: keep it as a leaf instead
        # of letting -1 select the last distribution
        l2_relevant_terms[term] = set()
        continue
    l2_relevant_terms[term] = get_top_terms(lda, tf_feature_names, n_top_words, tmp_distribution_idx)
    l2_relevant_terms[term] = l2_relevant_terms[term].difference(shown_terms)

# convert obj to json file
model = dict()
model["name"] = user_term
model["children"] = dict_convert_to_json_list(l2_relevant_terms)


# the with-block closes the file even if serialisation or writing fails
with open('../output/flare_v3_10.json', 'w') as output:
    output.write(json.dumps(model, indent=1))  # python will convert \n to os.linesep


![title](img/v3_10.png)

model visualization, n_distributions = 10

Using tfidf as compared to tf, it is observed that the model is generally improved. Terms with only numbers (such as 11, 12) are gone in the new model. Terms with only 1-3 characters that are more likely to be abbreviations (such as et, cu) are less frequent in the new model. In each level, more "meaningful" terms are extracted while the overlapping between each pair of them is less. For example, "cell" and "cells" are no longer in level 1, at least simultaneously.

While doing research online, I notice that tfidf preprocessing is also widely used with NMF. 
Non-negative matrix factorization (NMF) is another popular algorithm used in topic modeling field. 
I perform the 4th experiment using NMF as the model.

In [None]:
from __future__ import print_function
from time import time
import xml.etree.ElementTree as ET
import os,sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import json

n_topics = 10  # number of distributions (topics) to extract
n_top_words = 20  # number of top-weighted terms extracted from each distribution

# helper function: dump the vocabulary, one entry per line
def print_feature_names(feature_names):
    """Print every item of ``feature_names`` on its own line.

    Args:
        feature_names: iterable of vocabulary entries.
    """
    for name in feature_names:
        print(name)

# helper function to find term index in given vocabulary
def find_term_idx(term, feature_names):
    """Return the position of the first entry equal to ``term``.

    Args:
        term: the vocabulary entry to look for.
        feature_names: iterable of vocabulary entries.

    Returns:
        The zero-based index of the first match, or ``None`` when no entry
        equals ``term``.
    """
    matches = (position for position, name in enumerate(feature_names)
               if name == term)
    return next(matches, None)
    
# helper function to print most commonly co-occurred terms (with score) in each distribution
def print_top_words(model, feature_names, n_top_words):
    """Print the top-weighted terms of every distribution together with their scores.

    The previous version printed only the bare scores and never used
    ``feature_names``, so the output could not be mapped back to terms.

    Args:
        model: fitted model exposing ``components_``; each row must support
            ``argsort()`` and integer indexing.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: how many of the highest-scoring terms to print per
            distribution.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending; the reversed slice walks the n_top_words
        # largest weights from highest to lowest
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join("%s:%s" % (feature_names[i], topic[i])
                       for i in top_indices))
    print()
    

# helper function to find the distribution in which the target term is mostly "related"
def find_most_relevant(model, target_term_idx, ignore_idx):
    """Return the index of the distribution that weights the target term highest.

    Args:
        model: fitted model exposing ``components_``; each row must support
            integer indexing.
        target_term_idx: column index of the target term.
        ignore_idx: index of a distribution that must never be selected; a
            value that matches no index (such as -1) excludes nothing.

    Returns:
        The index of the winning distribution (the first one on ties), or -1
        when no eligible distribution gives the term a weight above zero.
    """
    best_idx, best_score = -1, 0.0
    for idx, weights in enumerate(model.components_):
        weight = weights[target_term_idx]
        if not weight > best_score or idx == ignore_idx:
            continue
        best_idx, best_score = idx, weight
    return best_idx


def get_top_terms(model, feature_names, n_top_words, topic_idx):
    """Collect the highest-weighted terms of one distribution.

    Args:
        model: fitted model exposing ``components_``; the selected row must
            support ``argsort()``.
        feature_names: vocabulary, indexable by the column indices of
            ``components_``.
        n_top_words: number of highest-weighted terms to collect.
        topic_idx: index of the distribution inside ``components_``.

    Returns:
        A set with the ``n_top_words`` highest-weighted terms.
    """
    weights = model.components_[topic_idx]
    # argsort() is ascending; the reversed slice selects the largest weights
    ranked = weights.argsort()[:-n_top_words - 1:-1]
    return {feature_names[column] for column in ranked}
    
# helper function to convert a set object to list that can be directly json encoded
def set_convert_to_json_list(terms_set):
    """Wrap every term in a ``{"name": term}`` dict.

    Args:
        terms_set: iterable of terms (a set in this script).

    Returns:
        A list with one dict per term, in the iteration order of ``terms_set``.
    """
    return [{"name": term} for term in terms_set]

# helper function to convert a dict object to list that can be directly json encoded
def dict_convert_to_json_list(terms_dict):
    """Turn a ``{term: set_of_terms}`` mapping into a list of name/children nodes.

    Args:
        terms_dict: mapping from a parent term to an iterable of child terms.

    Returns:
        A list with one ``{"name": key, "children": [...]}`` dict per key,
        where the children are produced by ``set_convert_to_json_list``.
    """
    return [
        {"name": parent, "children": set_convert_to_json_list(terms_dict[parent])}
        for parent in terms_dict
    ]
    

# exit if no user term is provided (exactly one command-line argument is expected)
if len(sys.argv) != 2:
    print("Invalid input...exit")
    # sys.exit() instead of exit(): the bare exit() helper is only injected by
    # the site module and is meant for interactive sessions
    sys.exit()

user_term = sys.argv[1]

# load the PMC dataset
print("Loading dataset...")
t0 = time()

data_samples = list()
for folder, dirs, files in os.walk('../data/'):
    for f in files:
        if f.endswith('.nxml'):
            tree = ET.parse(os.path.join(folder, f))
            root = tree.getroot()
            # each document is represented by the text content of its xml file
            data_samples.append(' '.join(root.itertext()))

print("done in %0.3fs." % (time() - t0))
print("%d lines read" % len(data_samples))

# use tfidf (term frequency weighted by inverse document frequency) features for NMF.
print("Extracting tfidf features for NMF...")
# max_df=0.95: when building the vocabulary ignore terms that have a document frequency strictly > 95% of documents
# min_df=2: when building the vocabulary ignore terms that have a document frequency strictly < 2
# stop_words='english': a built-in stop word list for English is used
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


# fit the NMF model
print("Fitting the NMF model with tf-idf features")
t0 = time()
# alpha=.1: regularization
# l1_ratio=.5: regularization
# NOTE(review): later scikit-learn releases replace `alpha` with
# `alpha_W`/`alpha_H` - confirm against the installed version.
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))



# NOTE(review): later scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tf_feature_names = tfidf_vectorizer.get_feature_names()
#for debugging purposes
#print("\nFeature Names in NMF model:")
#print_feature_names(tf_feature_names)
print("\nTopics in NMF model:")
print_top_words(nmf, tf_feature_names, n_top_words)


# to extract terms(topics) that most commonly co-occurred with the user provided term (categorized in 2 levels)

# set containing terms already shown in previous levels
shown_terms = set()
shown_terms.add(user_term)
user_term_idx = find_term_idx(user_term, tf_feature_names)
if user_term_idx is None:
    # find_term_idx returns None for a term outside the vocabulary; indexing
    # the topic rows with None would fail later with an unrelated error
    sys.exit("Term '%s' is not in the vocabulary...exit" % user_term)
# level 1: the distribution in which the user term has the highest weight
# (-1 matches no distribution index, so nothing is ignored here)
l1_relevant_distribution_idx = find_most_relevant(nmf, user_term_idx, -1)
if l1_relevant_distribution_idx < 0:
    # -1 means no distribution gives the term a positive weight; used as an
    # index it would silently pick the last distribution
    sys.exit("No distribution found for term '%s'...exit" % user_term)
l1_relevant_terms = get_top_terms(nmf, tf_feature_names, n_top_words, l1_relevant_distribution_idx)
l1_relevant_terms = l1_relevant_terms.difference(shown_terms)
# update shown_terms set for l2_relevant_terms
shown_terms = shown_terms.union(l1_relevant_terms)
l2_relevant_terms = dict()
# level 2 topics are grouped by level 1 topics as keys
for term in l1_relevant_terms:
    term_idx = find_term_idx(term, tf_feature_names)
    # the level 1 distribution is ignored so level 2 comes from a different one
    tmp_distribution_idx = find_most_relevant(nmf, term_idx, l1_relevant_distribution_idx)
    if tmp_distribution_idx < 0:
        # no other distribution carries this term: keep it as a leaf instead
        # of letting -1 select the last distribution
        l2_relevant_terms[term] = set()
        continue
    l2_relevant_terms[term] = get_top_terms(nmf, tf_feature_names, n_top_words, tmp_distribution_idx)
    l2_relevant_terms[term] = l2_relevant_terms[term].difference(shown_terms)

# convert obj to json file
model = dict()
model["name"] = user_term
model["children"] = dict_convert_to_json_list(l2_relevant_terms)


# the with-block closes the file even if serialisation or writing fails
with open('../output/flare_v4_10.json', 'w') as output:
    output.write(json.dumps(model, indent=1))  # python will convert \n to os.linesep


![title](img/v4_10.png)

model visualization, n_distributions = 10