STM_bitermplus_Tutorial
========

Model fitting
-------------

Here is a simple example of model fitting.
It assumes that you have already completed the preprocessing
stage: cleaned, lemmatized or stemmed your documents, and removed stop words.

.. code-block:: python

In [None]:
#!pip install bitermplus
import bitermplus as btm
import numpy as np
import pandas as pd

In [None]:
# Load the interview-responses CSV and list the columns it provides.
responses_csv = 'All interviewees responses and the corresponding representative words_withoutNONE_updatedV14.csv'
df = pd.read_csv(responses_csv)
print(df.columns)

In [None]:
# Restrict the frame to the columns used in the rest of the notebook.
selected_columns = [
    'ids',
    'diagnosis',
    'section',
    'pilot_number',
    'inter_time',
    'response',
    'response_lemar',
    'new_response',
    'Topic most represented',
    'Topic and its most representative words',
]
df = df[selected_columns]

In [None]:
# Show the (rows, columns) size of the column-restricted frame.
print(df.shape)

In [None]:
# Drop every row that has no 'new_response' value, then report the new size.
# `subset` is given as a list: a bare column label is only accepted by
# pandas >= 1.5, while a list works on every version.
df = df.dropna(subset=['new_response'])
print(df.shape)

In [None]:
# df.to_csv('All interviewees responses and the corresponding representative words_withoutNONE_updatedV11.csv',index=False)

In [None]:
# Number the remaining rows 0..n-1 in their current order.
# A plain array is assigned on purpose: assigning a pandas object would be
# aligned on the frame's index labels, which have gaps after dropna(), and
# would leave wrong values / NaN in this column. The column is later used as
# the merge key against the per-document topic labels.
df['index'] = np.arange(df.shape[0])

In [None]:
# Documents for the topic model: the cleaned responses with surrounding
# whitespace removed, as a plain Python list.
stripped_responses = df['new_response'].str.strip()
texts = stripped_responses.tolist()

In [None]:
import nltk

# 'punkt' is kept from the original cell. 'stopwords' is the corpus that is
# actually read below; without downloading it, the lookup raises LookupError
# on a machine where the corpus has not been installed yet.
nltk.download('punkt')
nltk.download('stopwords')

# English stop-word list (can be passed to btm.get_words_freqs as stop_words).
stopwords = nltk.corpus.stopwords.words('english')
# new_stopwords = ["um", "oh", "okay", "mhm","ah","uh","yes","get",'yep',"yeah","no","hm","wow",'.','?','-','--',':','mm','Mm','Oop']
# stopwords.extend(new_stopwords)

In [None]:
# Build the inputs the biterm topic model needs:
#   X          - documents-vs-words frequency matrix
#   vocabulary - full vocabulary
#   vocab_dict - vocabulary dictionary returned alongside it
# btm.get_words_freqs relies on sklearn's CountVectorizer internally and
# forwards its arguments, so stop words could be removed with
# stop_words=stopwords.
words_freqs = btm.get_words_freqs(texts)
X, vocabulary, vocab_dict = words_freqs

# Vectorized documents, then the biterms extracted from them.
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

In [None]:
# Optimal model selection: fit one BTM per candidate number of topics and
# record its perplexity and per-topic coherence.
#
# Perplexity is computed from the documents-vs-topics probability matrix
# (p_zd), so `transform` must be run on each freshly fitted model before
# `perplexity_` is read.
#
# NOTE(review): alpha is fixed at 50/10 for every candidate T; the common
# heuristic is 50/T - confirm that the fixed value is intended.
metrics = []
for topic_num in range(2, 15):
    print(topic_num)
    model = btm.BTM(X, vocabulary, seed=12321, T=topic_num, M=20, alpha=50/10, beta=0.01)
    model.fit(biterms, iterations=20)
    # Inference step that fills the model's documents-vs-topics matrix.
    model.transform(docs_vec)
    perplexity = model.perplexity_
    coherence = model.coherence_
    metrics.append([topic_num, perplexity, coherence])

# One row per candidate; 'coherence' holds the per-topic values and
# 'coherence mean' their average.
metrics = pd.DataFrame(metrics, columns=['Topic number', 'perplexity', 'coherence'])
metrics['coherence mean'] = [np.mean(topic_coherence) for topic_coherence in metrics['coherence']]
  

##  Choosing the Best Coherence Score
There is no single way to determine whether a coherence score is good or bad. The score and its value depend on the data it is calculated from. For instance, a score of 0.5 might be good enough in one case but unacceptable in another. The only rule is that we want to maximize this score.

Usually, the coherence score will increase with the increase in the number of topics. This increase will become smaller as the number of topics gets higher. The trade-off between the number of topics and coherence score can be achieved using the so-called elbow technique. The method implies plotting coherence score as a function of the number of topics. We use the elbow of the curve to select the number of topics.

The idea behind this method is to choose the point after which the diminishing increase in coherence score is no longer worth the additional topics. An example of an elbow cutoff at n\_topics = 3 is shown below:

In [None]:
# Show the model-selection table (already a DataFrame) with rich display.
display(metrics)

In [None]:
import tmplot as tmp

# Fit the final model with the chosen number of topics.
optimal_topic_number = 10
model = btm.BTM(X, vocabulary, T=optimal_topic_number, M=20, seed=1234)
model.fit(biterms, iterations=500)

# Phi matrix of the fitted model and the entropy computed from it.
phi = tmp.get_phi(model)
entropy = tmp.entropy(phi)

# Inference: calculate the documents-vs-topics probability matrix.
p_zd = model.transform(docs_vec)

In [None]:
# Find the best topic for each document: the position of the largest
# probability in that document's row of p_zd.
best_topics = [np.argmax(doc_topics) for doc_topics in p_zd]

# Write one "<document position>,<best topic>" line per document.
# The `with` block closes the file, so every buffered row is flushed to disk
# (the handle was previously left open).
with open('best_topic_for_each_doc.csv', 'wt') as new_file:
    for i, topic in enumerate(best_topics):
        new_file.write(str(i) + ',' + str(topic) + '\n')

print(model.matrix_words_topics_)

In [None]:
import tmplot as tmp

# Train or import a trained model here.
# The number of topics comes from `optimal_topic_number` so this refit cannot
# drift away from the model fitted earlier in the notebook.
model = btm.BTM(X, vocabulary, seed=1234, T=optimal_topic_number, M=20)  # , alpha=50/10, beta=0.01
model.fit(biterms, iterations=500)

# This is a new model object, so inference has to be run on it again:
# the documents-vs-topics matrix (and everything read from it, such as theta
# and the per-document labels) is only filled in by `transform`.
p_zd = model.transform(docs_vec)

# Get a phi matrix and its entropy
phi = tmp.get_phi(model)
entropy = tmp.entropy(phi)
print(entropy)

# Get a theta matrix
theta = tmp.get_theta(model)
print(theta)

# Calculate terms probabilities
# Do not forget to pass topic id with `topic` argument
terms_probs = tmp.calc_terms_probs_ratio(phi, topic=0, lambda_=0.9)
print(terms_probs)


In [None]:
# Build a lookup table with one row per document: its position ('index')
# and the label the model assigned to it ('topic_label').
doc_topic_labels = model.labels_
print(len(doc_topic_labels))
df_with_label = pd.DataFrame({
    'index': np.arange(len(doc_topic_labels)),
    'topic_label': np.asarray(doc_topic_labels),
})

In [None]:
# Attach the topic label to every response by joining on the 'index' column.
new_df_with_label = df.merge(df_with_label, on='index')
print(new_df_with_label.shape)

In [None]:
# Save the labelled responses as an Excel workbook (row index omitted).
new_df_with_label.to_excel(
    'All interviewees responses_withoutNONE_updatedV17_with_BTM.xlsx',
    index=False,
)

In [None]:
# Save the same labelled responses as CSV (row index omitted).
new_df_with_label.to_csv(
    'All interviewees responses_withoutNONE_updatedV17_with_BTM.csv',
    index=False,
)

In [None]:
# Preview the first five labelled responses.
display(new_df_with_label.head(5))

In [None]:
#!pip install tmplot
import tmplot as tmp

# Open tmplot's interactive report for the current model and its documents.
tmp.report(docs=texts, model=model)

In [None]:
# Compute topic coordinates for the model and draw them as a scatter plot,
# using the 'size' and 'label' columns for marker size and text.
topics_coords = tmp.prepare_coords(model)
tmp.plot_scatter_topics(
    topics_coords,
    label_col='label',
    size_col='size',
)

In [None]:
# Plotting terms probabilities

# Write the 20 top-ranked terms of every topic to 'topic_wordsV3.csv'
# (columns: topic, reprensentative_words) and echo them to the output.
# The `with` block closes the file so that all rows are on disk before the
# file is read back; the handle was previously left open.
# The number of topics is taken from phi (one column per topic) instead of a
# hard-coded 10, so the loop follows the fitted model.
with open('topic_wordsV3.csv', 'wt') as nf:
    nf.write('topic,reprensentative_words' + '\n')
    for i in range(phi.shape[1]):
        terms_probs = tmp.calc_terms_probs_ratio(phi, topic=i, lambda_=1)
        words = ' '.join(terms_probs['Terms'].head(20))
        print(i, words)
        nf.write(str(i) + ',' + str(words) + '\n')
        #terms_probs.to_csv('Topic_'+str(i)+'terms and probability.csv',index=False)
        #tmp.plot_terms(terms_probs)


In [None]:
# Read the topic/representative-words table back from disk.
topic_words_df = pd.read_csv('topic_wordsV3.csv')
print(topic_words_df.shape)
print(topic_words_df.columns)

# Rename 'topic' to 'topic_label' so it matches the key used in the labelled
# responses frame.
topic_words_df = topic_words_df.rename(columns={'topic': 'topic_label'})
print(topic_words_df.columns)

In [None]:
# Show the column names of the labelled responses frame.
new_df_with_label.columns

In [None]:
# Left-join the representative words onto each labelled response, keeping
# every response row.
df_with_label_topicwords = new_df_with_label.merge(
    topic_words_df,
    how='left',
    on='topic_label',
)
print(df_with_label_topicwords.head())
print(df_with_label_topicwords.shape)

In [None]:
# Save the responses with labels and representative words as an Excel
# workbook (row index omitted).
df_with_label_topicwords.to_excel(
    'All interviewees responses_withoutNONE_updatedV14_with_BTM_b.xlsx',
    index=False,
)

In [None]:
# Count the non-null 'ids' values per topic label.
counts_df = df_with_label_topicwords.groupby('topic_label')[['ids']].count()
display(counts_df)