# Mount the Google Drive

In [None]:
# Mount Google Drive under /content/drive; the transcript files read later in
# this notebook live below that mount point. force_remount=True is passed so
# the mount is redone on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Upgrade pip itself before any package installs run.
!python -m pip install --upgrade pip

In [None]:
# Install a pinned spaCy release (3.1.3) into the user site-packages.
!python -m pip install --user spacy==3.1.3

## 1. Import the required libraries

In [None]:
import re
import spacy

In [None]:
# Install a pinned release of the `wn` package.
# NOTE(review): no visible cell imports `wn` — confirm this install is still needed.
!pip install -U wn==0.0.22

In [None]:
# Diagnostic: print the metadata pip has for the installed nltk package.
!pip show nltk

In [None]:
# Diagnostic: print the version of the `python` executable on the shell PATH.
!python --version

In [None]:
# Download NLTK's "popular" resource collection; the stop-word list and the
# WordNet lemmatizer used in the preprocessing section need NLTK data on disk.
import nltk
nltk.download("popular")

In [None]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pandas

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Download the spaCy pipeline package `en_core_web_sm` that the next cell loads.
!python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy pipeline `en_core_web_sm`.
# NOTE(review): `nlp` is not referenced by any other visible cell — confirm it is still needed.
nlp=spacy.load('en_core_web_sm')

In [None]:
# Read the Wells Fargo transcript from the mounted Drive into D1.
# The encoding is stated explicitly so the read does not depend on the
# runtime's locale default (assumes the file is UTF-8 — TODO confirm).
with open("/content/drive/MyDrive/Topic_Modelling/Wells_Fargo_transcript.txt", "r", encoding="utf-8") as f:
  D1 = f.read()
print(D1)

In [None]:
# Read the Cisco transcript from the mounted Drive into D2.
# The encoding is stated explicitly so the read does not depend on the
# runtime's locale default (assumes the file is UTF-8 — TODO confirm).
with open("/content/drive/MyDrive/Topic_Modelling/Cisco_transcript.txt", "r", encoding="utf-8") as f:
  D2 = f.read()
print(D2)

In [None]:
# Read the Intuit transcript from the mounted Drive into D3.
# The encoding is stated explicitly so the read does not depend on the
# runtime's locale default (assumes the file is UTF-8 — TODO confirm).
with open("/content/drive/MyDrive/Topic_Modelling/Intuit_transcript.txt", "r", encoding="utf-8") as f:
  D3 = f.read()
print(D3)

In [None]:
# Read the UHG transcript from the mounted Drive into D4.
# The encoding is stated explicitly so the read does not depend on the
# runtime's locale default (assumes the file is UTF-8 — TODO confirm).
with open("/content/drive/MyDrive/Topic_Modelling/UHG_transcript.txt", "r", encoding="utf-8") as f:
  D4 = f.read()
print(D4)

In [None]:
# Read the Thermo Fisher transcript from the mounted Drive into D5.
# The encoding is stated explicitly so the read does not depend on the
# runtime's locale default (assumes the file is UTF-8 — TODO confirm).
with open("/content/drive/MyDrive/Topic_Modelling/Thermo_Fisher_transcript.txt", "r", encoding="utf-8") as f:
  D5 = f.read()
print(D5)

In [None]:
# Print all five transcripts in one go, each introduced by its "D<n>: " label
# and separated by a space followed by a newline.
labelled_documents = " \n".join(
  f"D{number}:  {text}"
  for number, text in enumerate((D1, D2, D3, D4, D5), start=1)
)
print(labelled_documents)

In [None]:
# Combine the five transcript strings into one list; the list order (D1..D5)
# is the document order used by every later step.
corpus = [D1, D2, D3, D4, D5]
print( "Corpus: ", corpus)

## 2. Text Preprocessing

In [None]:
# Shared preprocessing resources: the English stop-word set, the set of ASCII
# punctuation characters to strip, and the WordNet lemmatizer.
stop_loss_words = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean_data(doc):
  """Normalize one raw document into a space-separated string of lemmas.

  Steps, applied to each whitespace-separated token of the lower-cased text:
    1. drop the token if it is a stop word as written;
    2. strip every character found in ``exclude`` (ASCII punctuation);
    3. drop the token if it is now empty or has become a stop word;
    4. lemmatize what is left with ``lemma``.

  Args:
    doc: the raw document text (a string).

  Returns:
    A single string holding the surviving lemmas joined by one space.
  """
  cleaned_words = []
  for token in doc.lower().split():
    # Check the raw token first so stop-list entries that themselves contain
    # punctuation still match before that punctuation is stripped.
    if token in stop_loss_words:
      continue

    # Remove punctuation, symbols and special characters from the token
    word = ''.join(ch for ch in token if ch not in exclude)

    # Check again after stripping: a token such as "the," only matches the
    # stop list once the comma is gone, and a token that was pure punctuation
    # is now empty.
    if not word or word in stop_loss_words:
      continue

    # Normalize the word to its lemma
    cleaned_words.append(lemma.lemmatize(word))
  return " ".join(cleaned_words)

# Clean data is stored in a new list: one list of tokens per document
clean_corpus = [clean_data(doc).split() for doc in corpus]
print("Clean corpus: ", clean_corpus)

## 3. Convert Text into Numerical Representation

In [None]:
# Build (but do not yet fit) the tf-idf vectorizer. The documents are already
# lists of tokens, so the tokenizer is an identity function and lower-casing
# is switched off.
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False) 
print('TF-IDF Vectorizer: ',tf_idf_vectorizer)
# Build the count vectorizer with the same identity tokenizer and settings.
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
print('Count Vectorizer: ',cv_vectorizer)

In [None]:
# Re-create the tf-idf vectorizer with an identity tokenizer and no lower-casing.
# NOTE(review): this repeats the construction in the previous cell with the
# same arguments — confirm whether this cell is still needed.
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False) 
print('TF-IDF Vectorizer: ',tf_idf_vectorizer)

In [None]:
# Fit the tf-idf vectorizer on the cleaned token lists and build the
# document-term matrix (one row per document in clean_corpus).
tf_idf_array = tf_idf_vectorizer.fit_transform(clean_corpus)
print(tf_idf_array)

In [None]:
# Show the notebook's display representation of the tf-idf matrix.
tf_idf_array

In [None]:
# Fit the count vectorizer on the cleaned token lists and build the
# document-term matrix (one row per document in clean_corpus).
cv_array = cv_vectorizer.fit_transform(clean_corpus)
print(cv_array)

In [None]:
# Show the notebook's display representation of the count matrix.
cv_array

In [None]:
# Creating vocabulary array from tf-idf
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
# The result is kept as a plain list in both cases.
if hasattr(tf_idf_vectorizer, "get_feature_names_out"):
  vocab_tf_idf = list(tf_idf_vectorizer.get_feature_names_out())
else:
  vocab_tf_idf = tf_idf_vectorizer.get_feature_names()
print(vocab_tf_idf)

In [None]:
# Show the tf-idf vocabulary using the notebook's display representation.
vocab_tf_idf

In [None]:
# Creating vocabulary array from cv
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
# The result is kept as a plain list in both cases.
if hasattr(cv_vectorizer, "get_feature_names_out"):
  vocab_cv = list(cv_vectorizer.get_feature_names_out())
else:
  vocab_cv = cv_vectorizer.get_feature_names()
print(vocab_cv)

In [None]:
# Show the count-vectorizer vocabulary using the notebook's display representation.
vocab_cv

In [None]:
# Report the vocabulary size produced by each vectorizer; display() receives
# the label and the count as two separate objects.
display("Length of vocabulary array using tf_idf: ", len(vocab_tf_idf))
display("Length of vocabulary array using cv: ",len(vocab_cv))

## 4. LDA Algorithm

In [None]:
# Create the LDA model that will be fit on the count matrix.
# NOTE(review): n_components = 20 requests 20 topics while the corpus list
# built earlier holds five documents — confirm this is intended.
lda_algorithm_cv = LatentDirichletAllocation(n_components = 20, max_iter = 20, random_state = 20)
print("LDA Algorithm : ",lda_algorithm_cv)
# Fit the model on the count document-term matrix; X_topics is the transformed
# output with one row per document.
X_topics = lda_algorithm_cv.fit_transform(cv_array)
print("X Topics : ",X_topics)

# .components_ holds the fitted per-topic word weights (one row per topic)
topic_words = lda_algorithm_cv.components_
print( 'Topic Words : ', topic_words)

In [None]:
# Create a second LDA model, with the same settings, for the tf-idf matrix.
# NOTE(review): LDA is normally fit on raw term counts; presumably the tf-idf
# run is here for comparison with the count-based model — confirm.
lda_algorithm_tfidf = LatentDirichletAllocation(n_components = 20, max_iter = 20, random_state = 20)
print("LDA Algorithm : ",lda_algorithm_tfidf)
# Fit the model on the tf-idf document-term matrix; X_topics1 is the
# transformed output with one row per document.
X_topics1 = lda_algorithm_tfidf.fit_transform(tf_idf_array)
print("X Topics : ",X_topics1)

# .components_ holds the fitted per-topic word weights (one row per topic)
topic_words1 = lda_algorithm_tfidf.components_
print( 'Topic Words : ', topic_words1)

## 4.1) Retrieve the Topics

In [None]:
# Number of highest-weighted words to show for each topic
n_top_words = 10

# Vocabulary as an array so it can be indexed with an array of positions
vocab_cv_array = np.array(vocab_cv)

for i, topic_list in enumerate (topic_words):

  # argsort orders positions by ascending weight; reverse it and keep the
  # first n_top_words so exactly n_top_words words are shown, heaviest first.
  # (The earlier slice [:-n_top_words:-1] returned one word too few.)
  top_word_indices = np.argsort(topic_list)[::-1][:n_top_words]

  # Look the words up under their own name: reusing `topic_words` here would
  # overwrite the topic-word matrix and break this cell when it is re-run.
  top_words = vocab_cv_array[top_word_indices]

  print ("Topic", str(i+1), top_words)

In [None]:
# Number of highest-weighted words to show for each topic
n_top_words = 10

# Vocabulary as an array so it can be indexed with an array of positions
vocab_tf_idf_array = np.array(vocab_tf_idf)

for i, topic_list in enumerate (topic_words1):

  # argsort orders positions by ascending weight; reverse it and keep the
  # first n_top_words so exactly n_top_words words are shown, heaviest first.
  # (The earlier slice [:-n_top_words:-1] returned one word too few.)
  top_word_indices = np.argsort(topic_list)[::-1][:n_top_words]

  # Look the words up under their own name: reusing `topic_words1` here would
  # overwrite the topic-word matrix and break this cell when it is re-run.
  top_words = vocab_tf_idf_array[top_word_indices]

  print ("Topic", str(i+1), top_words)

## 4.2) Annotate Each Document with Its Dominant Topic

In [None]:
# Per-document topic weights from the count-based LDA model (one row per document)
document_topic = lda_algorithm_cv.transform(cv_array)

for doc_index in range(document_topic.shape[0]):
  # Position of the largest weight in this document's row (0-based)
  topic_document = document_topic[doc_index].argmax()

  # Print a 1-based topic number so it matches the "Topic 1", "Topic 2", ...
  # labels used when the top words of each topic are listed.
  print(" Document ", doc_index+1, " --> Topic : ", topic_document + 1)

In [None]:
# Per-document topic weights from the tf-idf-based LDA model (one row per document)
document_topic = lda_algorithm_tfidf.transform(tf_idf_array)

for doc_index in range(document_topic.shape[0]):
  # Position of the largest weight in this document's row (0-based)
  topic_document = document_topic[doc_index].argmax()

  # Print a 1-based topic number so it matches the "Topic 1", "Topic 2", ...
  # labels used when the top words of each topic are listed.
  print(" Document ", doc_index+1, " --> Topic : ", topic_document + 1)

## 5) Using pyLDAvis for Visualization

In [None]:
# Install pyLDAvis for the interactive topic visualizations below.
# NOTE(review): the version is not pinned, unlike the spaCy and wn installs — confirm.
!pip install pyLDAvis

In [None]:
from __future__ import print_function

In [None]:
import pyLDAvis
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`. The install above is unpinned, so try the old name
# first and otherwise expose the new module under the old attribute, which
# keeps the `pyLDAvis.sklearn.prepare(...)` calls below working on either
# release.
try:
  import pyLDAvis.sklearn
except ImportError:
  import pyLDAvis.lda_model
  pyLDAvis.sklearn = pyLDAvis.lda_model
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

In [None]:
# Visualize the count-based LDA model with pyLDAvis' default settings.
pyLDAvis.sklearn.prepare(lda_algorithm_cv, cv_array, cv_vectorizer)

In [None]:
# Visualize the tf-idf-based LDA model with pyLDAvis' default settings.
pyLDAvis.sklearn.prepare(lda_algorithm_tfidf, tf_idf_array, tf_idf_vectorizer)

In [None]:
# Count-based LDA model again, this time with the mds='mmds' option.
pyLDAvis.sklearn.prepare(lda_algorithm_cv, cv_array, cv_vectorizer,mds='mmds')

In [None]:
# Count-based LDA model again, this time with the mds='tsne' option.
pyLDAvis.sklearn.prepare(lda_algorithm_cv, cv_array, cv_vectorizer,mds='tsne')

In [None]:
# Tf-idf-based LDA model again, this time with the mds='mmds' option.
pyLDAvis.sklearn.prepare(lda_algorithm_tfidf, tf_idf_array, tf_idf_vectorizer,mds='mmds')

In [None]:
# Tf-idf-based LDA model again, this time with the mds='tsne' option.
pyLDAvis.sklearn.prepare(lda_algorithm_tfidf, tf_idf_array, tf_idf_vectorizer,mds='tsne')