In [11]:
!pip install requests
!pip install beautifulsoup4



In [12]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="content_wrapper arti-flow").find_all("p")]
    print(url)
    return text

# URLs of transcripts in scope
urls = ['https://www.moneycontrol.com/news/business/hdfc-life-insurance-q1-net-profit58_14063101.html',
        'https://www.moneycontrol.com/news/business/capital-raising-committeehdfc-life-to-meetjuly-23-for-proposed-rs-600-crore-fundraising-plan_14062041.html',
        'https://www.moneycontrol.com/news/business/coronavirus-lockdown-life-insurers39-new-premium-down-254may-2020_13881721.html',
        'https://www.moneycontrol.com/news/stocks-views/hot-stocks-%7C-here39shdfc-life-itc-arebuythe-short-term_13656041.html',
        'https://www.moneycontrol.com/news/business/banks-take-charge-as-chief-distribution-officersprivate-life-insurers_13538161.html',
        'https://www.moneycontrol.com/news/business/budget-2020-%7C-removalexemptionsnew-tax-regime-to-impact-life-insurers-mfs_13446461.html',
        'https://www.moneycontrol.com/news/science/plutos-planetary-status-remains-in-question-94-years-after-discovery-12325681.html',
        'https://www.moneycontrol.com/news/technology/how-a-pokemon-fan-created-a-different-experience-on-nintendo-3ds-for-solar-eclipse-2024-12600771.html',
        'https://www.moneycontrol.com/news/technology/solar-eclipse-2024-popular-apps-to-view-and-capture-the-best-photos-12600731.html',
        'https://www.moneycontrol.com/news/technology/spotify-announced-ai-generated-playlists-how-it-works-availability-and-other-details-12600471.html',
        'https://www.moneycontrol.com/news/technology/microsoft-expands-its-presence-in-the-uk-announces-new-ai-hub-in-london-12600311.html',
        'https://www.moneycontrol.com/news/technology/flipkart-expands-travel-portfolio-launches-bus-bookings-on-its-app-12598541.html']

# Comedian names
TOPIC = ['HDFC', 'CAPITAL', 'LOCKDOWN', 'STOCKS', 'BANKS', 'BUDGET', 'PLUTO', 'POKEMON', 'SOLAR', 'SPOTIFY', 'MICROSOFT', 'FLIPKART']

In [13]:
# # Actually request transcripts (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]

https://www.moneycontrol.com/news/business/hdfc-life-insurance-q1-net-profit58_14063101.html
https://www.moneycontrol.com/news/business/capital-raising-committeehdfc-life-to-meetjuly-23-for-proposed-rs-600-crore-fundraising-plan_14062041.html
https://www.moneycontrol.com/news/business/coronavirus-lockdown-life-insurers39-new-premium-down-254may-2020_13881721.html
https://www.moneycontrol.com/news/stocks-views/hot-stocks-%7C-here39shdfc-life-itc-arebuythe-short-term_13656041.html
https://www.moneycontrol.com/news/business/banks-take-charge-as-chief-distribution-officersprivate-life-insurers_13538161.html
https://www.moneycontrol.com/news/business/budget-2020-%7C-removalexemptionsnew-tax-regime-to-impact-life-insurers-mfs_13446461.html
https://www.moneycontrol.com/news/science/plutos-planetary-status-remains-in-question-94-years-after-discovery-12325681.html
https://www.moneycontrol.com/news/technology/how-a-pokemon-fan-created-a-different-experience-on-nintendo-3ds-for-solar-eclipse-202

In [14]:
# Make a new directory to hold the text files
!mkdir transcripts

# Pickle files for later use
for i, c in enumerate(TOPIC):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

# Load pickled files
data = {}
for i, c in enumerate(TOPIC):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)


mkdir: cannot create directory ‘transcripts’: File exists


In [15]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['HDFC', 'CAPITAL', 'LOCKDOWN', 'STOCKS', 'BANKS', 'BUDGET', 'PLUTO', 'POKEMON', 'SOLAR', 'SPOTIFY', 'MICROSOFT', 'FLIPKART'])

In [16]:
# More checks
data['SPOTIFY'][:2]

['Popular music streaming platform Spotify has announced a new feature called AI playlist. As the name suggests, users can now create AI-generated playlists. The features is being made available in the UK and Australia for now. Only Spotify Premium subscribers will have access to the feature.',
 'How will the feature work?']

In [17]:
# Let's take a look at our data again
next(iter(data.keys()))

'HDFC'

In [18]:
next(iter(data.values()))

['Private life insurer HDFC Life Insurance posted a 5.8 percent year-on-year (YoY) increase in its June quarter (Q1) consolidated net profit at Rs 450.54 crore.',
 'This was on the back of an improvement in its investment income even as there was a decline in the net premium income.',
 'Vibha Padalkar, MD & CEO, HDFC Life said, "Business has started to pick up on a month-on-month basis and we are seeing higher traction, especially in\xa0the individual protection business. As the situation begins to normalise, we expect life insurance to emerge\xa0as an important avenue for both protection as well as long term savings, and consequently help attract a\xa0higher quantum of inflows from Indian households."',
 'When it comes to the Coronavirus-led claims, Padalkar said that there are 41 claims filed so far out of which 39 are valid. She added that the claim value is less than Rs 2 crore.',
 'The company had made a provision of Rs 41 crore for COVID-19 at the end of FY20 and has now carried 

In [19]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [20]:
# Combining it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [21]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
BANKS,"In December 2020, a mid-sized private life insurance CEO found out that their bank partner, a South India-based private lender, was in talks with ..."
BUDGET,Removal of tax exemptions under Section 80C in the new tax regime could be dampener for life insurance products as well as equity-linked savings ...
CAPITAL,The capital raising committee of HDFC Life Insurance Company will meet on Thursday to consider raising funds up to Rs 600 crore through issuance o...
FLIPKART,"Flipkart, an e-commerce firm, announced the launch of bus services on its app on April 8. The company has partnered with multiple state transport ..."
HDFC,Private life insurer HDFC Life Insurance posted a 5.8 percent year-on-year (YoY) increase in its June quarter (Q1) consolidated net profit at Rs 4...
LOCKDOWN,The nationwide lockdown due to the coronavirus outbreak that began on March 25 continued to have an impact on the new premium collections of life ...
MICROSOFT,"Microsoft has announced that it is opening an all-new AI hub in London. Mustafa Suleyman, CEO, Microsoft AI, made the announcement in an official ..."
PLUTO,"Some 94 years ago on February 18, 1930, American astronomer Clyde Tombaugh discovered Pluto and named it as the ninth planet in our solar system. ..."
POKEMON,"In a creative effort to inform others about today’s total solar eclipse, a die-hard fan of Pokémon on Nintendo 3DS has used his passion for the ga..."
SOLAR,"A total solar eclipse is poised to adorn the skies over North America today, . Total eclipses, known for their dramatic effect of darkening the sk..."


In [22]:
# Let's take a look at the transcript for dave
data_df.transcript.loc['SPOTIFY']

'Popular music streaming platform Spotify has announced a new feature called AI playlist. As the name suggests, users can now create AI-generated playlists. The features is being made available in the UK and Australia for now. Only Spotify Premium subscribers will have access to the feature. How will the feature work? Spotify says that users need to put a text prompt and a playlist will be generated. For example, if a user types“an indie folk playlist to give my brain a big warm hug,” “relaxing music to tide me over during allergy season,” or “a playlist that makes me feel like the main character”?, AI Playlist will create a list of curated songs. “Whether you’re a beginner or an expert playlist creator, AI Playlist pairs our powerful personalisation technology with AI to deliver that perfect musical mix just for you,” said Spotify. How to use the AI Playlist feature? Users in the UK and Australia on iOS and Android can now curate AI playlists. To find AI Playlist, head to the Spotify 

In [23]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [24]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
BANKS,in december a midsized private life insurance ceo found out that their bank partner a south indiabased private lender was in talks with a rival f...
BUDGET,removal of tax exemptions under section in the new tax regime could be dampener for life insurance products as well as equitylinked savings sche...
CAPITAL,the capital raising committee of hdfc life insurance company will meet on thursday to consider raising funds up to rs crore through issuance of d...
FLIPKART,flipkart an ecommerce firm announced the launch of bus services on its app on april the company has partnered with multiple state transport corpo...
HDFC,private life insurer hdfc life insurance posted a percent yearonyear yoy increase in its june quarter consolidated net profit at rs crore this ...
LOCKDOWN,the nationwide lockdown due to the coronavirus outbreak that began on march continued to have an impact on the new premium collections of life in...
MICROSOFT,microsoft has announced that it is opening an allnew ai hub in london mustafa suleyman ceo microsoft ai made the announcement in an official blog ...
PLUTO,some years ago on february american astronomer clyde tombaugh discovered pluto and named it as the ninth planet in our solar system soon after ...
POKEMON,in a creative effort to inform others about today’s total solar eclipse a diehard fan of pokémon on nintendo has used his passion for the gaming ...
SOLAR,a total solar eclipse is poised to adorn the skies over north america today total eclipses known for their dramatic effect of darkening the sky a...


In [25]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Getting rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [26]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
BANKS,in december a midsized private life insurance ceo found out that their bank partner a south indiabased private lender was in talks with a rival f...
BUDGET,removal of tax exemptions under section in the new tax regime could be dampener for life insurance products as well as equitylinked savings sche...
CAPITAL,the capital raising committee of hdfc life insurance company will meet on thursday to consider raising funds up to rs crore through issuance of d...
FLIPKART,flipkart an ecommerce firm announced the launch of bus services on its app on april the company has partnered with multiple state transport corpo...
HDFC,private life insurer hdfc life insurance posted a percent yearonyear yoy increase in its june quarter consolidated net profit at rs crore this ...
LOCKDOWN,the nationwide lockdown due to the coronavirus outbreak that began on march continued to have an impact on the new premium collections of life in...
MICROSOFT,microsoft has announced that it is opening an allnew ai hub in london mustafa suleyman ceo microsoft ai made the announcement in an official blog ...
PLUTO,some years ago on february american astronomer clyde tombaugh discovered pluto and named it as the ninth planet in our solar system soon after ...
POKEMON,in a creative effort to inform others about todays total solar eclipse a diehard fan of pokémon on nintendo has used his passion for the gaming c...
SOLAR,a total solar eclipse is poised to adorn the skies over north america today total eclipses known for their dramatic effect of darkening the sky a...


In [27]:
# Let's take a look at our dataframe
data_df

Unnamed: 0,transcript
BANKS,"In December 2020, a mid-sized private life insurance CEO found out that their bank partner, a South India-based private lender, was in talks with ..."
BUDGET,Removal of tax exemptions under Section 80C in the new tax regime could be dampener for life insurance products as well as equity-linked savings ...
CAPITAL,The capital raising committee of HDFC Life Insurance Company will meet on Thursday to consider raising funds up to Rs 600 crore through issuance o...
FLIPKART,"Flipkart, an e-commerce firm, announced the launch of bus services on its app on April 8. The company has partnered with multiple state transport ..."
HDFC,Private life insurer HDFC Life Insurance posted a 5.8 percent year-on-year (YoY) increase in its June quarter (Q1) consolidated net profit at Rs 4...
LOCKDOWN,The nationwide lockdown due to the coronavirus outbreak that began on March 25 continued to have an impact on the new premium collections of life ...
MICROSOFT,"Microsoft has announced that it is opening an all-new AI hub in London. Mustafa Suleyman, CEO, Microsoft AI, made the announcement in an official ..."
PLUTO,"Some 94 years ago on February 18, 1930, American astronomer Clyde Tombaugh discovered Pluto and named it as the ninth planet in our solar system. ..."
POKEMON,"In a creative effort to inform others about today’s total solar eclipse, a die-hard fan of Pokémon on Nintendo 3DS has used his passion for the ga..."
SOLAR,"A total solar eclipse is poised to adorn the skies over North America today, . Total eclipses, known for their dramatic effect of darkening the sk..."


In [28]:
# Let's add the comedians' full names as well
full_names = ['HDFC BANK', 'CAPITAL RAISING', 'COVID LOCKDOWN', 'HOT STOCKS', 'BANKS CHARGE', 'BUDGET 2020 ', 'PLUTO PLANET', 'POKEMON', 'SOLAR ECLIPSE', 'SPOTIFY', 'MICROSOFT', 'FLIPKART']
full_names.sort()
data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
BANKS,"In December 2020, a mid-sized private life insurance CEO found out that their bank partner, a South India-based private lender, was in talks with ...",BANKS CHARGE
BUDGET,Removal of tax exemptions under Section 80C in the new tax regime could be dampener for life insurance products as well as equity-linked savings ...,BUDGET 2020
CAPITAL,The capital raising committee of HDFC Life Insurance Company will meet on Thursday to consider raising funds up to Rs 600 crore through issuance o...,CAPITAL RAISING
FLIPKART,"Flipkart, an e-commerce firm, announced the launch of bus services on its app on April 8. The company has partnered with multiple state transport ...",COVID LOCKDOWN
HDFC,Private life insurer HDFC Life Insurance posted a 5.8 percent year-on-year (YoY) increase in its June quarter (Q1) consolidated net profit at Rs 4...,FLIPKART
LOCKDOWN,The nationwide lockdown due to the coronavirus outbreak that began on March 25 continued to have an impact on the new premium collections of life ...,HDFC BANK
MICROSOFT,"Microsoft has announced that it is opening an all-new AI hub in London. Mustafa Suleyman, CEO, Microsoft AI, made the announcement in an official ...",HOT STOCKS
PLUTO,"Some 94 years ago on February 18, 1930, American astronomer Clyde Tombaugh discovered Pluto and named it as the ninth planet in our solar system. ...",MICROSOFT
POKEMON,"In a creative effort to inform others about today’s total solar eclipse, a die-hard fan of Pokémon on Nintendo 3DS has used his passion for the ga...",PLUTO PLANET
SOLAR,"A total solar eclipse is poised to adorn the skies over North America today, . Total eclipses, known for their dramatic effect of darkening the sk...",POKEMON


In [29]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [30]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = data_clean.index


In [31]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [32]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

# Topic Modeling

## Introduction

Another popular text analysis technique is called topic modeling. The ultimate goal of topic modeling is to find various topics that are present in your corpus. Each document in the corpus will be made up of at least one topic, if not multiple topics.

In this notebook, we will be covering the steps on how to do **Latent Dirichlet Allocation (LDA)**, which is one of many topic modeling techniques. It was specifically designed for text data.

To use a topic modeling technique, you need to provide (1) a document-term matrix and (2) the number of topics you would like the algorithm to pick up.

Once the topic modeling technique is applied, your job as a human is to interpret the results and see if the mix of words in each topic make sense. If they don't make sense, you can try changing up the number of topics, the terms in the document-term matrix, model parameters, or even try a different model.

## Topic Modeling - Attempt #1 (All Text)

In [33]:
import pandas as pd
import pickle

data = pd.read_pickle('dtm.pkl')
data

Unnamed: 0,able,access,accessible,accident,according,account,accounted,accumulate,accuracy,accurate,...,yadav,year,yearago,yearly,yearonyear,years,youre,yoy,zero,zone
BANKS,0,1,0,1,1,3,1,0,0,0,...,0,1,0,1,0,1,0,0,0,0
BUDGET,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,3,0,0,0,0
CAPITAL,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
FLIPKART,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
HDFC,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,4,0,0
LOCKDOWN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,7,0,0
MICROSOFT,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PLUTO,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
POKEMON,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
SOLAR,1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [35]:
# One of the required inputs is a term-document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,BANKS,BUDGET,CAPITAL,FLIPKART,HDFC,LOCKDOWN,MICROSOFT,PLUTO,POKEMON,SOLAR,SPOTIFY,STOCKS
able,0,0,0,0,0,0,0,0,0,1,0,0
access,1,0,0,0,0,0,0,0,0,1,1,0
accessible,0,0,0,0,0,0,0,0,1,0,0,0
accident,1,0,0,0,0,0,0,0,0,0,0,0
according,1,0,0,1,0,0,1,1,0,0,0,0


In [36]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [37]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term), we need to specify two other parameters - the number of topics and the number of passes. Let's start the number of topics at 2, see if the results make sense, and increase the number from there.

In [38]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.014*"rs" + 0.012*"ai" + 0.009*"pluto" + 0.008*"microsoft" + 0.006*"stock" + 0.006*"planet" + 0.005*"travel" + 0.005*"daily" + 0.005*"said" + 0.005*"level"'),
 (1,
  '0.025*"life" + 0.019*"insurance" + 0.018*"percent" + 0.014*"eclipse" + 0.012*"new" + 0.011*"rs" + 0.009*"crore" + 0.008*"insurers" + 0.008*"premium" + 0.007*"business"')]

In [39]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.019*"ai" + 0.015*"life" + 0.010*"rs" + 0.010*"pluto" + 0.010*"crore" + 0.010*"vaccine" + 0.009*"microsoft" + 0.009*"percent" + 0.008*"new" + 0.007*"said"'),
 (1,
  '0.023*"life" + 0.022*"insurance" + 0.019*"percent" + 0.019*"eclipse" + 0.012*"new" + 0.010*"business" + 0.008*"rs" + 0.008*"said" + 0.007*"bank" + 0.007*"solar"'),
 (2,
  '0.026*"rs" + 0.011*"stock" + 0.010*"daily" + 0.009*"level" + 0.009*"range" + 0.008*"trading" + 0.008*"target" + 0.008*"nifty" + 0.008*"loss" + 0.008*"pattern"')]

In [40]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.018*"pluto" + 0.012*"company" + 0.012*"planet" + 0.010*"said" + 0.010*"travel" + 0.009*"bus" + 0.008*"orbit" + 0.007*"lowell" + 0.006*"life" + 0.006*"percent"'),
 (1,
  '0.035*"eclipse" + 0.026*"ai" + 0.013*"solar" + 0.012*"microsoft" + 0.009*"playlist" + 0.008*"total" + 0.007*"uk" + 0.007*"totality" + 0.007*"path" + 0.006*"app"'),
 (2,
  '0.023*"insurance" + 0.020*"rs" + 0.020*"life" + 0.013*"percent" + 0.013*"bank" + 0.011*"banks" + 0.010*"business" + 0.008*"insurers" + 0.008*"stock" + 0.008*"private"'),
 (3,
  '0.033*"life" + 0.025*"percent" + 0.023*"new" + 0.019*"rs" + 0.018*"insurance" + 0.017*"crore" + 0.012*"premium" + 0.011*"vaccine" + 0.011*"regime" + 0.011*"insurers"')]

These topics aren't looking too great. We've tried modifying our parameters. Let's try modifying our terms list as well.

## Topic Modeling - Attempt #2 (Nouns Only)

One popular trick is to look only at terms that are from one part of speech (only nouns, only adjectives, etc.). Check out the UPenn tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html.

In [41]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [42]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcript
BANKS,in december a midsized private life insurance ceo found out that their bank partner a south indiabased private lender was in talks with a rival f...
BUDGET,removal of tax exemptions under section in the new tax regime could be dampener for life insurance products as well as equitylinked savings sche...
CAPITAL,the capital raising committee of hdfc life insurance company will meet on thursday to consider raising funds up to rs crore through issuance of d...
FLIPKART,flipkart an ecommerce firm announced the launch of bus services on its app on april the company has partnered with multiple state transport corpo...
HDFC,private life insurer hdfc life insurance posted a percent yearonyear yoy increase in its june quarter consolidated net profit at rs crore this ...
LOCKDOWN,the nationwide lockdown due to the coronavirus outbreak that began on march continued to have an impact on the new premium collections of life in...
MICROSOFT,microsoft has announced that it is opening an allnew ai hub in london mustafa suleyman ceo microsoft ai made the announcement in an official blog ...
PLUTO,some years ago on february american astronomer clyde tombaugh discovered pluto and named it as the ninth planet in our solar system soon after ...
POKEMON,in a creative effort to inform others about todays total solar eclipse a diehard fan of pokémon on nintendo has used his passion for the gaming c...
SOLAR,a total solar eclipse is poised to adorn the skies over north america today total eclipses known for their dramatic effect of darkening the sky a...


In [43]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [44]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [45]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

Unnamed: 0,transcript
BANKS,december life insurance ceo bank partner south lender talks firm tieup problem insurance company half revenues bank ceo management bank breakup me...
BUDGET,removal tax exemptions section tax regime life insurance products savings schemes funds budget speech finance minister nirmala sitharaman opting t...
CAPITAL,capital committee life insurance company thursday funds issuance debentures filing hdfc life insurance company meeting capital committee crc board...
FLIPKART,ecommerce firm launch bus services app company state transport corporations aggregators customers connections connectivity routes india addition f...
HDFC,life insurer life insurance percent yearonyear increase quarter profit rs crore back improvement investment income decline income vibha padalkar m...
LOCKDOWN,lockdown coronavirus march impact collections life insurers collections percent yoy crore insurers life insurance corporation decrease collection ...
MICROSOFT,microsoft allnew ai hub suleyman ceo microsoft ai announcement blog post ai london work language models infrastructure worldclass foundation model...
PLUTO,years astronomer clyde pluto planet system caltech professor astronomy plutos status pluto topic debate science community members union criteria p...
POKEMON,effort others todays eclipse fan pokémon nintendo passion gaming console simulation sun moon cartridges pokémon nintendo gaming fan depiction phen...
SOLAR,eclipse skies america today eclipses effect sky rarity locations phenomenon radius tools viewing photography below capture images eclipse simulati...


In [46]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Additional stop words
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))  # Convert to list

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)

# Get feature names
feature_names = cvn.get_feature_names_out()

# Create DataFrame
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names)
data_dtmn.index = data_nouns.index
print(data_dtmn)


           access  accident  account  accuracy  activities  activity  \
BANKS           1         1        2         0           0         0   
BUDGET          0         0        0         0           0         0   
CAPITAL         0         0        0         0           0         0   
FLIPKART        0         0        0         0           0         0   
HDFC            0         0        0         0           0         1   
LOCKDOWN        0         0        0         0           0         0   
MICROSOFT       0         0        0         0           0         0   
PLUTO           0         0        0         0           0         0   
POKEMON         0         0        1         0           0         0   
SOLAR           1         0        0         1           1         0   
SPOTIFY         1         0        0         0           0         0   
STOCKS          0         0        0         0           0         0   

           addition  aditya  agency  agent  ...  work  workforc

In [47]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [48]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()


[(0,
  '0.028*"eclipse" + 0.022*"life" + 0.018*"percent" + 0.013*"crore" + 0.012*"vaccine" + 0.010*"pluto" + 0.009*"company" + 0.008*"ai" + 0.008*"insurers" + 0.008*"insurance"'),
 (1,
  '0.034*"life" + 0.033*"insurance" + 0.022*"rs" + 0.022*"percent" + 0.013*"bank" + 0.011*"business" + 0.010*"insurers" + 0.010*"banks" + 0.010*"regime" + 0.009*"stock"')]

In [49]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.043*"rs" + 0.017*"stock" + 0.016*"percent" + 0.013*"range" + 0.011*"pattern" + 0.011*"loss" + 0.011*"target" + 0.009*"level" + 0.009*"breakout" + 0.009*"index"'),
 (1,
  '0.055*"life" + 0.042*"insurance" + 0.032*"percent" + 0.019*"insurers" + 0.013*"bank" + 0.012*"business" + 0.012*"vaccine" + 0.012*"premium" + 0.011*"crore" + 0.011*"regime"'),
 (2,
  '0.045*"eclipse" + 0.016*"ai" + 0.012*"playlist" + 0.011*"app" + 0.011*"totality" + 0.010*"path" + 0.009*"microsoft" + 0.009*"company" + 0.009*"travel" + 0.008*"users"')]

In [50]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.034*"rs" + 0.032*"life" + 0.024*"insurance" + 0.023*"percent" + 0.015*"regime" + 0.012*"stock" + 0.011*"company" + 0.011*"ai" + 0.011*"tax" + 0.010*"crore"'),
 (1,
  '0.039*"insurance" + 0.035*"life" + 0.021*"percent" + 0.021*"bank" + 0.019*"banks" + 0.018*"pluto" + 0.018*"business" + 0.014*"insurers" + 0.014*"planet" + 0.013*"playlist"'),
 (2,
  '0.022*"travel" + 0.019*"bus" + 0.012*"percent" + 0.012*"company" + 0.012*"bookings" + 0.012*"report" + 0.012*"growth" + 0.008*"india" + 0.008*"increase" + 0.008*"customers"'),
 (3,
  '0.053*"eclipse" + 0.028*"life" + 0.022*"vaccine" + 0.019*"percent" + 0.016*"insurers" + 0.016*"crore" + 0.012*"path" + 0.012*"totality" + 0.011*"yoy" + 0.011*"decrease"')]

## Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [51]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]
    return ' '.join(nouns_adj)

In [52]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,transcript
BANKS,december midsized private life insurance ceo bank partner south indiabased private lender talks rival firm possible tieup problem insurance compan...
BUDGET,removal tax exemptions section new tax regime life insurance products savings schemes mutual funds budget speech february finance minister nirmala...
CAPITAL,capital committee hdfc life insurance company thursday funds issuance debentures regulatory filing hdfc life insurance company meeting capital com...
FLIPKART,ecommerce firm launch bus services app company multiple state transport corporations private aggregators customers bus connections connectivity ro...
HDFC,private life insurer hdfc life insurance percent yearonyear yoy increase june quarter consolidated net profit rs crore back improvement investment...
LOCKDOWN,nationwide lockdown due coronavirus march impact new premium collections life insurers firstyear collections percent yearonyear yoy crore private ...
MICROSOFT,microsoft allnew ai hub london mustafa suleyman ceo microsoft ai announcement official blog post microsoft ai london work stateoftheart language m...
PLUTO,years february american astronomer clyde pluto ninth planet solar system caltech professor astronomy brown plutos planetary status pluto topic deb...
POKEMON,creative effort others todays total solar eclipse diehard fan pokémon nintendo passion gaming console accessible simulation sun moon cartridges po...
SOLAR,total solar eclipse skies north america today total eclipses dramatic effect sky rarity observable select locations phenomenon visible narrow radi...


In [53]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
feature_names = cvna.get_feature_names_out()
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=feature_names)
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,able,access,accessible,accident,account,accuracy,accurate,activities,activity,addition,...,works,worldclass,yadav,year,yearago,yearly,yearonyear,years,yoy,zone
BANKS,0,1,0,1,2,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
BUDGET,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3,0,0
CAPITAL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
FLIPKART,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
HDFC,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,1,0,4,0
LOCKDOWN,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,6,0
MICROSOFT,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
PLUTO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
POKEMON,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
SOLAR,1,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [55]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.016*"insurance" + 0.014*"eclipse" + 0.014*"ai" + 0.014*"rs" + 0.014*"life" + 0.011*"percent" + 0.009*"bank" + 0.007*"banks" + 0.007*"microsoft" + 0.007*"business"'),
 (1,
  '0.029*"life" + 0.021*"percent" + 0.019*"new" + 0.016*"insurance" + 0.012*"crore" + 0.010*"premium" + 0.009*"vaccine" + 0.009*"insurers" + 0.008*"regime" + 0.008*"pluto"')]

In [56]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.028*"rs" + 0.025*"ai" + 0.015*"microsoft" + 0.012*"stock" + 0.011*"daily" + 0.010*"range" + 0.008*"stop" + 0.008*"target" + 0.008*"pattern" + 0.008*"loss"'),
 (1,
  '0.038*"life" + 0.029*"insurance" + 0.023*"percent" + 0.021*"eclipse" + 0.016*"new" + 0.013*"insurers" + 0.012*"premium" + 0.010*"private" + 0.009*"bank" + 0.009*"business"'),
 (2,
  '0.016*"pluto" + 0.014*"percent" + 0.012*"planet" + 0.012*"playlist" + 0.009*"travel" + 0.009*"bus" + 0.009*"new" + 0.007*"orbit" + 0.006*"crore" + 0.006*"business"')]

In [57]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.040*"eclipse" + 0.025*"rs" + 0.015*"solar" + 0.011*"stock" + 0.010*"daily" + 0.010*"total" + 0.008*"totality" + 0.008*"range" + 0.008*"path" + 0.007*"loss"'),
 (1,
  '0.037*"life" + 0.024*"new" + 0.023*"percent" + 0.020*"insurance" + 0.014*"vaccine" + 0.013*"insurers" + 0.013*"premium" + 0.013*"regime" + 0.013*"crore" + 0.013*"pluto"'),
 (2,
  '0.028*"playlist" + 0.013*"ai" + 0.013*"feature" + 0.010*"users" + 0.010*"spotify" + 0.007*"uk" + 0.007*"music" + 0.007*"australia" + 0.007*"playlists" + 0.004*"new"'),
 (3,
  '0.029*"insurance" + 0.029*"life" + 0.025*"percent" + 0.018*"ai" + 0.016*"business" + 0.013*"bank" + 0.013*"private" + 0.012*"banks" + 0.012*"company" + 0.011*"microsoft"')]

## Identify Topics in Each Document

Out of the 9 topic models we looked at, the nouns and adjectives, 4 topic one made the most sense. So let's pull that down here and run it through some more iterations to get more fine-tuned topics.

In [58]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.063*"eclipse" + 0.024*"solar" + 0.015*"total" + 0.013*"totality" + 0.013*"path" + 0.010*"app" + 0.008*"minutes" + 0.006*"cloud" + 0.006*"visible" + 0.006*"timer"'),
 (1,
  '0.020*"pluto" + 0.015*"planet" + 0.015*"playlist" + 0.014*"percent" + 0.012*"new" + 0.009*"orbit" + 0.009*"crore" + 0.009*"business" + 0.007*"value" + 0.007*"feature"'),
 (2,
  '0.039*"life" + 0.030*"insurance" + 0.029*"percent" + 0.018*"insurers" + 0.015*"private" + 0.015*"premium" + 0.014*"bank" + 0.013*"vaccine" + 0.012*"new" + 0.012*"banks"'),
 (3,
  '0.026*"rs" + 0.023*"life" + 0.019*"ai" + 0.018*"insurance" + 0.014*"new" + 0.012*"regime" + 0.012*"percent" + 0.011*"microsoft" + 0.010*"stock" + 0.009*"tax"')]

These four topics look pretty decent. Let's settle on these for now.
* Topic 0: insurance,life
* Topic 1: ai,life
* Topic 2: eclipse,life
* Topic 3: rs,stock

In [59]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(2, 'BANKS'),
 (3, 'BUDGET'),
 (3, 'CAPITAL'),
 (2, 'FLIPKART'),
 (1, 'HDFC'),
 (2, 'LOCKDOWN'),
 (3, 'MICROSOFT'),
 (1, 'PLUTO'),
 (0, 'POKEMON'),
 (0, 'SOLAR'),
 (1, 'SPOTIFY'),
 (3, 'STOCKS')]

These four topics look pretty decent. Let's settle on these for now.
* Topic 0: insurance,life[BANKS,CAPITAL,FLIPKART,HDFC,PLUTO]
* Topic 1: ai,life[BUDGET,MICROSOFT]
* Topic 2: eclipse,life[LOCKDOWN,POKEMON,SOLAR]
* Topic 3: rs,stock,playlist[STOCKS,SPOTOFY]

##Assignment

1. Try further modifying the parameters of the topic models above and see if you can get better topics.


In [60]:
# Attempt #1
ldana_1 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=100, alpha='auto', eta='auto')
ldana_1.print_topics()

# Attempt #2
ldana_2 = models.LdaModel(corpus=corpusna, num_topics=6, id2word=id2wordna, passes=150, alpha='auto', eta='auto')
ldana_2.print_topics()

# Attempt #3
ldana_3 = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=200, alpha=0.1, eta=0.01)
ldana_3.print_topics()


[(0,
  '0.044*"life" + 0.033*"insurance" + 0.032*"percent" + 0.021*"rs" + 0.021*"new" + 0.014*"premium" + 0.014*"insurers" + 0.012*"private" + 0.012*"business" + 0.011*"crore"'),
 (1,
  '0.108*"eclipse" + 0.059*"solar" + 0.049*"total" + 0.049*"path" + 0.030*"moon" + 0.030*"pokémon" + 0.020*"sun" + 0.020*"simulation" + 0.020*"event" + 0.020*"totality"'),
 (2,
  '0.046*"pluto" + 0.046*"banks" + 0.040*"bank" + 0.035*"planet" + 0.025*"data" + 0.019*"orbit" + 0.016*"bancassurance" + 0.016*"channel" + 0.016*"complaints" + 0.012*"scientists"'),
 (3,
  '0.064*"eclipse" + 0.064*"ai" + 0.031*"microsoft" + 0.020*"solar" + 0.020*"uk" + 0.016*"app" + 0.014*"suleyman" + 0.014*"totality" + 0.014*"london" + 0.011*"cloud"')]

In [61]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana_1[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(4, 'BANKS'),
 (4, 'BUDGET'),
 (3, 'CAPITAL'),
 (1, 'FLIPKART'),
 (4, 'HDFC'),
 (4, 'LOCKDOWN'),
 (4, 'MICROSOFT'),
 (3, 'PLUTO'),
 (2, 'POKEMON'),
 (1, 'SOLAR'),
 (4, 'SPOTIFY'),
 (3, 'STOCKS')]

2. Create a new topic model that includes terms from a different [part of speech](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) and see if you can get better topics.

In [62]:
# Load the cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Define a function to extract verbs from the text
def verbs(text):
    '''Given a string of text, tokenize the text and pull out only the verbs.'''
    is_verb = lambda pos: pos[:2] == 'VB'  # 'VB' indicates verbs in the Penn Treebank tag set
    tokenized = word_tokenize(text)
    all_verbs = [word for (word, pos) in pos_tag(tokenized) if is_verb(pos)]
    return ' '.join(all_verbs)

# Apply the verbs function to the transcripts to filter only on verbs
data_verbs = pd.DataFrame(data_clean.transcript.apply(verbs))

# Create a new document-term matrix using only verbs
cv_verbs = CountVectorizer()
data_cv_verbs = cv_verbs.fit_transform(data_verbs.transcript)
feature_names_verbs = cv_verbs.get_feature_names_out()
data_dtm_verbs = pd.DataFrame(data_cv_verbs.toarray(), columns=feature_names_verbs)
data_dtm_verbs.index = data_verbs.index

# Create the gensim corpus
corpus_verbs = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm_verbs.transpose()))

# Create the vocabulary dictionary
id2word_verbs = dict((v, k) for k, v in cv_verbs.vocabulary_.items())

# Perform topic modeling on the new document-term matrix
lda_verbs = models.LdaModel(corpus=corpus_verbs, num_topics=4, id2word=id2word_verbs, passes=80)
lda_verbs.print_topics()


[(0,
  '0.003*"span" + 0.003*"using" + 0.003*"occur" + 0.003*"created" + 0.003*"develop" + 0.003*"see" + 0.003*"begin" + 0.003*"put" + 0.003*"says" + 0.003*"hdfc"'),
 (1,
  '0.064*"is" + 0.042*"has" + 0.036*"said" + 0.032*"be" + 0.028*"are" + 0.024*"was" + 0.020*"have" + 0.014*"been" + 0.014*"rs" + 0.010*"buy"'),
 (2,
  '0.049*"is" + 0.038*"are" + 0.029*"saw" + 0.029*"rs" + 0.029*"have" + 0.024*"was" + 0.018*"showed" + 0.018*"be" + 0.012*"been" + 0.012*"given"'),
 (3,
  '0.040*"is" + 0.032*"said" + 0.017*"announced" + 0.017*"make" + 0.017*"hiring" + 0.017*"has" + 0.010*"begin" + 0.010*"according" + 0.010*"made" + 0.010*"create"')]