# Import modules

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Helper function for getting articles content and their titles 

In [2]:
def get_fresh_bbc_news():

    url = 'https://www.bbc.com/news/world'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html5lib' )

    regions = set()
    for link in soup.find_all('a'):
        if '/news/world/' in link.get('href'):
            regions.add(link.get('href').split('/')[3])

    df = pd.DataFrame()
    for region in regions:

        links = set()
        url = f'https://www.bbc.com/news/world/{region}'
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html5lib' )
        for link in soup.find_all('a',{'class':'qa-heading-link lx-stream-post__header-link'}):
            
            if 'live' not in link.get('href') and region.replace('_','-').replace('-and-','-') in link.get('href'):
                links.add('https://www.bbc.com' + link.get('href'))

        links = list(links)

        articles = {}
        for article_link in links:

            r = requests.get(article_link)
            soup = BeautifulSoup(r.text, 'html5lib' )
            
            title = soup.find('h1').text
            text = ''
            
            for txt in soup.find_all('p',{'class':'ssrcss-1q0x1qg-Paragraph eq5iqo00'}):
                text +=(txt.text)
            
            articles[title] = text


        intermediary_df = pd.DataFrame([[region]*len(links),links,list(articles.keys()),list(articles.values())]).T.dropna()
        intermediary_df.columns = ['Region','Article_link','Title','Content']
        df = pd.concat([df,intermediary_df],axis=0)
    
    return df

In [3]:
articles = get_fresh_bbc_news()
articles = articles[['Title','Content']].loc[articles.Content!='']

In [4]:
articles

Unnamed: 0,Title,Content
0,Ukraine: Shortage of accommodation for refugee...,There are no spaces available in state-provide...
1,Italian PM Draghi's government in crisis over ...,Italy's government under Prime Minister Mario ...
2,"Girl, 14, killed in Cobra rollercoaster ride a...",A 14-year-old girl from Copenhagen has died in...
3,Europe wildfires: Heatwave fuels blazes across...,This video can not be playedA heatwave spreadi...
4,Ukraine war: Deal in sight to end Ukrainian gr...,Talks aimed at resuming Ukrainian grain export...
...,...,...
7,Flash of light as meteor spotted over Chile,This video can not be playedA meteor which lit...
8,Nicaragua expels Mother Teresa's nuns in lates...,Nuns from the order founded by Mother Teresa h...
9,Antigua's ban on same-sex acts ruled unconstit...,A law criminalising same-sex acts between cons...
10,Haiti violence: Scores killed as gangs fight f...,At least 89 people are reported to have been k...


# Remove stop words, apply vectorisation and TF-IDF transformation 

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
dtm = tfidf.fit_transform(articles['Content'])

# Instantiate Non Negative Matrix factorisation model and return most frequent words

In [6]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10, random_state=42)
nmf.fit(dtm)

no_of_words = len(tfidf.get_feature_names_out())

for i, topic in enumerate(nmf.components_):
    print(f'Topic {i+1} most frequent words:')
    print([tfidf.get_feature_names_out()[x] for x in topic.argsort()[-5:]])

Topic 1 most frequent words:
['killed', 'ms', 'said', 'violence', 'police']
Topic 2 most frequent words:
['lanka', 'minister', 'rajapaksa', 'prime', 'sri']
Topic 3 most frequent words:
['trump', 'iran', 'president', 'mr', 'biden']
Topic 4 most frequent words:
['war', 'ukrainian', 'russian', 'ukraine', 'russia']
Topic 5 most frequent words:
['emergency', 'roads', 'australia', 'flooding', 'sydney']
Topic 6 most frequent words:
['dozens', 'said', 'body', '000', 'burned']
Topic 7 most frequent words:
['security', 'protests', 'iranian', 'arrested', 'iran']
Topic 8 most frequent words:
['australia', 'legal', 'mr', 'case', 'court']
Topic 9 most frequent words:
['flag', 'new', 'party', 'president', 'government']
Topic 10 most frequent words:
['million', 'stay', 'community', 'said', 'use']


# Assign Topic number to each article

In [7]:
topic_results = nmf.transform(dtm)
topic_results = topic_results.argmax(axis=1)
articles['Topic'] = topic_results

In [8]:
articles

Unnamed: 0,Title,Content,Topic
0,Ukraine: Shortage of accommodation for refugee...,There are no spaces available in state-provide...,3
1,Italian PM Draghi's government in crisis over ...,Italy's government under Prime Minister Mario ...,8
2,"Girl, 14, killed in Cobra rollercoaster ride a...",A 14-year-old girl from Copenhagen has died in...,0
3,Europe wildfires: Heatwave fuels blazes across...,This video can not be playedA heatwave spreadi...,5
4,Ukraine war: Deal in sight to end Ukrainian gr...,Talks aimed at resuming Ukrainian grain export...,3
...,...,...,...
7,Flash of light as meteor spotted over Chile,This video can not be playedA meteor which lit...,5
8,Nicaragua expels Mother Teresa's nuns in lates...,Nuns from the order founded by Mother Teresa h...,8
9,Antigua's ban on same-sex acts ruled unconstit...,A law criminalising same-sex acts between cons...,7
10,Haiti violence: Scores killed as gangs fight f...,At least 89 people are reported to have been k...,5


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def split_into_topics(articles):

    tfidf = TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
    dtm = tfidf.fit_transform(articles['Content'])

    nmf = NMF(n_components=10, random_state=42)
    nmf.fit(dtm)

    no_of_words = len(tfidf.get_feature_names_out())

    for i, topic in enumerate(nmf.components_):
        print(f'Topic {i+1} most frequent words:')
        print([tfidf.get_feature_names_out()[x] for x in topic.argsort()[-5:]])


    topic_results = nmf.transform(dtm)
    topic_results = topic_results.argmax(axis=1)
    articles['Topic'] = topic_results

    return articles


In [10]:
split_into_topics(articles)

Topic 1 most frequent words:
['killed', 'ms', 'said', 'violence', 'police']
Topic 2 most frequent words:
['lanka', 'minister', 'rajapaksa', 'prime', 'sri']
Topic 3 most frequent words:
['trump', 'iran', 'president', 'mr', 'biden']
Topic 4 most frequent words:
['war', 'ukrainian', 'russian', 'ukraine', 'russia']
Topic 5 most frequent words:
['emergency', 'roads', 'australia', 'flooding', 'sydney']
Topic 6 most frequent words:
['dozens', 'said', 'body', '000', 'burned']
Topic 7 most frequent words:
['security', 'protests', 'iranian', 'arrested', 'iran']
Topic 8 most frequent words:
['australia', 'legal', 'mr', 'case', 'court']
Topic 9 most frequent words:
['flag', 'new', 'party', 'president', 'government']
Topic 10 most frequent words:
['million', 'stay', 'community', 'said', 'use']


Unnamed: 0,Title,Content,Topic
0,Ukraine: Shortage of accommodation for refugee...,There are no spaces available in state-provide...,3
1,Italian PM Draghi's government in crisis over ...,Italy's government under Prime Minister Mario ...,8
2,"Girl, 14, killed in Cobra rollercoaster ride a...",A 14-year-old girl from Copenhagen has died in...,0
3,Europe wildfires: Heatwave fuels blazes across...,This video can not be playedA heatwave spreadi...,5
4,Ukraine war: Deal in sight to end Ukrainian gr...,Talks aimed at resuming Ukrainian grain export...,3
...,...,...,...
7,Flash of light as meteor spotted over Chile,This video can not be playedA meteor which lit...,5
8,Nicaragua expels Mother Teresa's nuns in lates...,Nuns from the order founded by Mother Teresa h...,8
9,Antigua's ban on same-sex acts ruled unconstit...,A law criminalising same-sex acts between cons...,7
10,Haiti violence: Scores killed as gangs fight f...,At least 89 people are reported to have been k...,5
