# Import modules

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Helper function for getting articles content and their titles 

In [2]:
def get_fresh_bbc_news():

    url = 'https://www.bbc.com/news/world'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html5lib' )

    regions = set()
    for link in soup.find_all('a'):
        if '/news/world/' in link.get('href'):
            regions.add(link.get('href').split('/')[3])

    df = pd.DataFrame()
    for region in regions:

        links = set()
        url = f'https://www.bbc.com/news/world/{region}'
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html5lib' )
        for link in soup.find_all('a',{'class':'qa-heading-link lx-stream-post__header-link'}):
            
            if 'live' not in link.get('href') and region.replace('_','-').replace('-and-','-') in link.get('href'):
                links.add('https://www.bbc.com' + link.get('href'))

        links = list(links)

        articles = {}
        for article_link in links:

            r = requests.get(article_link)
            soup = BeautifulSoup(r.text, 'html5lib' )
            
            title = soup.find('h1').text
            text = ''
            
            for txt in soup.find_all('p',{'class':'ssrcss-1q0x1qg-Paragraph eq5iqo00'}):
                text +=(txt.text)
            
            articles[title] = text


        intermediary_df = pd.DataFrame([[region]*len(links),links,list(articles.keys()),list(articles.values())]).T.dropna()
        intermediary_df.columns = ['Region','Article_link','Title','Content']
        df = pd.concat([df,intermediary_df],axis=0)
    
    return df

In [3]:
articles = get_fresh_bbc_news()
articles = articles[['Title','Content']].loc[articles.Content!='']

In [4]:
articles

Unnamed: 0,Title,Content
0,Australia floods: Kangaroo rescued from deep w...,This video can not be playedA kangaroo has bee...
1,Twelve religious group members arrested over A...,Twelve members of a religious group have been ...
2,Sydney floods: Tens of thousands told to evacuate,This video can not be playedTens of thousands ...
3,Australia census: Five ways the country is cha...,The results of Australia's five-yearly census ...
4,Sydney climate protests: Activists block stree...,Climate protesters have brought parts of Sydne...
...,...,...
3,Jafar Panahi: Acclaimed film maker held in Ira...,The award-winning Iranian film maker Jafar Pan...
4,Saudi artists confront change as kingdom shifts,"""As a child sleeping in the back of my parents..."
5,Jenin in the West Bank: Guns and grief on the ...,This video can not be playedIsrael's military ...
6,Christian numbers in Iraq dangerously low - Ar...,This video can not be playedAn Iraqi Christian...


# Remove stop words, apply vectorisation and TF-IDF transformation 

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')
dtm = tfidf.fit_transform(articles['Content'])

# Instantiate Non Negative Matrix factorisation model and return most frequent words

In [32]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10, random_state=42)
nmf.fit(dtm)

no_of_words = len(tfidf.get_feature_names_out())

for i, topic in enumerate(nmf.components_):
    print(f'Topic {i+1} most frequent words:')
    print([tfidf.get_feature_names_out()[x] for x in topic.argsort()[-5:]])

Topic 1 most frequent words:
['president', 'said', 'party', 'government', 'mr']
Topic 2 most frequent words:
['white', 'iran', 'president', 'mr', 'biden']
Topic 3 most frequent words:
['gotabaya', 'president', 'lanka', 'rajapaksa', 'sri']
Topic 4 most frequent words:
['roads', 'flood', 'rain', 'australia', 'sydney']
Topic 5 most frequent words:
['ukrainian', 'war', 'russian', 'russia', 'ukraine']
Topic 6 most frequent words:
['media', 'indian', 'arrested', 'india', 'film']
Topic 7 most frequent words:
['supreme', 'states', 'court', 'ms', 'said']
Topic 8 most frequent words:
['killed', 'arrested', 'said', 'people', 'police']
Topic 9 most frequent words:
['summit', 'australian', 'met', 'minister', 'prime']
Topic 10 most frequent words:
['areas', 'national', '000', 'space', 'fires']


# Assign Topic number to each article

In [28]:
topic_results = nmf.transform(dtm)
topic_results = topic_results.argmax(axis=1)
articles['Topic'] = topic_results

In [29]:
articles

Unnamed: 0,Title,Content,Topic
0,Australia floods: Kangaroo rescued from deep w...,This video can not be playedA kangaroo has bee...,3
1,Twelve religious group members arrested over A...,Twelve members of a religious group have been ...,7
2,Sydney floods: Tens of thousands told to evacuate,This video can not be playedTens of thousands ...,3
3,Australia census: Five ways the country is cha...,The results of Australia's five-yearly census ...,3
4,Sydney climate protests: Activists block stree...,Climate protesters have brought parts of Sydne...,3
...,...,...,...
3,Jafar Panahi: Acclaimed film maker held in Ira...,The award-winning Iranian film maker Jafar Pan...,5
4,Saudi artists confront change as kingdom shifts,"""As a child sleeping in the back of my parents...",9
5,Jenin in the West Bank: Guns and grief on the ...,This video can not be playedIsrael's military ...,7
6,Christian numbers in Iraq dangerously low - Ar...,This video can not be playedAn Iraqi Christian...,9
