1. Застосувати приховане семантичне індексування бібліотеки scikit-learn для моделювання тем. Вивести документи, що зробили найбільший внесок у теми. Вивести найбільш важливі теми для випадково обраних чотирьох документів.
2. Використати текст austen-sense.txt з корпусу gutenberg бібліотеки nltk та вивести ключові біграми.

## Завантаження файлу у датафрейм

In [1]:
import pandas as pd

# Load the BBC news dataset; the file is tab-separated despite its .csv name.
bbc_corpus = pd.read_csv('bbc-news-data.csv', sep='\t')
bbc_corpus

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Попередня обробка корпусу

In [2]:
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import re

# English stop words held in a set, which the preprocessing functions test tokens against.
stop_words = set(stopwords.words('english'))
# Single tokenizer instance reused by the preprocessing functions.
tokenizer = WordPunctTokenizer()


def preprocess_document(doc):
    """Normalize a raw document into a space-separated string of content words.

    Every character that is neither an ASCII letter nor whitespace is removed,
    the text is lower-cased, stripped and tokenized, and tokens that are stop
    words or shorter than three characters are dropped.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I | re.A` is interpreted as a cap
    # on the number of substitutions and the flags are never applied.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = tokenizer.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    return ' '.join(filtered_tokens)

In [3]:
# Swap every raw article body for its cleaned form, then preview the first rows.
bbc_corpus['content'] = [preprocess_document(raw_text) for raw_text in bbc_corpus['content']]
bbc_corpus['content'].head(5)

0    quarterly profits media giant timewarner jumpe...
1    dollar hit highest level euro almost three mon...
2    owners embattled russian oil giant yukos ask b...
3    british airways blamed high fuel prices drop p...
4    shares drinks food firm allied domecq risen sp...
Name: content, dtype: object

## Використання моделі TF-IDF та застосування прихованого семантичного індексування

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF over unigrams and bigrams; min_df / max_df trim very rare and very common terms.
cv = TfidfVectorizer(min_df=10, max_df=0.8, ngram_range=(1, 2))
cv_features = cv.fit_transform(bbc_corpus['content'])
# Feature names kept as an array so they can later be indexed by arrays of term positions.
vocabulary = np.array(cv.get_feature_names_out())

In [5]:
from sklearn.decomposition import TruncatedSVD

# LSI: truncated SVD of the TF-IDF matrix, keeping n_topics components.
n_topics = 8
# random_state is fixed so repeated runs give the same decomposition.
lsi_model = TruncatedSVD(n_components=n_topics, random_state=1234)
# Later cells read this as one row per document and one column per topic.
document_topics = lsi_model.fit_transform(cv_features)
document_topics

array([[ 0.18218142,  0.02098702, -0.17026098, ...,  0.02469057,
        -0.0096781 , -0.02168635],
       [ 0.19756368,  0.0714441 , -0.07970341, ..., -0.05810203,
        -0.03624525,  0.05365706],
       [ 0.11370094,  0.05905951, -0.05382132, ...,  0.31311135,
         0.01192182, -0.02488919],
       ...,
       [ 0.2504809 ,  0.06220223, -0.06432383, ...,  0.12926598,
        -0.0288901 ,  0.15075599],
       [ 0.15074839,  0.02666993, -0.07701499, ...,  0.05068757,
        -0.04511466,  0.15102995],
       [ 0.36115399, -0.13225848,  0.00265985, ..., -0.05134182,
         0.01199036, -0.09185554]])

## Документи, що зробили найбільший внесок у теми

In [6]:
top_n = 5

# One record per (topic, document) pair; the frame is built once after the loop.
top_document_records = []

for topic_idx in range(document_topics.shape[1]):
    # Documents are ranked by the absolute value of their coordinate on this topic.
    contributions = np.abs(document_topics[:, topic_idx])
    total_contribution = contributions.sum()
    # argsort is ascending: reverse it and keep the top_n strongest documents.
    top_documents_indices = np.argsort(contributions)[::-1][:top_n]
    # Share of the topic's total absolute weight carried by each document, in percent.
    top_contributions = (contributions[top_documents_indices] / total_contribution) * 100
    # argsort yields row positions, so the texts are selected by position
    # (.iloc) rather than by index label.
    top_docs = bbc_corpus['content'].iloc[top_documents_indices]

    # Pairing texts with their scores keeps the columns the same length even
    # when the corpus holds fewer than top_n documents.
    top_document_records.extend(
        {'document': document, 'contribution': contribution, 'topic': topic_idx}
        for document, contribution in zip(top_docs, top_contributions)
    )

top_documents_per_topic = pd.DataFrame(top_document_records, columns=['document', 'contribution', 'topic'])
top_documents_per_topic.set_index('topic')

Unnamed: 0_level_0,document,contribution
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,lord chancellor defended government plans intr...,0.094505
0,online role playing games timeconsuming enthra...,0.09324
0,new bigscreen version magic roundabout release...,0.086246
0,general election best chance pressure groups g...,0.084404
0,called masochism strategy runup iraq war tony ...,0.080968
1,tony blair said voters wait labours manifesto ...,0.186577
1,tony blair launched attack conservative spendi...,0.184959
1,michael howard finally revealed full scale pla...,0.182582
1,tony blairs feud gordon brown damaging way gov...,0.176457
1,tony blair sought reassure labour backbenchers...,0.173624


## Найбільш важливі теми для випадково обраних чотирьох документів

In [7]:
top_topics = 5
# A seeded generator makes the "random" sample identical on every
# Restart & Run All; change the seed to draw a different set of documents.
documents_indices = np.random.default_rng(1234).choice(document_topics.shape[0], size=4, replace=False)
random_documents = document_topics[documents_indices]
# Topic ids ordered by descending absolute weight, truncated to top_topics per document.
most_important_topics = np.argsort(np.abs(random_documents), axis=1)[:, ::-1][:, :top_topics]
most_important_topics = pd.DataFrame(
    most_important_topics,
    # documents_indices holds row positions, so select the texts with .iloc;
    # the Series is passed directly so the row labels form a plain index
    # named 'content'.
    index=bbc_corpus['content'].iloc[documents_indices],
    # Size the header from the ranked array so it still matches when the model
    # has fewer than top_topics topics.
    columns=[f'Top-{i + 1}' for i in range(most_important_topics.shape[1])],
)
most_important_topics

Unnamed: 0_level_0,Top-1,Top-2,Top-3,Top-4,Top-5
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
british producers wife swap taking legal action show claim blatant wholescale copycat programme rdf media makes show network abc filed damages claim million million foxs trading spouses abc bought rights british show first aired became hit channel network part claim supported rdfs action respect producing partners right protect intellectual property whatever manner deem appropriate said abc statement spokesman fox said seen details legal action could comment show first screened june criticised press similarities wife swap abc originally planned call programme trading moms changed avoid confusion fox version earlier year nbc network claimed foxs boxing show next great champ hurriedly produced ensure programme first screened nbc alleged boxing regulations violated failed attempt show pulled fox show proved ratings flop nbcs contender due begin february,0,3,5,2,6
pensioners promised energy savings liberal democrats snow cold temperatures continue party says plans could save average pensioner every year cut winter deaths government gives winter fuel households people people tories promise keep payments lib dems would allow people swap winter fuel payments discounts home insulation shadow local government secretary davey said current scheme helped older people new liberal democrat approach much end scandal tens thousands old people dying winter cold every year vouchers designed let pensioners choose list approved energy supplies would compete business offering discounts home insulation schemes plan would boost energy conservation says party insulation could save every year pensioner households using money intelligently present,0,1,4,2,3
china overtook become japans biggest trading partner according numbers released japans finance ministry wednesday china accounted japans trade compared ahead china came second change highlights chinas growing importance economic powerhouse japans imports exports china hong kong added yen bnbn highest figure japanese trade china since records began compares yen trade trade hurt oneoff factors including month ban beef imports following discovery cow infected mad cow disease bse however economists predict china become even important japanese trading partner coming years tuesday figures showed chinas economy grew experts say overall growth picture remains strong analysts see two spurs future growth chinas membership world trade organisation lower trade tariffs japans trade surplus grew trillion yen half surplus trillion yen accounted trade december surplus grew year ago trillion yen thanks strongerthanexpected exports,4,0,2,5,3
david mcletchie resigned post partner legal firm following criticism dual role scottish conservative leader insisted legal work tods murray influence causes supports friday said tendered resignation partner immediate effect mcletchie received advice holyrood officials details needed declare labour said cleverly asked paid advocacy tory spokesman totally refuted wrongdoing mcletchie received advice clerk standards committee concern signing parliamentary motion questioning expansion plans edinburgh airport msp partner tods murray client opposing development mcletchie complaint made concerns raised sought guidance standards committee clarify position advised exercise judgement avoid perception conflict said done nothing wrong explaining reason quitting post mcletchie said greatly concerned recent publicity surrounding association tods murray however wish see similar situation arise avoid misconceptions future mindful good name tods murray confidentiality clients entitled brought forward date retirement firm would otherwise happened later year proud part tods murray last years wish well future labour msp christine may said mcletchie clever ask clerk consider conduct respect section code almost bound get answer wanted enquiry since stands accused breaching section section paid advocacy said section members interest order legally obliges msps declare registrable interests taking part related parliamentary proceedings interest would prejudice give appearance prejudicing ability participate disinterested manner however msps code conduct recognises wider definition parliamentary proceedings including nonstatutory requirement make declaration relation written notices motions letter holyroods chamber office chief ken hughes also made clear mcletchie need list clients worked solicitor commenting mcletchies decision stand scottish labour party spokesman said mean mcletchie doesnt breach paid advocacy rules future however doesnt change fact full investigation whether done past 
scottish national party holyrood leader nicola sturgeon accused mcletchie failing properly serve constituents sturgeon said think whole episode damaging mcletchie sure reflecting added thought tories irrelevant party would lose sleep peter misselbrook executive partner tods murray said mcletchie considering retirement later year added david decided announcement made fully understand appreciate reasons,0,1,2,5,7


## Терми, що ідентифікують теми

In [8]:
# Order the term positions of every topic by absolute weight (ascending) ...
terms_topics = np.argsort(np.abs(lsi_model.components_), axis=1)
# ... then flip to descending order and keep the five heaviest terms per topic.
terms_topics = terms_topics[:, ::-1][:, :5]
vocabulary[terms_topics]

array([['would', 'people', 'new', 'also', 'year'],
       ['labour', 'election', 'film', 'blair', 'best'],
       ['labour', 'election', 'blair', 'mobile', 'music'],
       ['film', 'best', 'awards', 'award', 'films'],
       ['people', 'growth', 'mobile', 'economy', 'users'],
       ['yukos', 'economy', 'growth', 'court', 'mobile'],
       ['film', 'england', 'champion', 'wales', 'seed'],
       ['software', 'music', 'club', 'users', 'security']], dtype=object)

## Завантаження корпусу austen-sense та його попередня обробка

In [9]:
from nltk.corpus import gutenberg

# Sentences of austen-sense.txt from the NLTK Gutenberg corpus; each sentence
# is later joined with spaces, so it is presumably a sequence of string tokens.
sense_corpus = gutenberg.sents('austen-sense.txt')

In [10]:
def preprocess_sentence(sentence):
    """Normalize a tokenized sentence into a list of content words.

    The tokens are joined with spaces, every character that is neither an
    ASCII letter nor whitespace is removed, the text is lower-cased, stripped
    and re-tokenized, and tokens that are stop words or shorter than three
    characters are dropped.

    Args:
        sentence: Iterable of string tokens making up one sentence.

    Returns:
        List of the surviving lower-case tokens (empty when nothing survives).
    """
    doc = ' '.join(sentence)
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I | re.A` is interpreted as a cap
    # on the number of substitutions and the flags are never applied.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    return [
        token
        for token in tokenizer.tokenize(doc)
        if token not in stop_words and len(token) > 2
    ]

In [11]:
# Clean every sentence in turn; the result is a list of token lists.
sense_corpus = list(map(preprocess_sentence, sense_corpus))
print(sense_corpus[:5])

[['sense', 'sensibility', 'jane', 'austen'], ['chapter'], ['family', 'dashwood', 'long', 'settled', 'sussex'], ['estate', 'large', 'residence', 'norland', 'park', 'centre', 'property', 'many', 'generations', 'lived', 'respectable', 'manner', 'engage', 'general', 'good', 'opinion', 'surrounding', 'acquaintance'], ['late', 'owner', 'estate', 'single', 'man', 'lived', 'advanced', 'age', 'many', 'years', 'life', 'constant', 'companion', 'housekeeper', 'sister']]


## Топ-5 ключових біграм

In [12]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

bigram_measures = BigramAssocMeasures()
# Build the finder from the per-sentence token lists produced by the preprocessing step.
finder = BigramCollocationFinder.from_documents(sense_corpus)
# Frequency filter with threshold 5, applied before scoring.
finder.apply_freq_filter(5)
# The five best bigrams according to the PMI association measure.
finder.nbest(bigram_measures.pmi, 5)

[('piano', 'forte'),
 ('bartlett', 'buildings'),
 ('combe', 'magna'),
 ('lock', 'hair'),
 ('burst', 'forth')]