In [36]:
import os

import pandas as pd
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# Load the BBC News dataset.
# The CSV location is read from the BBC_NEWS_CSV environment variable when it is
# set, so the notebook can run on other machines without editing this cell; the
# original local path is kept as the default so existing runs are unaffected.
DATA_PATH = os.environ.get(
    'BBC_NEWS_CSV',
    'C:/Users/jacks/OneDrive/Área de Trabalho/metodos/BBC News Test.csv',
)
df = pd.read_csv(DATA_PATH)

In [39]:
# Preview the loaded data: the first rows of the DataFrame, a blank line,
# then the DataFrame's column index.
print(df.head(), df.columns, sep='\n\n')

   ArticleId                                               Text
0       1018  qpr keeper day heads for preston queens park r...
1       1319  software watching while you work software that...
2       1138  d arcy injury adds to ireland woe gordon d arc...
3        459  india s reliance family feud heats up the ongo...
4       1020  boro suffer morrison injury blow middlesbrough...

Index(['ArticleId', 'Text'], dtype='object')


In [47]:
# Pre-processing: turn the raw article text into a TF-IDF matrix.
vectorizer = TfidfVectorizer(
    stop_words='english',  # use scikit-learn's built-in English stop-word list
    min_df=2,              # ignore terms that occur in fewer than 2 documents
    max_df=0.8,            # ignore terms that occur in more than 80% of documents
)
# Learn the vocabulary/IDF weights and vectorise the 'Text' column in one pass.
X = vectorizer.fit_transform(df['Text'])

In [41]:
# Topic modelling with NMF: 5 components, with a fixed seed so that repeated
# runs give the same result.
nmf_model = NMF(
    random_state=42,
    n_components=5,
)
# Fit on the TF-IDF matrix and keep the transformed data returned by the model.
nmf_topic_matrix = nmf_model.fit_transform(X)

In [42]:
# Collect the most important words of each NMF topic.
feature_names = vectorizer.get_feature_names_out()
topic_words = [
    # argsort() is ascending, so reversing it and taking the first ten entries
    # yields the ten highest-weighted vocabulary indices, strongest first.
    [feature_names[term_idx] for term_idx in weights.argsort()[::-1][:10]]
    for weights in nmf_model.components_
]

In [43]:
# Print the most important words of each topic, numbering topics from 1.
for topic_number, top_terms in enumerate(topic_words, start=1):
    joined_terms = ", ".join(top_terms)
    print(f'Tópico {topic_number}: {joined_terms}')

Tópico 1: game, england, rugby, win, play, wales, match, players, cup, team
Tópico 2: mr, labour, blair, election, brown, party, howard, chancellor, government, tory
Tópico 3: economy, growth, bank, year, prices, sales, economic, oil, 2004, china
Tópico 4: people, music, mobile, technology, digital, users, broadband, games, phone, computer
Tópico 5: film, best, award, awards, festival, oscar, prize, year, films, actress


In [44]:
# Topic modelling with truncated SVD on the same TF-IDF matrix: 5 components,
# with a fixed seed so that repeated runs give the same result.
svd_model = TruncatedSVD(
    random_state=42,
    n_components=5,
)
# Fit on the TF-IDF matrix and keep the reduced representation it returns.
svd_topic_matrix = svd_model.fit_transform(X)

In [45]:
# Collect the most important words of each SVD component.
# NOTE: this rebinds `topic_words`, so the word lists computed earlier for the
# NMF model are no longer reachable under that name after this cell runs.
# Uses `feature_names` obtained from the vectorizer in an earlier cell.
topic_words = [
    # Rank vocabulary indices by component weight, largest value first, and
    # keep the first ten.
    [feature_names[term_idx] for term_idx in component.argsort()[::-1][:10]]
    for component in svd_model.components_
]

In [46]:
# Print the most important words of each topic, numbering topics from 1.
for position, top_terms in enumerate(topic_words):
    label = position + 1
    print(f'Tópico {label}: {", ".join(top_terms)}')

Tópico 1: mr, people, year, labour, government, new, election, blair, brown, party
Tópico 2: mr, labour, blair, election, brown, party, howard, chancellor, tory, government
Tópico 3: economy, growth, sales, bank, prices, market, oil, company, china, economic
Tópico 4: music, people, technology, mobile, digital, users, tv, film, phone, apple
Tópico 5: film, award, best, awards, festival, oscar, films, prize, actress, aviator
