In [7]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk import MWETokenizer, WordNetLemmatizer
from nltk.corpus import wordnet as wn, stopwords
import gensim
from gensim import corpora

In [8]:
# Shared NLP resources, created once and reused by every preprocessing() call.
lemmatizer = WordNetLemmatizer()
# Held as a set so that the per-token stop-word membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocessing(text):
    """Reduce a raw text to a list of lemmatized noun tokens.

    Steps, in order: punctuation is replaced by spaces, the text is
    lower-cased, split on whitespace and POS-tagged; only tokens tagged
    NN / NNS / NNP / NNPS are kept, each is lemmatized with the module-level
    ``lemmatizer``, and lemmas found in the module-level ``stop_words`` set
    are discarded.

    Args:
        text: the raw document as a single string.

    Returns:
        A list of strings (lemmas), in their original order of appearance.
    """
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

    # Every character that is neither a word character nor whitespace becomes a space.
    cleaned = re.sub(r'[^\w\s]', ' ', text).lower()

    # str.split() acts as the tokenizer; pos_tag yields (token, tag) pairs.
    # NOTE(review): tagging runs on already lower-cased text — confirm this is
    # intended, since casing may influence the tagger's proper-noun decisions.
    tagged_tokens = nltk.pos_tag(cleaned.split())

    nouns = []
    for word, tag in tagged_tokens:
        if tag not in noun_tags:
            continue  # keep nouns only
        lemma = lemmatizer.lemmatize(word)
        if lemma in stop_words:
            continue  # stop words are checked after lemmatization
        nouns.append(lemma)
    return nouns

In [9]:
# Directories holding the raw corpus, one sub-folder per category.
# Built with os.path.join so the same notebook works on Windows and POSIX
# (on Windows the resulting strings are identical to the former literals).
paths = [
    os.path.join("documents", "bbc", "entertainment"),
    os.path.join("documents", "bbcsport", "athletics"),
    os.path.join("documents", "bbcsport", "cricket"),
    os.path.join("documents", "bbcsport", "football"),
    os.path.join("documents", "bbcsport", "rugby"),
    os.path.join("documents", "bbcsport", "tennis"),
]

# One entry per file: the list of tokens returned by preprocessing().
documents = []

for path in paths:
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and therefore any later seeded split) reproducible.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories and other non-regular entries
        # The context manager closes the handle even if reading or
        # preprocessing raises; previously every file was left open.
        with open(file_path, "r", encoding="utf-8") as file:
            document = preprocessing(file.read())
        documents.append(document)

In [10]:
# Hold out 10% of the documents as test_documents; the remaining ones form
# training_documents. `documents` itself is left unmodified.
#
# The previous implementation drew indices with
# np.random.randint(0, len(training_documents) - 1); randint's upper bound is
# exclusive, so the last remaining document could never be selected, and the
# call raised ValueError once only one document was left. A seeded permutation
# gives every document the same chance and makes the split reproducible.
test_fraction = 0.1
split_seed = 42  # fixed seed: the same split on every Restart & Run All

split_rng = np.random.default_rng(split_seed)
n_test = int(len(documents) * test_fraction)

# The first n_test positions of a random permutation are distinct, uniformly
# drawn indices (an empty `documents` list simply yields an empty selection).
test_indices = [int(i) for i in split_rng.permutation(len(documents))[:n_test]]
held_out = set(test_indices)

test_documents = [documents[i] for i in test_indices]
# Training documents keep their original relative order.
training_documents = [doc for i, doc in enumerate(documents) if i not in held_out]

print("Numero di documenti di training: " + str(len(training_documents)))
print("Numero di documenti di test: " + str(len(test_documents)))
print("Numero di documenti totali: " + str(len(documents)))

Numero di documenti di training: 779
Numero di documenti di test: 86
Numero di documenti totali: 865


In [11]:
# Build the vocabulary from the training documents only.
dictionary = corpora.Dictionary(training_documents)
# Prune the vocabulary: drop tokens found in fewer than 5 documents or in more
# than 30% of them; keep_n=None puts no cap on the number of surviving tokens.
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)

# Bag-of-words representation of every training document.
corpus = [dictionary.doc2bow(doc) for doc in training_documents]

# Train the LDA model.
k = 15  # number of topics to identify
# random_state pins the model's random initialisation so the topics printed
# below are the same on every run (it was previously unseeded).
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=k,
    id2word=dictionary,
    random_state=42,
)

# Show the 10 highest-weighted words of each of the k topics.
for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=k, num_words=10):
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.019*"england" + 0.016*"test" + 0.010*"club" + 0.007*"woman" + 0.006*"kenteris" + 0.006*"week" + 0.006*"sport" + 0.006*"iaaf" + 0.006*"month" + 0.006*"chelsea"
Topic 1: 0.012*"film" + 0.010*"club" + 0.010*"manager" + 0.007*"man" + 0.007*"cup" + 0.007*"minute" + 0.006*"half" + 0.006*"rugby" + 0.005*"star" + 0.005*"france"
Topic 2: 0.012*"club" + 0.012*"cup" + 0.011*"football" + 0.011*"number" + 0.008*"people" + 0.007*"action" + 0.007*"month" + 0.007*"bos" + 0.006*"thing" + 0.006*"jones"
Topic 3: 0.009*"jones" + 0.008*"minute" + 0.007*"way" + 0.007*"test" + 0.007*"break" + 0.006*"cricket" + 0.006*"england" + 0.006*"point" + 0.006*"win" + 0.006*"wale"
Topic 4: 0.010*"minute" + 0.008*"ball" + 0.008*"england" + 0.008*"chance" + 0.007*"champion" + 0.007*"goal" + 0.006*"home" + 0.005*"liverpool" + 0.005*"film" + 0.005*"point"
Topic 5: 0.023*"england" + 0.010*"injury" + 0.010*"wale" + 0.009*"goal" + 0.009*"nation" + 0.008*"robinson" + 0.007*"club" + 0.007*"music" + 0.007*"cup" + 0.00

In [12]:
# Topic inference on held-out data.
# NOTE(review): all test documents are concatenated into ONE token list, so the
# model scores a single merged pseudo-document rather than each test document
# separately — confirm this is the intended evaluation.
flattened_test_documents = []
for test_doc in test_documents:
    flattened_test_documents.extend(test_doc)

# Convert the merged tokens to bag-of-words using the training dictionary,
# then ask the trained model for the topic mixture of that bag.
new_bow = dictionary.doc2bow(flattened_test_documents)
topic_distribution = lda_model.get_document_topics(new_bow)

print("Topic distribution for new document:")
# Each entry is unpacked as a (topic id, probability) pair.
for topic_id, topic_prob in topic_distribution:
    print(f"Topic {topic_id}: {topic_prob}")


Topic distribution for new document:
Topic 0: 0.08125948160886765
Topic 1: 0.04164519160985947
Topic 4: 0.05920872837305069
Topic 5: 0.048244908452034
Topic 6: 0.01747971773147583
Topic 7: 0.02704409882426262
Topic 8: 0.024588219821453094
Topic 9: 0.03239960968494415
Topic 10: 0.03977017104625702
Topic 12: 0.045488741248846054
Topic 13: 0.3618888258934021
Topic 14: 0.21740493178367615
