In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
# Load the spaCy pipeline named "en_core_web_sm" once at module level.
nlp = spacy.load("en_core_web_sm")


# Hand-written sample corpus: one short paragraph per sport.
# Adjacent string literals are concatenated by Python, so each entry is
# still a single string; the split only keeps source lines short.
doc = [
    (
        "Hockey, the thrilling game of speed and skill, unites fans with its "
        "electrifying goals and fierce competition. From icy rinks to roaring "
        "arenas, it’s a sport of pure adrenaline!"
    ),
    (
        "Cricket, the gentleman's game, blends strategy and skill in every "
        "swing of the bat. From thrilling chases to stunning wickets, it "
        "unites fans worldwide."
    ),
]

In [7]:
# Show up to 100 characters per cell so the long sentences are not cut too early.
pd.set_option("display.max_colwidth", 100)

# One row per sample document, held in a single "text" column.
data = pd.DataFrame(doc, columns=["text"])
data

Unnamed: 0,text
0,"Hockey, the thrilling game of speed and skill, unites fans with its electrifying goals and fierc..."
1,"Cricket, the gentleman's game, blends strategy and skill in every swing of the bat. From thrilli..."


In [12]:
def preprocessing(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level ``nlp`` pipeline. Tokens that
    are stopwords or are not purely alphabetic are dropped; the lemma of
    every remaining token is kept, in original order.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    kept_lemmas = []
    for token in nlp(text):
        # Skip stopwords and anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

In [13]:
# Clean every document with preprocessing() and store the result next to the raw text.
raw_text = data["text"]
data["preprocess_text"] = raw_text.map(preprocessing)
data

Unnamed: 0,text,preprocess_text
0,"Hockey, the thrilling game of speed and skill, unites fans with its electrifying goals and fierc...",Hockey thrilling game speed skill unite fan electrify goal fierce competition icy rink roar aren...
1,"Cricket, the gentleman's game, blends strategy and skill in every swing of the bat. From thrilli...",Cricket gentleman game blend strategy skill swing bat thrill chase stunning wicket unite fan wor...


# Create the TF-IDF vectorizer and fit the LDA model

In [21]:
# Number of topics to extract and the seed that makes the fit repeatable.
N_TOPICS = 2
RANDOM_STATE = 42

# Create the TF-IDF vectorizer.
vector = TfidfVectorizer()

# Transform the preprocessed documents into a document-term TF-IDF matrix.
X = vector.fit_transform(data["preprocess_text"])

# NOTE(review): scikit-learn's own LDA example feeds the model raw term counts
# (CountVectorizer) rather than TF-IDF weights. TF-IDF is kept here to match
# this notebook's stated approach -- confirm that is intended.

# Create the Latent Dirichlet Allocation model.
# LDA's fit is randomised; without a fixed random_state the topics (and the
# weights printed below) change on every Restart & Run All.
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_STATE)

# Fit the model to the TF-IDF matrix.
lda.fit(X)

# Print the raw topic-word weights: one row per topic, one column per
# vocabulary term.
print(lda.components_)

[[0.74349176 0.74349176 0.50653084 0.50653084 0.50653084 0.74349176
  0.50653084 0.74349176 0.67764657 0.74349176 0.67764657 0.50653084
  0.74349176 0.74349176 0.74349176 0.74349176 0.74349176 0.74349176
  0.67764657 0.74349176 0.74349176 0.50653084 0.50653084 0.50653084
  0.50653084 0.74349176 0.67764657 0.50653084 0.50653084]
 [0.50631335 0.50631335 0.77055322 0.77055322 0.77055322 0.50631335
  0.77055322 0.50631335 0.69723941 0.50631335 0.69723941 0.77055322
  0.50631335 0.50631335 0.50631335 0.50631335 0.50631335 0.50631335
  0.69723941 0.50631335 0.50631335 0.77055322 0.77055322 0.77055322
  0.77055322 0.50631335 0.69723941 0.77055322 0.77055322]]


# Print the topics and their associated words

In [27]:
# Number of highest-weighted words to list for each topic.
N_TOP_WORDS = 5

# The vocabulary is the same for every topic, so look it up once instead of
# once per printed word.
feature_names = vector.get_feature_names_out()

for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx + 1}")
    # argsort() is ascending: take the last N indices, then reverse them so
    # the words come out in descending order of weight.
    top_words_idx = topic.argsort()[-N_TOP_WORDS:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(", ".join(top_words))

Topic 1
adrenaline, competition, roar, thrilling, rink
Topic 2
strategy, gentleman, swing, stunning, bat
