# In [28]:
import wikipedia
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# List of topics you want to fetch from Wikipedia
topics = ["Python (programming language)", "Data science", "Football", "Basketball", "Machine learning"]

# Fetch content for each topic from Wikipedia.
# corpus is kept index-aligned with topics: a topic whose fetch fails contributes
# an empty string rather than being dropped, so document N always corresponds
# to topics[N - 1].
corpus = []
for topic in topics:
    content = ""  # placeholder used when the page cannot be retrieved
    try:
        content = wikipedia.page(topic).content  # Full plain-text content of the page
    except wikipedia.exceptions.DisambiguationError as e:
        # The title matches several pages; e.options lists the candidates.
        print(f"Disambiguation error for {topic}: {e.options}")
    except wikipedia.exceptions.PageError:
        # No page matched the title; previously this propagated and aborted the run.
        print(f"No Wikipedia page found for {topic}.")
    except wikipedia.exceptions.WikipediaException as e:
        # NOTE(review): the original caught wikipedia.exceptions.HTTPError, which the
        # `wikipedia` package does not appear to define (it provides HTTPTimeoutError);
        # the package's base exception class is caught instead - confirm against the
        # installed version.
        print(f"Wikipedia error when fetching {topic}: {e}")
    corpus.append(content)

# Raw term counts for the corpus.
# Each row of the resulting frame is one document (in corpus order) and each
# column is one vocabulary term learned by the vectorizer.
vectorizer = CountVectorizer()
raw_frequency_matrix = vectorizer.fit_transform(corpus).toarray()  # sparse result -> dense array
raw_frequency_df = pd.DataFrame(
    data=raw_frequency_matrix,
    columns=vectorizer.get_feature_names_out(),
)
print("Term-Document Matrix (Raw Frequency):", raw_frequency_df, sep="\n")

# TF-IDF weighted counterpart of the raw count matrix.
# Rows are documents (in corpus order); columns are the vocabulary terms
# learned by the TF-IDF vectorizer.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).toarray()  # sparse result -> dense array
tfidf_df = pd.DataFrame(
    data=tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\nTerm-Document Matrix (TF-IDF):", tfidf_df, sep="\n")

# Pairwise cosine similarity between the TF-IDF document vectors.
# Entry [i, j] of cosine_sim compares document i with document j.
cosine_sim = cosine_similarity(tfidf_matrix)

# Label both axes Doc1..DocN, following the order of corpus.
doc_labels = [f"Doc{number}" for number in range(1, len(corpus) + 1)]
cosine_sim_df = pd.DataFrame(cosine_sim, index=doc_labels, columns=doc_labels)
print("\nCosine Similarity Matrix:", cosine_sim_df, sep="\n")

# Find most similar documents.
# Only entries strictly above the diagonal are candidates: this skips each
# document's similarity with itself and considers every pair exactly once.
pair_rows, pair_cols = np.triu_indices(cosine_sim.shape[0], k=1)

# argmax returns a position within the flattened list of upper-triangle values,
# so it must be mapped back through the triu index arrays. Unravelling it
# against the full matrix shape (as before) yields unrelated coordinates,
# e.g. position 0 became (0, 0) and was reported as "Doc1 and Doc1".
best_pair = np.argmax(cosine_sim[pair_rows, pair_cols])
most_similar_docs = (pair_rows[best_pair], pair_cols[best_pair])
print(f"\nMost similar documents are Doc{most_similar_docs[0] + 1} and Doc{most_similar_docs[1] + 1}")


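# Output of the cell above:
# ModuleNotFoundError: No module named 'wikipedia'
# (the third-party `wikipedia` package must be installed, e.g. `pip install wikipedia`, before this cell can run)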