In [None]:
import wikipedia
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# List of topics you want to fetch from Wikipedia
topics = ["Python (programming language)", "Data science", "Football", "Basketball", "Machine learning"]

# Fetch content for each topic from Wikipedia.
# A failed fetch still contributes an empty string, so position i of `corpus`
# always corresponds to topics[i] and later matrix rows stay aligned.
corpus = []
for topic in topics:
    try:
        # Page lookup and content access are both kept inside the try, as before.
        content = wikipedia.page(topic).content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error for {topic}: {e.options}")
        content = ""  # Empty document in case of ambiguity
    except wikipedia.exceptions.PageError:
        # Raised by the wikipedia package when no page matches the title.
        print(f"Page not found for {topic}.")
        content = ""  # Empty document when the page does not exist
    except wikipedia.exceptions.HTTPTimeoutError:
        # The wikipedia package documents HTTPTimeoutError; it has no
        # `exceptions.HTTPError`, so naming that attribute here would itself
        # raise AttributeError as soon as this clause was evaluated.
        print(f"HTTP Error when fetching {topic}.")
        content = ""  # Empty document in case of error
    corpus.append(content)

# 1. (20 points) Using Wikipedia as the corpus, obtain 5 different topics that will serve as your documents
#    and create a term-document matrix.
#    a. Term-document matrix using raw frequency.
# CountVectorizer learns the vocabulary from the corpus and counts each term per document.
vectorizer = CountVectorizer()
# Densify the sparse count matrix; one row per document, one column per vocabulary term.
raw_frequency_matrix = vectorizer.fit_transform(corpus).toarray()
# Label the columns with the learned terms for readable output.
raw_frequency_df = pd.DataFrame(
    data=raw_frequency_matrix,
    columns=vectorizer.get_feature_names_out(),
)
print("Term-Document Matrix (Raw Frequency):", raw_frequency_df, sep="\n")

# b. Term-document matrix using TF-IDF weights.
# TfidfVectorizer is fitted on the same corpus, producing weighted values instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
# Dense array of TF-IDF weights; one row per document, one column per vocabulary term.
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).toarray()
# Label the columns with the learned terms for readable output.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\nTerm-Document Matrix (TF-IDF):", tfidf_df, sep="\n")

# c. Cosine similarity between documents.
# Pairwise cosine similarity of the TF-IDF rows: entry [i, j] compares document i with document j.
cosine_sim = cosine_similarity(tfidf_matrix)
# The same Doc1..DocN labels are used for both axes of the square matrix.
doc_labels = [f"Doc{number}" for number in range(1, len(corpus) + 1)]
cosine_sim_df = pd.DataFrame(cosine_sim, index=doc_labels, columns=doc_labels)
print("\nCosine Similarity Matrix:", cosine_sim_df, sep="\n")

# d. Find most similar documents.
# Search only the strict upper triangle (k=1) so that the diagonal
# (each document compared with itself, always the maximum) and the mirrored
# lower-triangle duplicates are ignored.
upper_rows, upper_cols = np.triu_indices(len(corpus), k=1)
# argmax is a position within the flattened upper-triangle values, NOT within
# the full matrix, so it must be mapped back through the triangle's own
# row/column index arrays. Passing it to np.unravel_index with
# cosine_sim.shape would point at the wrong pair of documents.
best_position = np.argmax(cosine_sim[upper_rows, upper_cols])
most_similar_docs = (upper_rows[best_position], upper_cols[best_position])
print(f"\nMost similar documents are Doc{most_similar_docs[0] + 1} and Doc{most_similar_docs[1] + 1}")





# THESE ARE THE RESULTS I GOT WHEN RUNNING THIS ON A DIFFERENT COMPUTER
# Term-Document Matrix (Raw Frequency):
#    000  06681  10  100  100x  106  1098155438  11  110  1174  ...  yukihiro 
# 0    0      0   9    0     1    0           1  10    0     0  ...         1   
# 1    0      0   0    0     0    0           0   0    0     0  ...         0   
# 2    1      0   1    0     0    1           0   1    0     1  ...         0   
# 3    4      1   3    1     0    0           0   1    1     0  ...         0   
# 4    0      0   0    0     0    0           0   0    0     0  ...         0   

#    zealand  zen  zero  zone  zones  zope  φαινίνδα  ἐπίσκυρος  蹴鞠  
# 0        0    2     2     0      0     1         0          0   0  
# 1        0    0     0     0      0     0         0          0   0  
# 2        8    0     0     0      1     0         1          1   1  
# 3        0    0     0     4      0     0         0          0   0  
# 4        0    0     0     0      0     0         0          0   0  

# [5 rows x 5114 columns]

# Term-Document Matrix (TF-IDF):
#        000     06681        10       100      100x       106  1098155438  
# 0  0.000000  0.000000  0.014472  0.000000  0.002401  0.000000    0.002401   
# 1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
# 2  0.001192  0.000000  0.000989  0.000000  0.000000  0.001477    0.000000   
# 3  0.004824  0.001495  0.003003  0.001495  0.000000  0.000000    0.000000   
# 4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   
# ...
# Doc4  0.637752  0.430567  0.808665  1.000000   0.0
# Doc5  0.000000  0.000000  0.000000  0.000000   0.0

# Most similar documents are Doc2 and Doc3


ModuleNotFoundError: No module named 'wikipedia'