In [51]:
import wikipedia
import math
from collections import defaultdict

# Function definitions to match main.py's external modules
def compute_tf(doc, vocab):
    """Return the relative frequency of every vocabulary term in one document.

    Args:
        doc: Sequence of tokens making up the document.
        vocab: Iterable of terms to report on.

    Returns:
        A dict with one entry per term in ``vocab``. Terms that occur in
        ``doc`` map to ``occurrences / len(doc)``; all other terms map to 0.
    """
    total_tokens = len(doc)

    # Tally only the tokens that belong to the vocabulary.
    occurrences = {}
    for token in doc:
        if token in vocab:
            occurrences[token] = occurrences.get(token, 0.0) + 1

    # A term can only be tallied when the document is non-empty, so the
    # division below never sees total_tokens == 0.
    frequencies = {}
    for term in vocab:
        if term in occurrences:
            frequencies[term] = occurrences[term] / total_tokens
        else:
            frequencies[term] = 0
    return frequencies

def compute_idf(docs, vocab):
    """Compute a smoothed inverse document frequency for every vocabulary term.

    The weight of a term is ``log(N / (df + 1))`` where ``N`` is the number of
    documents and ``df`` is the number of documents containing the term.

    Args:
        docs: Sequence of documents; each document must support ``term in doc``
            (e.g. a list of tokens).
        vocab: Iterable of terms to score.

    Returns:
        A dict mapping each term in ``vocab`` to its IDF weight. For an empty
        corpus every term maps to 0.0.

    Note:
        Because of the ``+ 1`` in the denominator, a term found in ``N - 1``
        documents gets weight 0 and a term found in every document gets a
        negative weight.
    """
    n_docs = len(docs)
    if n_docs == 0:
        # math.log(0 / 1) would raise "ValueError: math domain error";
        # with no documents there is no information, so report 0.0.
        return {word: 0.0 for word in vocab}

    idf = {}
    for word in vocab:
        docs_with_word = sum(1 for doc in docs if word in doc)
        idf[word] = math.log(n_docs / (docs_with_word + 1))  # +1 smooths the document frequency
    return idf

def compute_tfidf(tf, idf, vocab):
    """Combine term-frequency and IDF weights into a TF-IDF vector.

    Args:
        tf: Mapping of term -> term frequency for one document.
        idf: Mapping of term -> inverse document frequency.
        vocab: Iterable of terms; each must be a key of both ``tf`` and ``idf``.

    Returns:
        A dict mapping every term in ``vocab`` to ``tf[term] * idf[term]``.
    """
    return {term: tf[term] * idf[term] for term in vocab}

def cosine_similarity(vec1, vec2, vocab):
    """Cosine of the angle between two sparse vectors stored as dicts.

    Args:
        vec1: Mapping of term -> weight for the first vector.
        vec2: Mapping of term -> weight for the second vector.
        vocab: Iterable of terms over which the dot product is taken; each
            term must be a key of both vectors.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or 0.0 when either vector has
        zero length. The lengths are computed over all values of each dict.
    """
    def _length(vector):
        # Euclidean length over every stored component of the vector.
        return math.sqrt(sum(component ** 2 for component in vector.values()))

    overlap = sum(vec1[term] * vec2[term] for term in vocab)
    length_a = _length(vec1)
    length_b = _length(vec2)

    if length_a != 0 and length_b != 0:
        return overlap / (length_a * length_b)
    return 0.0

# Topics whose Wikipedia articles make up the toy corpus
topics = [
    "Blockchain",
    "Artificial Intelligence",
    "Virtual Reality",
    "Neuroscience",
    "Quantum Computing",  # Replaced duplicate "Virtual Reality" with a unique topic
]

# Build one short document per topic: the first 5 whitespace-separated words
# of the article. A topic whose page cannot be resolved contributes an empty
# document so that `documents` stays index-aligned with `topics`.
documents = []
for topic in topics:
    try:
        content = wikipedia.page(topic, auto_suggest=False).content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"DisambiguationError for {topic}: {e.options}")
        limited_content = ""
    except wikipedia.exceptions.PageError:
        print(f"PageError: {topic} not found")
        limited_content = ""
    else:
        limited_content = ' '.join(content.split()[:5])
    documents.append(limited_content)

# Show each document next to its 1-based position and topic
print("Documents:")
for doc_number, (doc_text, topic_name) in enumerate(zip(documents, topics), start=1):
    print(f"{doc_text} # document {doc_number} ({topic_name})")

# Split on whitespace and lowercase every token
tokenized_docs = [list(map(str.lower, doc_text.split())) for doc_text in documents]

# Vocabulary: every distinct token seen anywhere in the corpus
vocabulary = {token for tokens in tokenized_docs for token in tokens}

# One term-frequency vector per document, defined over the full vocabulary
tf_vectors = [compute_tf(tokens, vocabulary) for tokens in tokenized_docs]

# Show the TF vectors
print("\nTerm Frequency Vectors:")
for doc_number, tf_vector in enumerate(tf_vectors, start=1):
    print(f"Document {doc_number}: {tf_vector}")



Documents:
A blockchain is a distributed # document 1 (Blockchain)
Artificial intelligence (AI) refers to # document 2 (Artificial Intelligence)
Virtual reality (VR) is a # document 3 (Virtual Reality)
Neuroscience is the scientific study # document 4 (Neuroscience)
A quantum computer is a # document 5 (Quantum Computing)

Term Frequency Vectors:
Document 1: {'virtual': 0, 'study': 0, 'computer': 0, 'artificial': 0, 'distributed': 0.2, '(ai)': 0, '(vr)': 0, 'the': 0, 'reality': 0, 'refers': 0, 'neuroscience': 0, 'scientific': 0, 'quantum': 0, 'to': 0, 'is': 0.2, 'intelligence': 0, 'blockchain': 0.2, 'a': 0.4}
Document 2: {'virtual': 0, 'study': 0, 'computer': 0, 'artificial': 0.2, 'distributed': 0, '(ai)': 0.2, '(vr)': 0, 'the': 0, 'reality': 0, 'refers': 0.2, 'neuroscience': 0, 'scientific': 0, 'quantum': 0, 'to': 0.2, 'is': 0, 'intelligence': 0.2, 'blockchain': 0, 'a': 0}
Document 3: {'virtual': 0.2, 'study': 0, 'computer': 0, 'artificial': 0, 'distributed': 0, '(ai)': 0, '(vr)': 0.2

In [52]:
# Inverse Document Frequency of every vocabulary term across the corpus
idf = compute_idf(tokenized_docs, vocabulary)
print("\nInverse Document Frequency:")
for term in idf:
    print(f"{term}: {idf[term]}")

# Weight each document's TF vector by the corpus-wide IDF values
tfidf_vectors = [compute_tfidf(doc_tf, idf, vocabulary) for doc_tf in tf_vectors]
print("\nTF-IDF Vectors:")
for doc_number, weights in enumerate(tfidf_vectors, start=1):
    print(f"Document {doc_number}: {weights}")


Inverse Document Frequency:
virtual: 0.9162907318741551
study: 0.9162907318741551
computer: 0.9162907318741551
artificial: 0.9162907318741551
distributed: 0.9162907318741551
(ai): 0.9162907318741551
(vr): 0.9162907318741551
the: 0.9162907318741551
reality: 0.9162907318741551
refers: 0.9162907318741551
neuroscience: 0.9162907318741551
scientific: 0.9162907318741551
quantum: 0.9162907318741551
to: 0.9162907318741551
is: 0.0
intelligence: 0.9162907318741551
blockchain: 0.9162907318741551
a: 0.22314355131420976

TF-IDF Vectors:
Document 1: {'virtual': 0.0, 'study': 0.0, 'computer': 0.0, 'artificial': 0.0, 'distributed': 0.18325814637483104, '(ai)': 0.0, '(vr)': 0.0, 'the': 0.0, 'reality': 0.0, 'refers': 0.0, 'neuroscience': 0.0, 'scientific': 0.0, 'quantum': 0.0, 'to': 0.0, 'is': 0.0, 'intelligence': 0.0, 'blockchain': 0.18325814637483104, 'a': 0.08925742052568392}
Document 2: {'virtual': 0.0, 'study': 0.0, 'computer': 0.0, 'artificial': 0.18325814637483104, 'distributed': 0.0, '(ai)': 0.

In [53]:
# Score every unordered pair of documents and remember the best-scoring one.
# On ties the earliest pair wins because the comparison below is strict.
print("\nCosine Similarity Between All Document Pairs:")
max_similarity = -1
most_similar_pair = (0, 0)

doc_count = len(documents)
index_pairs = [
    (first, second)
    for first in range(doc_count)
    for second in range(first + 1, doc_count)
]
for first, second in index_pairs:
    similarity = cosine_similarity(tfidf_vectors[first], tfidf_vectors[second], vocabulary)
    print(f"Similarity between Document {first+1} ({topics[first]}) and Document {second+1} ({topics[second]}): {similarity:.4f}")
    if similarity > max_similarity:
        max_similarity, most_similar_pair = similarity, (first, second)

# Report the winning pair using 1-based document numbers
best_a, best_b = most_similar_pair
print("\nMost Similar Documents:")
print(f"Document {best_a+1} ({topics[best_a]}) and Document {best_b+1} ({topics[best_b]})")
print(f"Cosine Similarity: {max_similarity:.4f}")


Cosine Similarity Between All Document Pairs:
Similarity between Document 1 (Blockchain) and Document 2 (Artificial Intelligence): 0.0000
Similarity between Document 1 (Blockchain) and Document 3 (Virtual Reality): 0.0453
Similarity between Document 1 (Blockchain) and Document 4 (Neuroscience): 0.0000
Similarity between Document 1 (Blockchain) and Document 5 (Quantum Computing): 0.1060
Similarity between Document 2 (Artificial Intelligence) and Document 3 (Virtual Reality): 0.0000
Similarity between Document 2 (Artificial Intelligence) and Document 4 (Neuroscience): 0.0000
Similarity between Document 2 (Artificial Intelligence) and Document 5 (Quantum Computing): 0.0000
Similarity between Document 3 (Virtual Reality) and Document 4 (Neuroscience): 0.0000
Similarity between Document 3 (Virtual Reality) and Document 5 (Quantum Computing): 0.0453
Similarity between Document 4 (Neuroscience) and Document 5 (Quantum Computing): 0.0000

Most Similar Documents:
Document 1 (Blockchain) and Do

In [55]:
import wikipedia
import numpy as np
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, confusion_matrix
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")  # Suppress convergence warnings

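# Pin NumPy and gensim to binary-compatible versions to avoid the
# "numpy.dtype size changed" import error.
# Run: pip install numpy==1.26.4 gensim==4.3.3 wikipedia scikit-learn nltk
# (the `wikipedia` module imported above is provided by the `wikipedia` package,
# not by `wikipedia-api`, which installs a different module)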

# Download NLTK data (for tokenization)
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; requesting both keeps nltk.word_tokenize working on either.
# On a release that does not know the resource, download() only reports it.
nltk.download('punkt_tab')

# Define the topics and labels
topics = [
    "Blockchain",
    "Artificial Intelligence",
    "Virtual Reality",
    "Neuroscience",
    "Quantum Computing"
]
labels = list(range(len(topics)))  # Numeric labels: 0, 1, 2, 3, 4

# Fetch Wikipedia content (first 5 words per topic).
# A topic whose page cannot be resolved contributes an empty document so that
# `documents` stays index-aligned with `topics` and `labels`.
documents = []
for topic in topics:
    try:
        page = wikipedia.page(topic, auto_suggest=False)
        content = page.content
        words = content.split()[:5]
        limited_content = ' '.join(words)
        documents.append(limited_content)
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"DisambiguationError for {topic}: {e.options}")
        documents.append("")
    except wikipedia.exceptions.PageError:
        print(f"PageError: {topic} not found")
        documents.append("")

# Print documents
print("Documents:")
for i, doc in enumerate(documents):
    print(f"{doc} # document {i+1} ({topics[i]})")

# Tokenize documents (lowercased before tokenization)
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]

# Load pre-trained Word2Vec model (Mikolov et al.)
# Download from: https://code.google.com/archive/p/word2vec/
# File: GoogleNews-vectors-negative300.bin (~3.4 GB)
word2vec_path = "GoogleNews-vectors-negative300.bin"
try:
    word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    print("Loaded pre-trained Word2Vec model (Mikolov et al.).")
except FileNotFoundError:
    print(f"Word2Vec file {word2vec_path} not found. Please download from https://code.google.com/archive/p/word2vec/")
    print("Alternatively, use gensim.downloader:")
    print("import gensim.downloader as api; word2vec_model = api.load('word2vec-google-news-300')")
    # Raise SystemExit directly instead of calling the interactive `exit()`
    # helper, which is intended for the interactive shell rather than programs.
    raise SystemExit(1)
# Function to create document vectors by averaging Word2Vec embeddings
def document_vector(doc, model, embedding_dim=300):
    """Average the embeddings of a document's in-vocabulary tokens.

    Args:
        doc: Iterable of tokens.
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]``.
        embedding_dim: Length of the zero vector returned when no token of
            ``doc`` is found in ``model``.

    Returns:
        The element-wise mean of the embeddings of all tokens found in
        ``model``, or ``np.zeros(embedding_dim)`` when there are none.
    """
    found = []
    for token in doc:
        if token in model:
            found.append(model[token])

    # Empty document, or every token is out of vocabulary
    if len(found) == 0:
        return np.zeros(embedding_dim)
    return np.mean(found, axis=0)

# Create document vectors (one averaged embedding per document)
doc_vectors = [document_vector(doc, word2vec_model) for doc in tokenized_docs]

# Print document vectors (first 5 dimensions for brevity)
print("\nDocument Vectors (first 5 dimensions):")
for i, vec in enumerate(doc_vectors):
    print(f"Document {i+1} ({topics[i]}): {vec[:5]}...")

# Logistic Regression Configuration
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver and more than two classes the
# default already fits a multinomial model.
logistic_model = LogisticRegression(
    solver='lbfgs',  # Suitable for small datasets
    C=1.0,  # Regularization strength
    max_iter=1000,  # Ensure convergence
    random_state=42
)

# Evaluate using Leave-One-Out Cross-Validation
# NOTE: there is exactly one document per label, so the held-out label never
# occurs in the corresponding training fold and cannot be predicted. The
# accuracy reported below is therefore 0 by construction; a meaningful
# evaluation needs several documents per topic.
loo = LeaveOneOut()
predictions = []
true_labels = []

for train_index, test_index in loo.split(doc_vectors):
    X_train = [doc_vectors[i] for i in train_index]
    y_train = [labels[i] for i in train_index]
    X_test = [doc_vectors[i] for i in test_index]
    y_test = [labels[i] for i in test_index]
    logistic_model.fit(X_train, y_train)
    y_pred = logistic_model.predict(X_test)
    predictions.append(y_pred[0])
    true_labels.append(y_test[0])

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"\nLeave-One-Out Cross-Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
# Passing `labels` fixes the row/column order so that row i always
# corresponds to topics[i] in the table printed below.
conf_matrix = confusion_matrix(true_labels, predictions, labels=labels)
print("\nConfusion Matrix:")
print(f"{'':>20} {'Predicted':>30}")
print(f"{'':>20} {' '.join([f'{t[:5]:>5}' for t in topics])}")
for i, row in enumerate(conf_matrix):
    print(f"True {topics[i][:5]:>15} {' '.join([f'{val:>5}' for val in row])}")

# Sample prediction
# The model is refit on all documents first, so this predicts a document that
# is part of its own training data.
sample_doc = tokenized_docs[0]  # Blockchain
sample_vector = document_vector(sample_doc, word2vec_model)
predicted_label = logistic_model.fit(doc_vectors, labels).predict([sample_vector])[0]
print(f"\nSample Prediction for Document 1 ({topics[0]}):")
print(f"Predicted Topic: {topics[predicted_label]}")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject