In [2]:
import wikipedia
from term_frequency import compute_tf
from tf_idf import compute_idf, compute_tfidf
from cosine_similarity import cosine_similarity

# Define the topics
topics = [
    "Blockchain",
    "Social Media",
    "Virtual Reality",
    "Neuroscience",
    "Climate Change",  
]

# Fetch Wikipedia content 
documents = []
for topic in topics:
    try:
        page = wikipedia.page(topic, auto_suggest=False)
        content = page.content
        words = content.split()[:50]  
        limited_content = ' '.join(words)
        documents.append(limited_content)
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"DisambiguationError for {topic}: {e.options}")
        documents.append("")  
    except wikipedia.exceptions.PageError:
        print(f"PageError: {topic} not found")
        documents.append("")  


print("Documents:")
for i, doc in enumerate(documents):
    print(f"{doc} # document {i+1} ({topics[i]})")

# Tokenize and apply lowercase
tokenized_docs = [[word.lower() for word in doc.split()] for doc in documents]

# Create a set of unique words (vocabulary)
vocabulary = set(word for doc in tokenized_docs for word in doc)

# Display tokens per document and count total tokens
total_tokens = 0
print("\nTokens per Document:")
for i, doc_tokens in enumerate(tokenized_docs):
    print(f"Document {i+1} Tokens ({topics[i]}):")
    print(doc_tokens)
    print(f"Token count: {len(doc_tokens)}\n")
    total_tokens += len(doc_tokens)

print(f"Total number of tokens across all documents: {total_tokens}")




Documents:
A blockchain is a distributed ledger with growing lists of records (blocks) that are securely linked together via cryptographic hashes. Each block contains a cryptographic hash of the previous block, a timestamp, and transaction data (generally represented as a Merkle tree, where data nodes are represented by leaves). Since each # document 1 (Blockchain)
Social media are interactive technologies that facilitate the creation, sharing and aggregation of content (such as ideas, interests, and other forms of expression) amongst virtual communities and networks. Common features include: Online platforms that enable users to create and share content and participate in social networking. User-generated content—such as text # document 2 (Social Media)
Virtual reality (VR) is a simulated experience that employs 3D near-eye displays and pose tracking to give the user an immersive feel of a virtual world. Applications of virtual reality include entertainment (particularly video games),

In [3]:
# Compute the term frequency for each document
tf_vectors = [compute_tf(doc, vocabulary) for doc in tokenized_docs]

# Print TF vectors
print("\nTerm Frequency Vectors:")
for i, tf_vector in enumerate(tf_vectors):
    print(f"Document {i+1}: {tf_vector}")



Term Frequency Vectors:
Document 1: {'aggregation': 0.0, 'securely': 0.02, 'previous': 0.02, 'nodes': 0.02, 'training)': 0.0, 'change': 0.0, 'meetings).': 0.0, 'technologies': 0.0, 'immersive': 0.0, 'statistics,': 0.0, 'distributed': 0.02, 'increase': 0.0, 'present-day': 0.0, 'interests,': 0.0, 'that': 0.02, 'pose': 0.0, 'both': 0.0, 'forms': 0.0, 'networking.': 0.0, 'growing': 0.02, 'effects': 0.0, 'leaves).': 0.02, 'understand': 0.0, 'communities': 0.0, 'medicine,': 0.0, 'linked': 0.02, 'by': 0.02, 'users': 0.0, 'also': 0.0, '(particularly': 0.0, 'cryptographic': 0.04, 'includes': 0.0, 'lists': 0.02, 'modeling': 0.0, 'data': 0.04, 'transaction': 0.02, 'user': 0.0, 'or': 0.0, 'warming—the': 0.0, 'are': 0.04, 'near-eye': 0.0, 'functions,': 0.0, 'feel': 0.0, 'user-generated': 0.0, 'temperatures': 0.0, 'blockchain': 0.02, 'medical,': 0.0, 'amongst': 0.0, 'entertainment': 0.0, 'games),': 0.0, 'of': 0.04, 'education': 0.0, 'an': 0.0, 'ledger': 0.02, 'tree,': 0.02, 'world.': 0.0, '(general

In [4]:
# Compute the Inverse Document Frequency (IDF)
idf = compute_idf(tokenized_docs, vocabulary)
print("\nInverse Document Frequency:")
for term, idf_value in idf.items():
    print(f"{term}: {idf_value}")

# Compute TF-IDF vectors
tfidf_vectors = [compute_tfidf(tf, idf, vocabulary) for tf in tf_vectors]
print("\nTF-IDF Vectors:")
for i, tfidf_vector in enumerate(tfidf_vectors):
    print(f"Document {i+1}: {tfidf_vector}")


Inverse Document Frequency:
aggregation: 1.6094379124341003
securely: 1.6094379124341003
previous: 0.9162907318741551
nodes: 1.6094379124341003
training): 1.6094379124341003
change: 1.6094379124341003
meetings).: 1.6094379124341003
technologies: 1.6094379124341003
immersive: 1.6094379124341003
statistics,: 1.6094379124341003
distributed: 1.6094379124341003
increase: 1.6094379124341003
present-day: 1.6094379124341003
interests,: 1.6094379124341003
that: 0.22314355131420976
pose: 1.6094379124341003
both: 1.6094379124341003
forms: 1.6094379124341003
networking.: 1.6094379124341003
growing: 1.6094379124341003
effects: 1.6094379124341003
leaves).: 1.6094379124341003
understand: 1.6094379124341003
communities: 1.6094379124341003
medicine,: 1.6094379124341003
linked: 1.6094379124341003
by: 0.9162907318741551
users: 1.6094379124341003
also: 1.6094379124341003
(particularly: 1.6094379124341003
cryptographic: 1.6094379124341003
includes: 1.6094379124341003
lists: 1.6094379124341003
modeling: 1.

In [5]:
# Compute cosine similarity for all document pairs
print("\nCosine Similarity Between All Document Pairs:")
max_similarity = -1
most_similar_pair = (0, 0)
for i in range(len(documents)):
    for j in range(i + 1, len(documents)):
        similarity = cosine_similarity(tfidf_vectors[i], tfidf_vectors[j], vocabulary)
        print(f"Similarity between Document {i+1} ({topics[i]}) and Document {j+1} ({topics[j]}): {similarity:.4f}")
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pair = (i, j)

# Print the most similar pair
print(f"\nMost Similar Documents:")
print(f"Document {most_similar_pair[0]+1} ({topics[most_similar_pair[0]]}) and Document {most_similar_pair[1]+1} ({topics[most_similar_pair[1]]})")
print(f"Cosine Similarity: {max_similarity:.4f}")


Cosine Similarity Between All Document Pairs:
Similarity between Document 1 (Blockchain) and Document 2 (Social Media): 0.0255
Similarity between Document 1 (Blockchain) and Document 3 (Virtual Reality): 0.0129
Similarity between Document 1 (Blockchain) and Document 4 (Neuroscience): 0.0058
Similarity between Document 1 (Blockchain) and Document 5 (Climate Change): 0.0213
Similarity between Document 2 (Social Media) and Document 3 (Virtual Reality): 0.0664
Similarity between Document 2 (Social Media) and Document 4 (Neuroscience): 0.0095
Similarity between Document 2 (Social Media) and Document 5 (Climate Change): 0.0205
Similarity between Document 3 (Virtual Reality) and Document 4 (Neuroscience): 0.0065
Similarity between Document 3 (Virtual Reality) and Document 5 (Climate Change): 0.0016
Similarity between Document 4 (Neuroscience) and Document 5 (Climate Change): 0.0146

Most Similar Documents:
Document 2 (Social Media) and Document 3 (Virtual Reality)
Cosine Similarity: 0.0664


In [None]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, classification_report
import warnings


warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.linear_model._logistic")

# Step 1: Train Word2Vec model
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=20,      
    window=3,            
    min_count=1,         
    sg=1,                
    epochs=10            
)

# Step 2: Create document vectors (average of word vectors)
def document_vector(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Generate document vectors
X = np.array([document_vector(doc, word2vec_model) for doc in tokenized_docs])

# Step 3: Assign labels (0 to 4 for 5 topics)
y = np.array(range(len(topics)))  # [0, 1, 2, 3, 4]

# Step 4: Logistic Regression with Leave-One-Out Cross-Validation
classifier = LogisticRegression(
    class_weight='balanced',    
    solver='lbfgs',             
    max_iter=1000,              
    C=0.5,                      
    random_state=42             
)

# Note: Results are based on a very small dataset (5 samples), limiting generalization. LOOCV is used due to insufficient samples for train-test split."
loo = LeaveOneOut()
y_pred = []
y_true = []
for train_idx, test_idx in loo.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    try:
        classifier.fit(X_train, y_train)
        y_pred.append(classifier.predict(X_test)[0])
        y_true.append(y_test[0])
    except Exception as e:
        print(f"Error in LOOCV fold: {e}")
        y_pred.append(-1)  
        y_true.append(y_test[0])

# Step 5: Evaluation
accuracy = accuracy_score(y_true, y_pred)
label_to_topic = {i: topics[i] for i in range(len(topics))}

# Print LOOCV results
print("\nLeave-One-Out Cross-Validation Results:")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=topics, zero_division=0))

# Predict on full dataset for document classifications
classifier.fit(X, y)  
y_pred_full = classifier.predict(X)

# Print document vectors 
print("\nDocument Vectors (first 5 dimensions):")
for i, vec in enumerate(X):
    first_5 = [round(val, 4) for val in vec[:5]]
    print(f"Document {i+1} ({topics[i]}): {first_5}")

# Print document classifications 
print("\nDocument Classifications (Full Dataset Predictions):")
for i, (topic, pred) in enumerate(zip(topics, y_pred_full)):
    print(f"Document {i+1} ({topic}): Predicted as {topics[pred]}")



Leave-One-Out Cross-Validation Results:

Classification Report:
                 precision    recall  f1-score   support

     Blockchain       0.00      0.00      0.00       1.0
   Social Media       0.00      0.00      0.00       1.0
Virtual Reality       0.00      0.00      0.00       1.0
   Neuroscience       0.00      0.00      0.00       1.0
 Climate Change       0.00      0.00      0.00       1.0

       accuracy                           0.00       5.0
      macro avg       0.00      0.00      0.00       5.0
   weighted avg       0.00      0.00      0.00       5.0


Document Vectors (first 5 dimensions):
Document 1 (Blockchain): [-0.0015, 0.0058, 0.0085, 0.0067, 0.0028]
Document 2 (Social Media): [-0.0041, 0.0007, 0.0093, 0.0025, -0.0023]
Document 3 (Virtual Reality): [-0.0018, -0.0079, 0.0019, 0.0092, -0.0031]
Document 4 (Neuroscience): [-0.0035, -0.0038, 0.0059, 0.0049, -0.0027]
Document 5 (Climate Change): [-0.0043, -0.0025, 0.0044, 0.0065, 0.0012]

Document Classifications