In [None]:
from langchain.retrievers import WikipediaRetriever
import re
from nltk.corpus import stopwords
from tqdm import tqdm

### Fetching documents from Wikipedia

In [5]:
retriever = WikipediaRetriever()

query = ["Politics", "Health", "Education", "Sports"]

In [33]:
def remove_special_characters(text):
    # Use regex to remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower() # lowercase
    # text = re.sub(r'\[[0-9]*\]',' ',text) 
    text = re.sub(r'\s+',' ',text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]','',text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

In [11]:
docs = []

for q in tqdm(query):
    doc = retriever.get_relevant_documents(query=q)[0]
    no_spec = remove_special_characters(doc.dict()["page_content"])
    docs.append(no_spec)

100%|██████████| 4/4 [00:36<00:00,  9.18s/it]


In [12]:
docs

['politics ancient greek politik affairs cities set activities associated making decisions groups forms power relations among individuals distribution resources status branch social science studies politics government referred political science may used positively context political solution compromising nonviolent descriptively art science government also often carries negative connotation concept defined various ways different approaches fundamentally differing views whether used extensively limited way empirically normatively whether conflict cooperation essential variety methods deployed politics include promoting ones political views among people negotiation political subjects making laws exercising internal external force including warfare adversaries politics exercised wide range social levels clans tribes traditional societies modern local governments companies institutions sovereign states international level modern nation states people often form political parties represent id

### Unigram Count Matrix

In [13]:
def create_unigram_count_matrix(corpus):
    tokens = [document.split() for document in corpus]

    vocabulary = list(set(word for document in tokens for word in document))

    count_matrix = [[0] * len(vocabulary) for _ in range(len(corpus))]

    for i, document in enumerate(tokens):
        for word in document:
            if word in vocabulary:
                count_matrix[i][vocabulary.index(word)] += 1

    return count_matrix, vocabulary

In [14]:
unigram_count_matrix, vocabulary = create_unigram_count_matrix(docs)

print("Unigram Count Matrix:")
for row in unigram_count_matrix:
    print(row)

print("\nVocabulary:")
print(vocabulary)

Unigram Count Matrix:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 3, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 12, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 3, 0, 1, 1, 1, 1, 1, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 1, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 3, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0

### Bigram Matrix

In [17]:
def create_bigram_count_matrix(corpus):
    # Tokenize the corpus into words
    tokens = [document.split() for document in corpus]

    bigrams = [tuple(tokens[i][j:j+2]) for i in range(len(tokens)) for j in range(len(tokens[i]) - 1)]

    vocabulary = list(set(bigrams))

    count_matrix = [[0] * len(vocabulary) for _ in range(len(corpus))]

    for i, document in enumerate(tokens):
        document_bigrams = [tuple(document[j:j+2]) for j in range(len(document) - 1)]
        for bigram in document_bigrams:
            if bigram in vocabulary:
                count_matrix[i][vocabulary.index(bigram)] += 1

    return count_matrix, vocabulary

In [18]:
bigram_count_matrix, vocabulary = create_bigram_count_matrix(docs)

print("Bigram Count Matrix:")
for row in bigram_count_matrix:
    print(row)

print(f"\nvocabulary of len : {len(vocabulary)}")
print(vocabulary)

Bigram Count Matrix:
[1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 

In [19]:
import numpy as np

def create_bigram_prob_matrix(count_matrix, vocabulary):

    prob_matrix = np.zeros((len(count_matrix), len(vocabulary)))
    
    # Calculate sum of counts per document
    doc_sums = np.sum(count_matrix, axis=1)
    
    # Calculate bigram probabilities
    for i in range(len(count_matrix)):
        for j in range(len(vocabulary)):
            bigram = vocabulary[j]
            count = count_matrix[i][j]
            
            prob_matrix[i][j] = count / doc_sums[i]
            
    return prob_matrix

In [34]:
count_matrix, vocabulary = create_bigram_count_matrix(docs) 

prob_matrix = create_bigram_prob_matrix(count_matrix, vocabulary)

In [36]:
prob_matrix.shape

(4, 1354)

### TF-IDF matrix

In [26]:
import math

In [27]:
def calculate_term_frequency(document):
    term_frequency = {}
    total_terms = len(document)

    for term in document:
        term_frequency[term] = term_frequency.get(term, 0) + 1

    for term, count in term_frequency.items():
        term_frequency[term] = count / total_terms

    return term_frequency

In [28]:
def calculate_inverse_document_frequency(corpus, term):
    document_count = sum(1 for document in corpus if term in document)
    if document_count > 0:
        return math.log(len(corpus) / document_count)
    else:
        return 0.0

In [29]:
def create_tfidf_matrix(corpus):
    # Tokenize the corpus into words
    tokens = [document.split() for document in corpus]

    # Get unique terms (words) from the corpus
    vocabulary = list(set(word for document in tokens for word in document))

    # Initialize TF-IDF matrix
    tfidf_matrix = []

    # Calculate TF-IDF values for each document and term
    for document in tokens:
        tfidf_vector = []
        term_frequency = calculate_term_frequency(document)

        for term in vocabulary:
            tf = term_frequency.get(term, 0)
            idf = calculate_inverse_document_frequency(tokens, term)
            tfidf_vector.append(tf * idf)

        tfidf_matrix.append(tfidf_vector)

    return tfidf_matrix, vocabulary

In [30]:
tfidf_matrix, vocabulary = create_tfidf_matrix(docs)

# Display the TF-IDF matrix and vocabulary
print("TF-IDF Matrix:")
for row in tfidf_matrix:
    print(row)

print("\nVocabulary:")
print(vocabulary)


TF-IDF Matrix:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0, 0.0020326896790614233, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008130758716245693, 0.0, 0.0, 0.0, 0.0, 0.0040653793581228465, 0.0040653793581228465, 0.0, 0.0, 0.0040653793581228465, 0.0020326896790614233, 0.0016872848824151369, 0.0, 0.0, 0.0040653793581228465, 0.0040653793581228465, 0.0, 0.0040653793581228465, 0.0, 0.0, 0.0020326896790614233, 0.0, 0.0, 0.0040653793581228465, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0025309273236227055, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0040653793581228465, 0.0020326896790614233, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0020326896790614233, 0.0, 0.0, 0.0, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0020326896790614233, 0.0, 0.0040653793581228465, 0.0020326896790614233, 0.0020326896790614233, 0.0, 0.0, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0040653793581228465, 0.0040653793581228465, 0.0, 0.0, 0.0, 0.0, 0.0,

### Naive Bayes Classification

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [37]:
labels = [0,0,1,1]

# Split the data into training and testing sets
matrices = [unigram_count_matrix, bigram_count_matrix, tfidf_matrix]
matrix_labels = ['Unigram', 'Bigram', 'Tf-IDF']

for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(matrices[i], labels, test_size=0.25, random_state=42)

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, predictions)
    precision = metrics.precision_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)

    print(f"\n\nPerformance in {matrix_labels[i]} matrix")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)



Performance in Unigram matrix
Accuracy: 1.0
Precision: 0.0
Recall: 0.0


Performance in Bigram matrix
Accuracy: 1.0
Precision: 0.0
Recall: 0.0


Performance in Tf-IDF matrix
Accuracy: 0.0
Precision: 0.0
Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
