In [24]:
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np

In [37]:
Sports_URLs = ['https://en.wikipedia.org/wiki/Association_football',
        'https://en.wikipedia.org/wiki/Cricket',
        'https://en.wikipedia.org/wiki/Badminton',
        'https://en.wikipedia.org/wiki/Basketball',
        'https://en.wikipedia.org/wiki/Hockey']

Education_URLs = ['https://en.wikipedia.org/wiki/School',
        'https://en.wikipedia.org/wiki/College',
        'https://en.wikipedia.org/wiki/University',
        'https://en.wikipedia.org/wiki/Professor',
        'https://en.wikipedia.org/wiki/Teacher']

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    
sports_content = []
education_content = []

for url in Sports_URLs:
    content = get_page_content(url)
    if content:
        sports_content.append(content)

for url in Education_URLs:
    content = get_page_content(url)
    if content:
        education_content.append(content)



In [38]:
def clean_text(html_content):
   
    soup = BeautifulSoup(html_content, 'html.parser')
    text_content = re.sub(r'<.*?>', '', soup.get_text(separator=' ', strip=True))
    text_content = re.sub(r'[^a-zA-Z\s]', '', text_content)

    text_content = re.sub(r'\s+', ' ', text_content)

    return text_content

cleaned_sports_content = []

for content in sports_content:
    cleaned_content = clean_text(content)
    cleaned_sports_content.append(cleaned_content)

cleaned_education_content = []

for content in education_content:
    cleaned_content = clean_text(content)
    cleaned_education_content.append(cleaned_content)
    
combined_list = cleaned_sports_content + cleaned_education_content



In [39]:
cleaned_sports_content

['Association football Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Donate Contribute Help Learn to edit Community portal Recent changes Upload file Languages Language links are at the top of the page Search Search Create account Log in Personal tools Create account Log in Pages for logged out editors learn more Contributions Talk Contents move to sidebar hide Top Name History Toggle History subsection Womens association football Gameplay Laws Toggle Laws subsection Players equipment and officials Ball Pitch Duration and tiebreaking methods minute ordinary time Tiebreaking Ball in and out of play Misconduct Onfield Offfield Governing bodies International competitions Domestic competitions See also Notes References External links Toggle the table of contents Association football languages Ach Afrikaans Alemannisch Anarkiel nglisc Aragons Arpetan Asturianu Avae Aymar aru Azrbaycan

In [40]:
import pandas as pd

df = pd.DataFrame({
    "text": cleaned_sports_content + cleaned_education_content,
    "category": ["Sports"]*len(cleaned_sports_content)+["Education"]*len(cleaned_education_content)
})

df

Unnamed: 0,text,category
0,Association football Wikipedia Jump to content...,Sports
1,Cricket Wikipedia Jump to content Main menu Ma...,Sports
2,Badminton Wikipedia Jump to content Main menu ...,Sports
3,Basketball Wikipedia Jump to content Main menu...,Sports
4,Hockey Wikipedia Jump to content Main menu Mai...,Sports
5,School Wikipedia Jump to content Main menu Mai...,Education
6,College Wikipedia Jump to content Main menu Ma...,Education
7,University Wikipedia Jump to content Main menu...,Education
8,Professor Wikipedia Jump to content Main menu ...,Education
9,Teacher Wikipedia Jump to content Main menu Ma...,Education


In [41]:
from collections import Counter
import numpy as np

def get_unigram_counts(texts):
    unigram_counts = Counter()
    for text in texts:
        unigram_counts.update(text.split())
    return unigram_counts

unigram_counts = get_unigram_counts(df["text"])
unigram_counts

def get_unigram_count_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

unigram_count_matrix = get_unigram_count_matrix(df["text"], unigram_counts)
unigram_count_matrix

unigram_count_df = pd.DataFrame(unigram_count_matrix, columns=unigram_counts.keys())
unigram_count_df

Unnamed: 0,Association,football,Wikipedia,Jump,to,content,Main,menu,move,sidebar,...,Elsbree,Democracy,Parkerson,Transitions,Jo,Ann,OECDs,GPS,httpsenwikipediaorgwindexphptitleTeacheroldid,LNB
0,52.0,178.0,10.0,1.0,171.0,2.0,14.0,2.0,6.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,19.0,8.0,1.0,224.0,2.0,26.0,2.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,10.0,1.0,198.0,2.0,5.0,2.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30.0,18.0,8.0,3.0,215.0,2.0,16.0,2.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,16.0,7.0,1.0,64.0,2.0,10.0,2.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,7.0,1.0,90.0,5.0,7.0,2.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,7.0,1.0,176.0,2.0,10.0,2.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,9.0,1.0,139.0,3.0,7.0,2.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,7.0,1.0,44.0,2.0,4.0,2.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5.0,0.0,8.0,1.0,153.0,3.0,17.0,2.0,4.0,3.0,...,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
def get_bigram_counts(texts):
    bigram_counts = Counter()
    for text in texts:
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts.update(bigrams)
    return bigram_counts

bigram_counts = get_bigram_counts(df["text"])
bigram_counts

def get_bigram_prob_matrix(texts, bigram_counts, unigram_counts):
    matrix = np.zeros((len(texts), len(bigram_counts)))
    for i, text in enumerate(texts):
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        for j, bigram in enumerate(bigram_counts):
            matrix[i, j] = bigram_counts[bigram]/unigram_counts[bigram[0]]
    return matrix

bigram_prob_matrix = get_bigram_prob_matrix(df["text"], bigram_counts, unigram_counts)
bigram_prob_matrix

bigram_prob_df = pd.DataFrame(bigram_prob_matrix, columns=bigram_counts.keys())
bigram_prob_df

Unnamed: 0_level_0,Association,football,Wikipedia,Jump,to,content,Main,menu,menu,move,...,httpsenwikipediaorgwindexphptitleTeacheroldid,Categories,occupations,Positions,of,authority,research,September,with,LNB
Unnamed: 0_level_1,football,Wikipedia,Jump,to,content,Main,menu,Main,move,to,...,Categories,Teaching,Positions,of,authority,Hidden,Articles,Commons,LNB,identifiers
0,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
1,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
2,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
3,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
4,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
5,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
6,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
7,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
8,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0
9,0.285714,0.00431,0.123457,0.833333,0.006784,0.4,0.172414,0.5,0.5,0.666667,...,1.0,0.111111,0.2,0.142857,0.000365,0.076923,0.017544,0.008065,0.001692,1.0


In [43]:
def get_tf_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

tf_matrix = get_tf_matrix(df["text"], unigram_counts)
tf_matrix

def get_idf_vector(texts, unigram_counts):
    idf_vector = np.zeros(len(unigram_counts))
    for j, word in enumerate(unigram_counts):
        idf_vector[j] = np.log(len(texts)/sum([1 for text in texts if word in text]))
    return idf_vector

idf_vector = get_idf_vector(df["text"], unigram_counts)
idf_vector

def get_tfidf_matrix(tf_matrix, idf_vector):
    return tf_matrix*idf_vector

tfidf_matrix = get_tfidf_matrix(tf_matrix, idf_vector)
tfidf_matrix

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unigram_counts.keys())
tfidf_df

Unnamed: 0,Association,football,Wikipedia,Jump,to,content,Main,menu,move,sidebar,...,Elsbree,Democracy,Parkerson,Transitions,Jo,Ann,OECDs,GPS,httpsenwikipediaorgwindexphptitleTeacheroldid,LNB
0,26.562932,123.380198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.510826,13.169796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.510826,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15.324769,12.476649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.021651,11.090355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2.554128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.302585,2.302585,6.907755,2.302585,0.0,0.693147,2.302585,2.302585,2.302585,2.302585


In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [45]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [46]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [49]:
import numpy as np
class NaiveBayesClassifier:
    def _init_(self):
        self.class_prior = None
        self.word_likelihoods = None
        self.vocab_size = None
    
    def fit(self, X_train, y_train):
        # Calculate class priors
        self.classes, class_counts = np.unique(y_train, return_counts=True)
        self.class_prior = class_counts / len(y_train)
        
        # Calculate word likelihoods
        self.vocab_size = X_train.shape[1]
        self.word_likelihoods = np.zeros((len(self.classes), self.vocab_size))
        for i, c in enumerate(self.classes):
            class_docs = X_train[y_train == c]
            total_words_in_class = np.sum(class_docs)
            self.word_likelihoods[i] = (np.sum(class_docs, axis=0) + 1) / (total_words_in_class + self.vocab_size)
    
    def predict(self, X_test):
        predictions = []
        for doc in X_test:
            posteriors = []
            for i, c in enumerate(self.classes):
                prior = self.class_prior[i]
                likelihood = np.prod(self.word_likelihoods[i, doc.nonzero()[0]]) # ignoring zero counts
                posterior = prior * likelihood
                posteriors.append(posterior)
            predicted_class = self.classes[np.argmax(posteriors)]
            predictions.append(predicted_class)
        return predictions
    
    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return np.mean(y_pred == y_test)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = NaiveBayesClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [54]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = NaiveBayesClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [55]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
model = NaiveBayesClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5