In [1]:
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources needed by word_tokenize and the stop-word list
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Base URL of the English Wikipedia MediaWiki API
wiki_api_url = "https://en.wikipedia.org/w/api.php"

# Function to fetch Wikipedia content for a given title
def fetch_wikipedia_content(title, num_sentences=2):
    """Return the plain-text intro extract of a Wikipedia article.

    Sends a ``query`` / ``extracts`` request to ``wiki_api_url`` and returns
    the ``extract`` field of the first page in the response.

    Args:
        title: Article title passed as the ``titles`` query parameter.
        num_sentences: Value sent as ``exsentences`` (number of intro
            sentences requested).

    Returns:
        The extract text of the page.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the response contains no page or no extract for
            ``title`` (e.g. the page does not exist).
    """
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
        'exsentences': num_sentences,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(wiki_api_url, params=params, timeout=10)
    # Surface HTTP errors here instead of failing later on an unexpected body.
    response.raise_for_status()
    data = response.json()

    pages = data.get('query', {}).get('pages', {})
    # Only one title is requested, so the first page entry is the one we want.
    page = next(iter(pages.values()), None)
    if page is None or 'extract' not in page:
        # Same exception type the bare dict lookups raised before, but with a
        # message that names the offending title.
        raise KeyError(f"No extract returned for Wikipedia title {title!r}")
    return page['extract']

# Function to pre-process text
def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of lowercase tokens.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphanumeric, or whose lowercase form is an English stop word, are
    dropped. The surviving tokens are lowercased and joined with single
    spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if not token.isalnum():
            # Skip punctuation and any token containing non-alphanumerics.
            continue
        lowered = token.lower()
        if lowered in english_stops:
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Wikipedia article titles that make up each of the two classes
geographical_titles = [
    "Geography",
    "Physical_geography",
    "Human_geography",
    "Cartography",
    "Climate",
    "Biogeography",
    "Tourism",
    "Countries_by_area",
]
non_geographical_titles = [
    "Technology",
    "History_of_technology",
    "Engineering",
    "Computer_science",
    "Space_exploration",
    "Robotics",
    "Artificial_intelligence",
    "Automotive_technology",
]

# Download the intro extract of every article, keeping one list per class
# (geographical first, then non-geographical, in title order)
geographical_texts = list(map(fetch_wikipedia_content, geographical_titles))
non_geographical_texts = list(map(fetch_wikipedia_content, non_geographical_titles))

# Combine and preprocess data
all_texts = geographical_texts + non_geographical_texts
labels = ['geographical'] * len(geographical_texts) + ['non-geographical'] * len(non_geographical_texts)
preprocessed_texts = [preprocess_text(text) for text in all_texts]
y = labels

# Split the documents into training and testing sets with stratification.
# The split happens BEFORE feature extraction so that the TF-IDF vocabulary
# and IDF weights are learned from the training documents only; fitting the
# vectorizer on the full corpus leaks test-set statistics into the features
# and can make the reported scores look better than they are.
texts_train, texts_test, y_train, y_test = train_test_split(
    preprocessed_texts, y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction using TF-IDF with different parameters
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_features=1000)  # Experiment with parameters
X_train = vectorizer.fit_transform(texts_train)
# Test documents are only transformed: n-grams unseen in training are ignored.
X_test = vectorizer.transform(texts_test)
# Whole corpus in the train-fitted feature space; kept so the name X remains
# available to any code that used the full feature matrix.
X = vectorizer.transform(preprocessed_texts)

# Train a Naive Bayes model with different alpha value
clf = MultinomialNB(alpha=0.1)  # Experiment with alpha
clf.fit(X_train, y_train)

# Predictions for the testing data
y_pred = clf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leulfeven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leulfeven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Overall Accuracy: 1.00

Classification Report:
                  precision    recall  f1-score   support

    geographical       1.00      1.00      1.00         2
non-geographical       1.00      1.00      1.00         2

        accuracy                           1.00         4
       macro avg       1.00      1.00      1.00         4
    weighted avg       1.00      1.00      1.00         4

