In [4]:
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this script relies on: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Wikipedia API endpoint
wiki_api_url = "https://en.wikipedia.org/w/api.php"

# Function to fetch Wikipedia content for a given title
def fetch_wikipedia_content(article_title, num_sentences=2):
    """Return the plain-text intro extract of a Wikipedia article.

    Args:
        article_title: Title of the article to look up.
        num_sentences: Maximum number of intro sentences to request.

    Returns:
        The extract text the API reports for the first page in the response.

    Raises:
        requests.HTTPError: If the API answers with an error HTTP status.
        requests.Timeout: If the API does not respond within the timeout.
        KeyError: If the response carries no extract for the title
            (for example, because the page does not exist).
    """
    params = {
        'action': 'query',
        'format': 'json',
        'titles': article_title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
        'exsentences': num_sentences,
        # Resolve redirects so the extract comes from the target article
        # rather than from the redirect page itself.
        'redirects': True,
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(wiki_api_url, params=params, timeout=10)
    # Surface HTTP-level failures here instead of failing later on the JSON.
    response.raise_for_status()
    data = response.json()
    # 'pages' is keyed by page id; only one title is requested, so take the first entry.
    page = next(iter(data['query']['pages'].values()))
    if 'extract' not in page:
        raise KeyError(f"No extract returned for article title {article_title!r}")
    return page['extract']

# Function to pre-process text
def preprocess_text(text_content):
    """Normalise raw text into a space-separated string of lowercase tokens.

    Tokens that are not purely alphanumeric, and tokens whose lowercase form
    is an English stop word, are dropped.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text_content):
        # Skip punctuation and any token containing non-alphanumeric characters.
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in english_stop_words:
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Article titles that make up each of the two classes
geographic_article_titles = [
    "Geography",
    "Landforms",
    "Countries",
    "Cities",
    "Coordinates",
]
non_geographic_article_titles = [
    "Technology",
    "History_of_technology",
    "Engineering",
    "Computer_science",
    "Space_exploration",
    "Robotics",
]

# Download one extract per title (default sentence count), geographic class first
geographic_article_texts = list(map(fetch_wikipedia_content, geographic_article_titles))
non_geographic_article_texts = list(map(fetch_wikipedia_content, non_geographic_article_titles))

# Build the corpus and its labels in the same order: geographic, then non-geographic
all_article_texts = [*geographic_article_texts, *non_geographic_article_texts]
article_labels = (
    ['geographic' for _ in geographic_article_texts]
    + ['non-geographic' for _ in non_geographic_article_texts]
)
preprocessed_article_texts = list(map(preprocess_text, all_article_texts))

# Turn the cleaned texts into Bag-of-Words features
text_vectorizer = CountVectorizer()
X_feature_matrix = text_vectorizer.fit_transform(preprocessed_article_texts)
y_labels = article_labels

# Fit the Logistic Regression classifier (fit() hands back the fitted estimator)
classification_model = LogisticRegression().fit(X_feature_matrix, y_labels)

# Predict on the very matrix the model was fitted on.
# NOTE: the metrics below are therefore in-sample, not held-out, scores.
predicted_labels = classification_model.predict(X_feature_matrix)

# Report accuracy and the per-class breakdown
accuracy_result = accuracy_score(y_true=y_labels, y_pred=predicted_labels)
print(f"\nOverall Accuracy: {accuracy_result:.2f}")

print("\nClassification Report:")
print(classification_report(y_true=y_labels, y_pred=predicted_labels))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leuls\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leuls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Overall Accuracy: 1.00

Classification Report:
                precision    recall  f1-score   support

    geographic       1.00      1.00      1.00         5
non-geographic       1.00      1.00      1.00         6

      accuracy                           1.00        11
     macro avg       1.00      1.00      1.00        11
  weighted avg       1.00      1.00      1.00        11

