<a href="https://colab.research.google.com/github/NargesSamaeii/NLP_Assignment/blob/main/NLP_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab bootstrap: %pip (unlike !pip) installs into the environment of the running kernel.
%pip install wikipedia-api



In [None]:
import nltk
import wikipediaapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist, download
import numpy as np
import requests


In [None]:
# Download every NLTK resource this notebook relies on:
#   - stopwords                       -> stopwords.words('english')
#   - punkt / punkt_tab               -> word_tokenize (newer NLTK releases look for punkt_tab)
#   - wordnet                         -> WordNetLemmatizer
#   - averaged_perceptron_tagger(_eng) -> pos_tag, called by extract_nouns
# Without the tagger model, pos_tag raises LookupError on a fresh kernel.
NLTK_RESOURCES = (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Set up the Wikipedia API client.
# In current wikipedia-api releases the first positional parameter is the user
# agent, not the language, so every argument is passed by keyword to make the
# intent explicit: English Wikipedia, plain-text extracts, 'Narges' as agent.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='Narges',
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

# Function to retrieve text from a Wikipedia page
def fetch_wikipedia_text(page_title):
    """Return the text of the Wikipedia page ``page_title``, or ``None``.

    ``None`` is returned (after printing a diagnostic) both when the page
    does not exist and when any exception is raised while talking to the
    module-level ``wiki_wiki`` client.
    """
    try:
        wiki_page = wiki_wiki.page(page_title)
        if wiki_page.exists():
            return wiki_page.text
        print(f"Wikipedia page '{page_title}' does not exist.")
    except Exception as err:
        # Best effort: report the failure and fall through to the None result.
        print(f"An error occurred while fetching Wikipedia page '{page_title}': {err}")
    return None



In [None]:
# Wikipedia page titles used as labelled training topics.
# Geographical topics (label 1 in the training loop):
geo_topics = [
    'Europe',
    'Asia',
    'Africa',
    'North America',
    'South America',
    'Australia',
    'Antarctica',
    'Mountain',
    'River',
    'Desert',
]
# Non-geographical topics (label 0 in the training loop):
non_geo_topics = [
    'programming',
    'technology',
    'history',
    'medical',
    'estimates',
    'behave',
    'physic',
    'economy',
]

# Function to preprocess text with optional stemming and lemmatization
def preprocess_text(text, stop_words, use_stemming=False, use_lemmatization=False):
    """Tokenize ``text`` and return its lowercase, alphanumeric, non-stop-word tokens.

    Args:
        text: Raw text to tokenize with ``word_tokenize``.
        stop_words: Collection of lowercase words to discard.
        use_stemming: If true, stem each kept token with the English Snowball stemmer.
        use_lemmatization: If true, lemmatize each kept token with the WordNet
            lemmatizer (applied after stemming when both flags are set).

    Returns:
        List of normalised tokens in their original order.
    """
    stemmer = nltk.stem.SnowballStemmer('english') if use_stemming else None
    lemmatizer = nltk.stem.WordNetLemmatizer() if use_lemmatization else None

    tokens = []
    for raw_word in word_tokenize(text):
        # Lowercase first so stop-word lookup and lemmatization both see the
        # same normalised form regardless of capitalisation.
        token = raw_word.lower()

        # Filter stop words BEFORE stemming: a stem such as "becaus" (from
        # "because") no longer matches the stop-word list and would slip through.
        if token in stop_words:
            continue

        if stemmer is not None:
            token = stemmer.stem(token)
        if lemmatizer is not None:
            token = lemmatizer.lemmatize(token)

        # A normalised form can itself be a stop word, so check once more,
        # and keep only purely alphanumeric tokens (drops punctuation).
        if token.isalnum() and token not in stop_words:
            tokens.append(token)

    return tokens

# Function to extract keywords with optional stemming and lemmatization
def extract_keywords(text, stop_words, use_stemming=False, use_lemmatization=False):
    """Return the keyword tokens of ``text``.

    Thin alias for ``preprocess_text``: all arguments are forwarded unchanged
    and its result is returned as-is.
    """
    return preprocess_text(
        text,
        stop_words,
        use_stemming=use_stemming,
        use_lemmatization=use_lemmatization,
    )

# Function to extract nouns with optional stemming and lemmatization
def extract_nouns(text, stop_words, use_stemming=False, use_lemmatization=False):
    """Return the lowercase, alphanumeric, non-stop-word nouns found in ``text``.

    Args:
        text: Raw text; it is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
        stop_words: Collection of lowercase words to discard.
        use_stemming: If true, stem each kept noun with the English Snowball stemmer.
        use_lemmatization: If true, lemmatize each kept noun with the WordNet
            lemmatizer (applied after stemming when both flags are set).

    Returns:
        List of normalised nouns (tokens whose POS tag starts with ``'N'``),
        in their original order.
    """
    stemmer = nltk.stem.SnowballStemmer('english') if use_stemming else None
    lemmatizer = nltk.stem.WordNetLemmatizer() if use_lemmatization else None

    nouns = []
    # Tagging runs on the untouched tokens; normalisation happens afterwards
    # and only for the tokens that were tagged as nouns.
    for raw_word, pos in pos_tag(word_tokenize(text)):
        if not pos.startswith('N'):
            continue

        noun = raw_word.lower()

        # Filter stop words BEFORE stemming: a stem such as "becaus" (from
        # "because") no longer matches the stop-word list and would slip through.
        if noun in stop_words:
            continue

        if stemmer is not None:
            noun = stemmer.stem(noun)
        if lemmatizer is not None:
            noun = lemmatizer.lemmatize(noun)

        # A normalised form can itself be a stop word, so check once more,
        # and keep only purely alphanumeric tokens.
        if noun.isalnum() and noun not in stop_words:
            nouns.append(noun)

    return nouns

# Function to extract top nouns from topics with optional stemming and lemmatization
def find_top_nouns(topics, stop_words, num_top_nouns=10, use_stemming=False, use_lemmatization=False):
    """Return the ``num_top_nouns`` most frequent nouns across the pages of ``topics``.

    Each topic is fetched with ``fetch_wikipedia_text``; topics whose fetch
    yields no text are skipped. Nouns come from ``extract_nouns`` with the
    given stop words and normalisation flags, and are ranked by total count
    over all fetched pages.
    """
    noun_counts = FreqDist()

    for topic in topics:
        page_text = fetch_wikipedia_text(topic)
        if not page_text:
            continue
        noun_counts.update(
            extract_nouns(page_text, stop_words, use_stemming, use_lemmatization)
        )

    return [noun for noun, _count in noun_counts.most_common(num_top_nouns)]


In [None]:
# Build the keyword vocabulary: the most frequent nouns of each topic group.
stop_words = set(stopwords.words('english'))
top_geo_keywords = find_top_nouns(geo_topics, stop_words, num_top_nouns=10, use_stemming=True, use_lemmatization=True)
top_non_geo_keywords = find_top_nouns(non_geo_topics, stop_words, num_top_nouns=10, use_stemming=True, use_lemmatization=True)

# One training document per fetched topic page; label 1 = geographical, 0 = non-geographical.
all_topics = geo_topics + non_geo_topics
all_docs = []
all_labels = []

for topic in all_topics:
    text = fetch_wikipedia_text(topic)
    if text:
        keywords = extract_keywords(text, stop_words, use_stemming=True, use_lemmatization=True)
        all_docs.append(" ".join(keywords))
        all_labels.append(1 if topic in geo_topics else 0)

# Both classes must be present, otherwise the classifiers cannot be trained.
assert len(set(all_labels)) == 2, "Need at least one fetched page per class to train the classifiers."

# The same noun can rank among the top nouns of both groups. TfidfVectorizer
# rejects a vocabulary containing repeated terms, so drop duplicates while
# preserving first-seen order.
all_top_keywords = list(dict.fromkeys(top_geo_keywords + top_non_geo_keywords))
vectorizer = TfidfVectorizer(vocabulary=all_top_keywords)
X = vectorizer.fit_transform(all_docs)
y = all_labels

# Stratify so the very small test split keeps the class proportions instead of
# possibly containing a single class.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression classifier
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)
logistic_predictions = logistic_classifier.predict(X_test)

# Evaluate accuracy for Logistic Regression
accuracy_logistic = accuracy_score(y_test, logistic_predictions)
print(f"Accuracy (Customized Logistic Regression): {accuracy_logistic}")

# Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)
naive_bayes_predictions = naive_bayes_classifier.predict(X_test)

# Evaluate accuracy, precision, and recall for Naive Bayes.
# zero_division=0 reports 0 (instead of emitting a warning) when a metric is
# undefined, e.g. when no test sample is predicted as geographical.
accuracy_naive_bayes = accuracy_score(y_test, naive_bayes_predictions)
precision_naive_bayes = precision_score(y_test, naive_bayes_predictions, zero_division=0)
recall_naive_bayes = recall_score(y_test, naive_bayes_predictions, zero_division=0)

# Print the results for Naive Bayes
print(f"Accuracy (Customized Naïve Bayes): {accuracy_naive_bayes}")
print(f"Precision (Customized Naïve Bayes): {precision_naive_bayes}")
print(f"Recall (Customized Naïve Bayes): {recall_naive_bayes}")

# Classify a page that was not part of the training topics.
subject = "Germany"
website_text = fetch_wikipedia_text(subject)

if website_text:
    # Vectorize the page text with the already-fitted TF-IDF vectorizer
    website_vectorized = vectorizer.transform([" ".join(extract_keywords(website_text, stop_words, use_stemming=True, use_lemmatization=True))])

    # Naive Bayes and Logistic Regression predictions using TF-IDF for the page
    naive_bayes_website_prediction = naive_bayes_classifier.predict(website_vectorized)
    logistic_website_prediction = logistic_classifier.predict(website_vectorized)

    for model_name, prediction in (
        ("Naïve Bayes", naive_bayes_website_prediction),
        ("Logistic Regression", logistic_website_prediction),
    ):
        # predict() returns a one-element array; compare its single entry
        # rather than relying on the truthiness of an array comparison.
        verdict = "geographical" if prediction[0] == 1 else "non-geographical"
        print(f'The content of the "{subject}" is {verdict} according to Customized {model_name} with TF-IDF.')
else:
    print(f'Unable to fetch text from the "{subject}". Please check the Subject or try another subject.')


Accuracy (Customized Logistic Regression): 0.5
Accuracy (Customized Naïve Bayes): 1.0
Precision (Customized Naïve Bayes): 1.0
Recall (Customized Naïve Bayes): 1.0
The content of the "Germany" is geographical according to Customized Naïve Bayes with TF-IDF.
The content of the "Germany" is non-geographical according to Customized Logistic Regression with TF-IDF.
