In [41]:
!pip install wikipedia-api


Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=be25afccd44e2b159dc3795a9d88fa39ae67c7b313dcab8111328a905ca0d9bd
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [60]:
import wikipediaapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk import classify
from nltk import NaiveBayesClassifier



In [61]:
# Download the NLTK resources used by this notebook (tokenizer model,
# stop-word list, WordNet data); already-present packages are skipped by NLTK.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Keyword lists that are searched for inside fetched article text,
# one list per target class.
location_terms = [
    "region",
    "area",
    "terrain",
    "nation",
    "town",
]
non_location_terms = [
    "innovation",
    "knowledge",
    "machine learning",
    "computing",
]

def fetch_wikipedia_content(topic, relevant_terms):
    """
    Fetch the text of an English Wikipedia article and report which of the
    given terms occur in it.

    Args:
        topic: Title of the Wikipedia page to look up.
        relevant_terms: Iterable of strings to look for. Matching is a
            case-insensitive substring test against the whole article text.

    Returns:
        A ``(content, found_terms)`` tuple. ``content`` is the page text and
        ``found_terms`` lists the entries of ``relevant_terms`` that occur in
        it, in their original order. ``(None, None)`` is returned (after
        printing a notice) when the page does not exist.
    """
    # Wikipedia-API (>= 0.6) takes the user agent as its first parameter, so
    # a positional 'en' would be read as the user agent, not the language.
    # Pass both by keyword to make the intent explicit.
    wiki_api = wikipediaapi.Wikipedia(
        user_agent='Farid_Tavakkolinia',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )

    wiki_page = wiki_api.page(topic)

    if not wiki_page.exists():
        print(f"Wikipedia page for '{topic}' does not exist.")
        return None, None

    content = wiki_page.text
    # Lower-case the article once instead of once per term.
    content_lower = content.lower()
    found_terms = [term for term in relevant_terms if term.lower() in content_lower]
    return content, found_terms




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Sanity check: fetch one article and list which keywords it mentions.
topic = "Machine learning"
content, found_terms = fetch_wikipedia_content(topic, non_location_terms)

if content:
    # Preview only the first 500 characters of the article.
    print(f"Content for '{topic}':", content[:500], sep="\n")
    print(f"Found Terms: {found_terms}")

def preprocess_text(text, terms, stopwords_set=None, stemmer=None, lemmatizer=None):
    """
    Normalize raw text into a space-separated string of cleaned tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic, are two characters or shorter, or are stop words are dropped.
    The remaining tokens are then lemmatized and/or stemmed.

    Args:
        text: Raw input text.
        terms: Accepted for interface compatibility with existing callers.
            It is not used to filter tokens.
        stopwords_set: Collection of words to drop. ``None`` selects NLTK's
            English stop-word list; an empty collection disables stop-word
            removal.
        stemmer: Object with a ``stem(token)`` method. When neither
            ``stemmer`` nor ``lemmatizer`` is given, a ``PorterStemmer`` is used.
        lemmatizer: Object with a ``lemmatize(token)`` method. When given, it
            is applied to each token (before stemming if a stemmer is also
            supplied).

    Returns:
        The processed tokens joined by single spaces.
    """
    # `is None` rather than `or`: an explicitly empty set must be honoured.
    if stopwords_set is None:
        stopwords_set = set(stopwords.words('english'))

    # Keep the historical default (Porter stemming) only when the caller asked
    # for neither normalizer; a lone lemmatizer must not be silently replaced
    # by a stemmer.
    if stemmer is None and lemmatizer is None:
        stemmer = PorterStemmer()

    # Tokenize, then drop punctuation/numbers, very short words and stop words.
    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and len(token) > 2 and token not in stopwords_set
    ]

    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if stemmer is not None:
        tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)



Content for 'Machine learning':
Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Within a subdiscipline in machine learning, advances in the field of deep learning have allowed neural networks, a class of statistical algorithms, to surpass many previous machine learning approaches in performance.
ML finds application in many fields, 
Found Terms: ['knowledge', 'machine learning', 'computing']


In [None]:
# Bag of Words taken straight from the lower-cased article, no cleaning
bow_text = content.lower()
bow_features = dict.fromkeys(word_tokenize(bow_text), True)

# NLTK's English stop-word list, passed to both preprocessing runs below
nltk_english_stopwords = set(stopwords.words('english'))

# Run 1: NLTK stop words + Snowball stemmer
snowball_stemmer = SnowballStemmer('english')
processed_text_with_snowball = preprocess_text(
    content,
    found_terms,
    stopwords_set=nltk_english_stopwords,
    stemmer=snowball_stemmer,
)

# Run 2: NLTK stop words + a WordNet lemmatizer handed to preprocess_text
wordnet_lemmatizer = WordNetLemmatizer()
processed_text_with_lemmatizer = preprocess_text(
    content,
    found_terms,
    stopwords_set=nltk_english_stopwords,
    lemmatizer=wordnet_lemmatizer,
)

# Show each result under its own heading; processed text is cut to 500 chars
print("\nBag of Words without preprocessing:", bow_features, sep="\n")
print("\nProcessed Text with Snowball stopwords and stemmer:", processed_text_with_snowball[:500], sep="\n")
print("\nProcessed Text with WordNet Lemmatizer:", processed_text_with_lemmatizer[:500], sep="\n")




Bag of Words without preprocessing:
{'machine': True, 'learning': True, '(': True, 'ml': True, ')': True, 'is': True, 'a': True, 'field': True, 'of': True, 'study': True, 'in': True, 'artificial': True, 'intelligence': True, 'concerned': True, 'with': True, 'the': True, 'development': True, 'and': True, 'statistical': True, 'algorithms': True, 'that': True, 'can': True, 'learn': True, 'from': True, 'data': True, 'generalize': True, 'to': True, 'unseen': True, ',': True, 'thus': True, 'perform': True, 'tasks': True, 'without': True, 'explicit': True, 'instructions': True, '.': True, 'within': True, 'subdiscipline': True, 'advances': True, 'deep': True, 'have': True, 'allowed': True, 'neural': True, 'networks': True, 'class': True, 'surpass': True, 'many': True, 'previous': True, 'approaches': True, 'performance': True, 'finds': True, 'application': True, 'fields': True, 'including': True, 'natural': True, 'language': True, 'processing': True, 'computer': True, 'vision': True, 'speech':

In [None]:
"""## Naive Bayes without Pre-processing:"""

# Class labels shared by every classifier in this notebook
geographic_class = "location"
non_geographic_class = "non-location"

# One lower-cased article per class, used as-is (no cleaning) for the
# Bag-of-Words Naive Bayes model
training_data_bow_raw = [
    (fetch_wikipedia_content(page_title, page_terms)[0].lower(), page_label)
    for page_title, page_terms, page_label in (
        ("Florence", location_terms, geographic_class),
        ("Machine learning", non_location_terms, non_geographic_class),
    )
]

# Tokenizer for un-cleaned text
def tokenize_raw_text_bow(text):
    """Return the tokens produced by ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Presence features for un-cleaned text
def extract_features_raw_bow(text):
    """Map every distinct token of ``text`` to ``True``."""
    return dict.fromkeys(tokenize_raw_text_bow(text), True)

# Turn each raw training document into a presence-feature dict
training_set_bow_raw = []
for raw_text, raw_label in training_data_bow_raw:
    training_set_bow_raw.append((extract_features_raw_bow(raw_text), raw_label))

# Fit NLTK's Naive Bayes on the un-cleaned Bag-of-Words features
nb_classifier_bow_raw = NaiveBayesClassifier.train(training_set_bow_raw)

# Classify one article per class; only the first result line is preceded
# by a blank line.
for line_prefix, page_title, page_terms in (
    ("\n", "Florence", location_terms),
    ("", "Data Science", non_location_terms),
):
    test_text_bow_raw = fetch_wikipedia_content(page_title, page_terms)[0].lower()
    test_features_bow_raw = extract_features_raw_bow(test_text_bow_raw)
    classification_bow_raw = nb_classifier_bow_raw.classify(test_features_bow_raw)
    print(f"{line_prefix}Predicted class for the test text without pre-processing for Bag of Words: {classification_bow_raw}")

"""## Naive Bayes with Pre-processing:

"""

# One article per class, cleaned with NLTK's English stop words and the
# Snowball stemmer before being used as Naive Bayes training text
training_data_bow_snowball = [
    (
        preprocess_text(
            fetch_wikipedia_content(page_title, page_terms)[0],
            page_terms,
            stopwords_set=set(stopwords.words('english')),
            stemmer=snowball_stemmer,
        ).lower(),
        page_label,
    )
    for page_title, page_terms, page_label in (
        ("Florence", location_terms, geographic_class),
        ("Machine learning", non_location_terms, non_geographic_class),
    )
]

# Tokenizer for already-cleaned text
def tokenize_preprocessed_text_bow(text):
    """Return the tokens produced by ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Presence features for already-cleaned text
def extract_features_preprocessed_bow(text):
    """Map every distinct token of ``text`` to ``True``."""
    return dict.fromkeys(tokenize_preprocessed_text_bow(text), True)

# Turn each cleaned training document into a presence-feature dict
training_set_bow_snowball = []
for stemmed_text, stemmed_label in training_data_bow_snowball:
    training_set_bow_snowball.append((extract_features_preprocessed_bow(stemmed_text), stemmed_label))

# Fit NLTK's Naive Bayes on the cleaned Bag-of-Words features
nb_classifier_bow_snowball = NaiveBayesClassifier.train(training_set_bow_snowball)

# Classify one article per class after applying the same cleaning as in
# training; only the first result line is preceded by a blank line.
for line_prefix, page_title, page_terms in (
    ("\n", "Florence", location_terms),
    ("", "Data Science", non_location_terms),
):
    test_text_bow_snowball = preprocess_text(
        fetch_wikipedia_content(page_title, page_terms)[0],
        page_terms,
        stopwords_set=set(stopwords.words('english')),
        stemmer=snowball_stemmer,
    ).lower()
    test_features_bow_snowball = extract_features_preprocessed_bow(test_text_bow_snowball)
    classification_bow_snowball = nb_classifier_bow_snowball.classify(test_features_bow_snowball)
    print(f"{line_prefix}Predicted class for the preprocessed test text with Snowball stop words and Snowball Stemmer for Bag of Words: {classification_bow_snowball}")




Predicted class for the test text without pre-processing for Bag of Words: location
Predicted class for the test text without pre-processing for Bag of Words: non-location

Predicted class for the preprocessed test text with Snowball stop words and Snowball Stemmer for Bag of Words: location
Predicted class for the preprocessed test text with Snowball stop words and Snowball Stemmer for Bag of Words: non-location


In [None]:
"""## Logistic Regression

### Training data for Logistic Regression with pre-processing from Naive Bayes
"""



# One article per class, cleaned with NLTK's English stop words and the
# Snowball stemmer, as training text for Logistic Regression
logistic_train_data_preprocessed = [
    (
        preprocess_text(
            fetch_wikipedia_content(page_title, page_terms)[0],
            page_terms,
            stopwords_set=set(stopwords.words('english')),
            stemmer=snowball_stemmer,
        ).lower(),
        page_label,
    )
    for page_title, page_terms, page_label in (
        ("Florence", location_terms, geographic_class),
        ("Machine learning", non_location_terms, non_geographic_class),
    )
]

# Tokenizer for already-cleaned text (Logistic Regression pipeline)
def tokenize_preprocessed_logistic_text(text):
    """Return the tokens produced by ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Document string for the TF-IDF vectorizer (cleaned text)
def extract_preprocessed_logistic_features(text):
    """Tokenize ``text`` and re-join the tokens with single spaces."""
    tokens = tokenize_preprocessed_logistic_text(text)
    return ' '.join(tokens)

# Pass every cleaned training text through the feature helper
logistic_train_set_preprocessed = []
for stemmed_text, stemmed_label in logistic_train_data_preprocessed:
    logistic_train_set_preprocessed.append(
        (extract_preprocessed_logistic_features(stemmed_text), stemmed_label)
    )

# Both names are also imported in the notebook's import cell; kept here as in
# the original cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Split the (text, label) pairs into parallel document / label lists
X_train_logistic_preprocessed = [pair[0] for pair in logistic_train_set_preprocessed]
y_train_logistic_preprocessed = [pair[1] for pair in logistic_train_set_preprocessed]

# TF-IDF vocabulary and weights are learned from the training documents only
tfidf_vectorizer_logistic_preprocessed = TfidfVectorizer()
X_train_tfidf_logistic_preprocessed = tfidf_vectorizer_logistic_preprocessed.fit_transform(X_train_logistic_preprocessed)

# Fit Logistic Regression on the TF-IDF matrix
logistic_regressor_preprocessed = LogisticRegression()
logistic_regressor_preprocessed.fit(X_train_tfidf_logistic_preprocessed, y_train_logistic_preprocessed)

# Classify one article per class after applying the same cleaning as in
# training; only the first result line is preceded by a blank line.
for line_prefix, page_title, page_terms in (
    ("\n", "Florence", location_terms),
    ("", "Data Science", non_location_terms),
):
    test_text_logistic_preprocessed = preprocess_text(
        fetch_wikipedia_content(page_title, page_terms)[0],
        page_terms,
        stopwords_set=set(stopwords.words('english')),
        stemmer=snowball_stemmer,
    ).lower()
    test_features_logistic_preprocessed = tfidf_vectorizer_logistic_preprocessed.transform(
        [extract_preprocessed_logistic_features(test_text_logistic_preprocessed)]
    )
    predicted_class_logistic_preprocessed = logistic_regressor_preprocessed.predict(test_features_logistic_preprocessed)
    print(f"{line_prefix}Predicted class for the test text using Logistic Regression with pre-processing: {predicted_class_logistic_preprocessed[0]}")

# One lower-cased article per class, used without any cleaning as training
# text for Logistic Regression
logistic_train_data_raw = [
    (fetch_wikipedia_content(page_title, page_terms)[0].lower(), page_label)
    for page_title, page_terms, page_label in (
        ("Florence", location_terms, geographic_class),
        ("Machine learning", non_location_terms, non_geographic_class),
    )
]

# Tokenizer for un-cleaned text (Logistic Regression pipeline)
def tokenize_raw_logistic_text(text):
    """Return the tokens produced by ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Document string for the TF-IDF vectorizer (un-cleaned text)
def extract_raw_logistic_features(text):
    """Tokenize ``text`` and re-join the tokens with single spaces."""
    tokens = tokenize_raw_logistic_text(text)
    return ' '.join(tokens)

# Pass every raw training text through the feature helper
logistic_train_set_raw = []
for raw_text, raw_label in logistic_train_data_raw:
    logistic_train_set_raw.append((extract_raw_logistic_features(raw_text), raw_label))

# Split the (text, label) pairs into parallel document / label lists
X_train_logistic_raw = [pair[0] for pair in logistic_train_set_raw]
y_train_logistic_raw = [pair[1] for pair in logistic_train_set_raw]

# TF-IDF vocabulary and weights are learned from the training documents only
tfidf_vectorizer_logistic_raw = TfidfVectorizer()
X_train_tfidf_logistic_raw = tfidf_vectorizer_logistic_raw.fit_transform(X_train_logistic_raw)

# Fit Logistic Regression on the TF-IDF matrix of un-cleaned text
logistic_regressor_raw = LogisticRegression()
logistic_regressor_raw.fit(X_train_tfidf_logistic_raw, y_train_logistic_raw)

"""### Usage for Logistic Regression without pre-processing

"""

# Classify one article per class; only the first result line is preceded
# by a blank line.
for line_prefix, page_title, page_terms in (
    ("\n", "Florence", location_terms),
    ("", "Data Science", non_location_terms),
):
    test_text_logistic_raw = fetch_wikipedia_content(page_title, page_terms)[0].lower()
    test_features_logistic_raw = tfidf_vectorizer_logistic_raw.transform(
        [extract_raw_logistic_features(test_text_logistic_raw)]
    )
    predicted_class_logistic_raw = logistic_regressor_raw.predict(test_features_logistic_raw)
    print(f"{line_prefix}Predicted class for the test text using Logistic Regression without pre-processing: {predicted_class_logistic_raw[0]}")



Predicted class for the test text using Logistic Regression with pre-processing: location
Predicted class for the test text using Logistic Regression with pre-processing: non-location

Predicted class for the test text using Logistic Regression without pre-processing: location
Predicted class for the test text using Logistic Regression without pre-processing: non-location


In [None]:

class TextClassifier:
    """
    Train and compare a Naive Bayes and a Logistic Regression classifier that
    label Wikipedia articles as "location" or "non-location".

    Article text is obtained through ``fetch_wikipedia_content`` and, when
    preprocessing is requested, cleaned by ``preprocess_text`` with a Snowball
    stemmer.
    """

    def __init__(self, location_terms, non_location_terms):
        """
        Args:
            location_terms: Terms passed along when fetching/cleaning
                location articles.
            non_location_terms: Terms passed along when fetching/cleaning
                non-location articles.
        """
        self.location_terms = location_terms
        self.non_location_terms = non_location_terms
        self.geographic_class = "location"
        self.non_geographic_class = "non-location"
        # Training corpora keyed by the preprocessing flag, filled on first
        # use so both classifiers share one download of the training pages.
        self._training_cache = {}

    def _preprocess(self, content, terms):
        """Clean one article exactly the same way for training and testing."""
        return preprocess_text(
            content,
            terms,
            stemmer=SnowballStemmer('english')
        ).lower()

    def prepare_training_data(self, use_preprocessing=True):
        """
        Fetch the training articles and return ``(text, label)`` pairs.

        Pages that cannot be fetched (no content) are skipped. With
        ``use_preprocessing`` the text is cleaned via ``preprocess_text``;
        otherwise it is only lower-cased.
        """
        training_examples = [
            ("Florence", self.location_terms, self.geographic_class),
            ("Paris", self.location_terms, self.geographic_class),
            ("London", self.location_terms, self.geographic_class),
            ("Machine Learning", self.non_location_terms, self.non_geographic_class),
            ("Data Science", self.non_location_terms, self.non_geographic_class),
            ("Artificial Intelligence", self.non_location_terms, self.non_geographic_class)
        ]

        results = []
        for topic, terms, label in training_examples:
            content = fetch_wikipedia_content(topic, terms)[0]
            if not content:
                continue
            if use_preprocessing:
                results.append((self._preprocess(content, terms), label))
            else:
                results.append((content.lower(), label))
        return results

    def _training_data(self, use_preprocessing):
        """
        Return the training corpus, downloading it only on first use per mode.

        Raises:
            ValueError: If no training article could be fetched at all.
        """
        # setdefault keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault('_training_cache', {})
        key = bool(use_preprocessing)
        if key not in cache:
            data = self.prepare_training_data(use_preprocessing)
            if not data:
                raise ValueError("No training articles could be fetched from Wikipedia.")
            cache[key] = data
        return cache[key]

    def extract_features(self, text):
        """
        Build the Naive Bayes feature dict for ``text``.

        Returns a ``{token: count}`` mapping over lower-cased tokens that are
        purely alphabetic and longer than two characters.
        """
        word_freq = {}
        for token in word_tokenize(text.lower()):
            if token.isalpha() and len(token) > 2:
                word_freq[token] = word_freq.get(token, 0) + 1
        return word_freq

    def train_naive_bayes(self, use_preprocessing=True):
        """Train and return an NLTK Naive Bayes classifier on the training corpus."""
        training_data = self._training_data(use_preprocessing)
        training_set = [(self.extract_features(text), label)
                        for (text, label) in training_data]
        return NaiveBayesClassifier.train(training_set)

    def train_logistic_regression(self, use_preprocessing=True):
        """
        Train Logistic Regression on TF-IDF features of the training corpus.

        Returns:
            ``(classifier, vectorizer)`` - the fitted model and the fitted
            ``TfidfVectorizer`` that must be used to transform new documents.
        """
        training_data = self._training_data(use_preprocessing)

        X_train = [text for (text, _) in training_data]
        y_train = [label for (_, label) in training_data]

        vectorizer = TfidfVectorizer(
            max_features=1000,  # Limit features to avoid overfitting
            min_df=2,          # Ignore terms that appear in less than 2 documents
            max_df=0.95,       # Ignore terms that appear in more than 95% of documents
            ngram_range=(1, 2) # Use both unigrams and bigrams
        )
        X_train_tfidf = vectorizer.fit_transform(X_train)

        classifier = LogisticRegression(
            C=1.0,            # Regularization strength
            class_weight='balanced', # Handle class imbalance
            random_state=42   # For reproducibility
        )
        classifier.fit(X_train_tfidf, y_train)

        return classifier, vectorizer

    @staticmethod
    def _evaluate(prepared_cases, predict):
        """
        Print one ``topic: prediction`` line per case and return the accuracy.

        Args:
            prepared_cases: Sequence of ``(topic, text, expected_label)``.
            predict: Callable mapping a text to a predicted label.

        Returns:
            Fraction of cases predicted correctly, or 0 when there are none.
        """
        correct = 0
        for topic, text, expected in prepared_cases:
            prediction = predict(text)
            print(f"{topic}: {prediction}")
            if prediction == expected:
                correct += 1
        return correct / len(prepared_cases) if prepared_cases else 0

    def test_classifiers(self):
        """
        Train both classifiers (with preprocessing), classify a fixed set of
        held-out Wikipedia articles with each, and print per-article
        predictions followed by both accuracies.
        """
        test_cases = [
            ("Rome", self.location_terms, self.geographic_class),
            ("Tokyo", self.location_terms, self.geographic_class),
            ("Berlin", self.location_terms, self.geographic_class),
            ("Computer Vision", self.non_location_terms, self.non_geographic_class),
            ("Natural Language Processing", self.non_location_terms, self.non_geographic_class),
            ("Deep Learning", self.non_location_terms, self.non_geographic_class)
        ]

        # Download and clean every test article once so both classifiers are
        # scored on identical text and no page is fetched twice.
        prepared_cases = []
        for topic, terms, expected in test_cases:
            content = fetch_wikipedia_content(topic, terms)[0]
            if content:
                prepared_cases.append((topic, self._preprocess(content, terms), expected))

        print("\n=== Testing Naive Bayes Classifier ===")
        nb_classifier = self.train_naive_bayes(use_preprocessing=True)
        nb_accuracy = self._evaluate(
            prepared_cases,
            lambda text: nb_classifier.classify(self.extract_features(text)),
        )

        print("\n=== Testing Logistic Regression Classifier ===")
        logistic_classifier, vectorizer = self.train_logistic_regression(use_preprocessing=True)
        lr_accuracy = self._evaluate(
            prepared_cases,
            lambda text: logistic_classifier.predict(vectorizer.transform([text]))[0],
        )

        print("\nClassifier Performance:")
        print(f"Naive Bayes accuracy on test cases: {nb_accuracy:.2f}")
        print(f"Logistic Regression accuracy score: {lr_accuracy:.2f}")


# Build the comparison harness from the two keyword lists defined earlier
classifier = TextClassifier(
    location_terms=location_terms,
    non_location_terms=non_location_terms,
)

# Trains both models and prints per-article predictions plus accuracies
classifier.test_classifiers()


=== Testing Naive Bayes Classifier ===
Rome: location
Tokyo: location
Berlin: location
Computer Vision: non-location
Natural Language Processing: non-location
Deep Learning: non-location

=== Testing Logistic Regression Classifier ===
Rome: location
Tokyo: location
Berlin: location
Computer Vision: non-location
Natural Language Processing: non-location
Deep Learning: non-location

Classifier Performance:
Naive Bayes accuracy on test cases: 1.00
Logistic Regression accuracy score: 1.00
