In [6]:
# Sample paragraph
paragraph = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence. It involves the interaction between computers and humans using natural language."

# Importing necessary libraries
import nltk
import spacy
from gensim.utils import simple_preprocess
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# NLTK Preprocessing
def preprocess_nltk(text):
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    # Removing stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

# spaCy Preprocessing
def preprocess_spacy(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    # Tokenization, lowercasing, and removing stopwords
    tokens = [token.text.lower() for token in doc if not token.is_stop]
    # Lemmatization
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return tokens

# Gensim Preprocessing
def preprocess_gensim(text):
    # Simple preprocessing with Gensim
    tokens = simple_preprocess(text)
    return tokens

# Hugging Face's Transformers Preprocessing
def preprocess_transformers(text):
    # Using a smaller pre-trained model for tokenization
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokens = tokenizer.tokenize(text)
    return tokens

# scikit-learn Preprocessing
def preprocess_sklearn(text):
    # Using CountVectorizer and TfidfVectorizer for feature extraction
    count_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    count_tokens = count_vectorizer.fit_transform([text]).toarray()
    tfidf_tokens = tfidf_vectorizer.fit_transform([text]).toarray()
    return count_tokens, tfidf_tokens

# Applying preprocessing
nltk_tokens = preprocess_nltk(paragraph)
spacy_tokens = preprocess_spacy(paragraph)
gensim_tokens = preprocess_gensim(paragraph)
transformers_tokens = preprocess_transformers(paragraph)
count_tokens, tfidf_tokens = preprocess_sklearn(paragraph)

# Displaying results
print("NLTK Tokens:", nltk_tokens,"\n")
print("spaCy Tokens:", spacy_tokens,"\n")
print("Gensim Tokens:", gensim_tokens,"\n")
print("Transformers Tokens:", transformers_tokens,"\n")
print("CountVectorizer Tokens:", count_tokens,"\n")
print("TfidfVectorizer Tokens:", tfidf_tokens,"\n")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\suzan.awinat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suzan.awinat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NLTK Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'artifici', 'intellig', '.', 'involv', 'interact', 'comput', 'human', 'use', 'natur', 'languag', '.'] 

spaCy Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '.', 'involve', 'interaction', 'computer', 'human', 'natural', 'language', '.'] 

Gensim Tokens: ['natural', 'language', 'processing', 'nlp', 'is', 'fascinating', 'field', 'of', 'artificial', 'intelligence', 'it', 'involves', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language'] 

Transformers Tokens: ['natural', 'language', 'processing', '(', 'nl', '##p', ')', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', '.', 'it', 'involves', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.'] 

CountVectorizer Tokens: [[1 1 1 1 1 1 1 1 2 2 1 1 1]] 

TfidfVectorizer Tokens: [[0.2294157