In [None]:
import re                           #Imports the regular expression module, which provides tools for text pattern matching and manipulation (used here to remove punctuation).
import nltk                         #Imports the Natural Language Toolkit (NLTK) to provide tools for tokenization, lemmatization, and more.
from nltk.corpus import stopwords   #Imports the stopwords corpus from NLTK, which contains a list of common words (such as "the", "and", "in") that are often removed in text preprocessing because they do not carry significant meaning.
from nltk.tokenize import word_tokenize   #Imports the word_tokenize function from NLTK, which splits a text into individual words (tokens).
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources this notebook relies on (only needs to run once):
#   punkt / punkt_tab -> tokenizer models needed for tokenization
#   stopwords         -> stopword lists in various languages, including English
#   wordnet           -> the WordNet lexical database used for lemmatization
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Example text: it mixes capital letters, punctuation, a contraction ("We'll")
# and stopwords, so every preprocessing step has something to act on.
text = "This is an example sentence, to demonstrate text preprocessing! We'll clean it and tokenize."

# Function for text preprocessing
def preprocess_text(text):
    """Turn raw English text into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase -> tokenize -> strip punctuation from each token ->
    remove English stopwords -> lemmatize.

    Tokenization deliberately happens *before* punctuation removal. Stripping
    punctuation from the raw string first fuses contractions into unrelated
    words ("we'll" -> "well", "she'll" -> "shell") that are not stopwords and
    therefore leak into the result. Tokenizing first lets the tokenizer split
    the contraction into pieces ("we", "'ll") that can each be cleaned and
    checked against the stopword list.

    Args:
        text: The input string to preprocess.

    Returns:
        A list of lowercase, lemmatized tokens with punctuation and English
        stopwords removed. An input with no usable tokens yields an empty list.
    """
    # 1. Lowercase the text so "Text" and "text" are treated as the same word.
    lowered = text.lower()

    # 2. Tokenize while apostrophes are still present.
    raw_tokens = word_tokenize(lowered)

    # 3. Remove punctuation and special characters from each token. Tokens that
    #    were pure punctuation (",", "!") become empty strings and are dropped below.
    stripped_tokens = [re.sub(r'[^\w\s]', '', token) for token in raw_tokens]

    # 4. Remove empty tokens and English stopwords.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in stripped_tokens if token and token not in stop_words]

    # 5. Lemmatize each remaining token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Run the preprocessing function on the example text and show the result.
cleaned_text = preprocess_text(text)     # list of tokens returned by preprocess_text
print("Cleaned Tokens:", cleaned_text)

Cleaned Tokens: ['example', 'sentence', 'demonstrate', 'text', 'preprocessing', 'well', 'clean', 'tokenize']


In [None]:
# spaCy demo: part-of-speech tagging and named entity recognition on one sample text.
import spacy

# Load spaCy's small English model.
nlp = spacy.load("en_core_web_sm")

# Sample text. It contains named entities ("Apple", "Steve Jobs"), a monetary
# amount ("$1 billion") and a date ("1976").
# Note: this rebinds the notebook-level name `text` assigned in an earlier cell.
text = "Apple is looking at buying U.K. startup for $1 billion. Steve Jobs founded Apple in 1976."

# Process the text; the returned object is iterated token by token below and
# exposes the detected entities through `.ents`.
doc = nlp(text)

# Part-of-Speech (POS) tagging: one line per token in the form
#   <token text>: <coarse-grained label, e.g. NOUN/VERB> (<fine-grained tag, e.g. NN/VBD>)
print("Part-of-Speech Tagging:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Named Entity Recognition (NER): one line per detected entity in the form
#   <entity text>: <label, e.g. ORG/GPE/DATE/MONEY> (<explanation of the label from spacy.explain>)
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_} ({spacy.explain(ent.label_)})")


Part-of-Speech Tagging:
Apple: PROPN (NNP)
is: AUX (VBZ)
looking: VERB (VBG)
at: ADP (IN)
buying: VERB (VBG)
U.K.: PROPN (NNP)
startup: VERB (VBD)
for: ADP (IN)
$: SYM ($)
1: NUM (CD)
billion: NUM (CD)
.: PUNCT (.)
Steve: PROPN (NNP)
Jobs: PROPN (NNP)
founded: VERB (VBD)
Apple: PROPN (NNP)
in: ADP (IN)
1976: NUM (CD)
.: PUNCT (.)

Named Entities:
Apple: ORG (Companies, agencies, institutions, etc.)
U.K.: GPE (Countries, cities, states)
$1 billion: MONEY (Monetary values, including unit)
Steve Jobs: PERSON (People, including fictional)
Apple: ORG (Companies, agencies, institutions, etc.)
1976: DATE (Absolute or relative dates or periods)


In [None]:
# TextBlob demo: sentiment polarity and subjectivity of one sample sentence.
from textblob import TextBlob

# Sample text (this rebinds the notebook-level name `text` used by earlier cells)
text = "I absolutely love this product! It's fantastic and works like a charm."

# Create a TextBlob object
blob = TextBlob(text)

# Perform sentiment analysis; the result carries the polarity and subjectivity printed below
sentiment = blob.sentiment

# Output sentiment polarity and subjectivity
print(f"Polarity: {sentiment.polarity}")  # Polarity: -1 (negative) to 1 (positive)
print(f"Subjectivity: {sentiment.subjectivity}")  # Subjectivity: 0 (objective) to 1 (subjective)


Polarity: 0.5125
Subjectivity: 0.75


In [None]:
!pip install gensim
import gensim                               #Imports the gensim library, which is a popular Python library for topic modeling, document similarity, and vector space modeling. It includes the LDA model, which we’ll use for topic modeling.
from gensim import corpora                  #Imports the corpora module from Gensim, which provides utilities for handling a corpus of documents. It includes methods for creating a dictionary (mapping words to unique IDs) and for creating document-term matrices.
from gensim.models import LdaModel          #Imports the LdaModel class from Gensim. This class is used for training an LDA model on a corpus to discover topics within a collection of documents.
from nltk.corpus import stopwords           #Imports the stopwords corpus from NLTK (Natural Language Toolkit), which contains a list of common words (e.g., "the", "is", "and") that are often removed from text during preprocessing.
from nltk.tokenize import word_tokenize     #Imports the word_tokenize function from NLTK, which is used for splitting a sentence into individual words or tokens.
import nltk             #Imports the main NLTK library to access other utilities like stopwords and tokenizers.
# Download the NLTK tokenizer models and stopword list used by preprocess()
# below (only needs to run once).
for resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(resource)

# Sample documents: five short sentences about artificial intelligence (AI) and
# machine learning (ML) that serve as the corpus for topic modeling below.
documents = [
    "Artificial intelligence is transforming the technology industry.",
    "Machine learning and AI are shaping the future of automation.",
    "Deep learning algorithms are a subset of machine learning.",
    "Quantum computing will revolutionize industries like AI.",
    "Healthcare is benefiting from AI and machine learning advances.",
]

# Preprocess the documents
def preprocess(doc):
    """Tokenize a document and keep only its alphabetic, non-stopword tokens.

    The document is lowercased before tokenization, so "AI" and "ai" end up
    as the same token.

    Args:
        doc: The document (sentence) as a string.

    Returns:
        A list of lowercase tokens, in their original order, excluding
        non-alphabetic tokens (punctuation, numbers) and English stopwords.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for candidate in word_tokenize(doc.lower()):
        if not candidate.isalpha():
            continue  # punctuation, numbers, mixed tokens
        if candidate in english_stopwords:
            continue  # common words such as "and", "the", "is"
        kept.append(candidate)
    return kept

# Tokenize and filter every sample document with preprocess().
processed_docs = list(map(preprocess, documents))

# Create a dictionary and document-term matrix:
# the dictionary maps each unique token to an integer id, and doc2bow encodes
# each document as a list of (token_id, count) tuples.
dictionary = corpora.Dictionary(processed_docs)
doc_term_matrix = list(map(dictionary.doc2bow, processed_docs))

# Train the LDA model (specifying 2 topics).
#   num_topics=2       -> number of topics the model should discover
#   id2word=dictionary -> lets the model map token ids back to words when reporting topics
#   passes=15          -> number of passes over the whole corpus during training
#   random_state=42    -> fixes the random initialisation; without it the discovered
#                         topics can differ on every run, so the notebook's output
#                         would not be reproducible
lda_model = LdaModel(
    doc_term_matrix,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print the topics with their 5 most heavily weighted words.
print("Topics discovered by LDA:")
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)        # each entry pairs a topic id with its weighted top words

# Document similarity: cosine similarity between the bag-of-words vectors of two
# new sentences. This compares the two documents directly; the LDA model trained
# above is not used here.
doc1_bow = dictionary.doc2bow(preprocess("AI and machine learning are advancing rapidly"))   # first sentence as (token_id, count) tuples
doc2_bow = dictionary.doc2bow(preprocess("Healthcare is benefiting from AI advances"))       # second sentence as (token_id, count) tuples

similarity = gensim.matutils.cossim(doc1_bow, doc2_bow)   # cosine of the angle between the two vectors; a higher value means more similar
print("\nDocument Similarity (cosine):", similarity)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Topics discovered by LDA:
(0, '0.115*"ai" + 0.065*"computing" + 0.065*"industries" + 0.065*"quantum" + 0.065*"like"')
(1, '0.128*"learning" + 0.091*"machine" + 0.054*"industry" + 0.054*"technology" + 0.054*"deep"')

Document Similarity (cosine): 0.2886751345948129


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
