# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [31]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Download the NLTK resources this notebook relies on (only needed once per
# environment; repeated runs simply report the packages as up-to-date)
for nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [32]:
# Three short example sentences that make up the demo corpus
documents = [
    "Artificial Intelligence is transforming industries globally.",
    "Natural Language Processing is a subset of AI.",
    "Machine Learning algorithms are key to AI applications.",
]

# Keep the individual document names available as well
d1, d2, d3 = documents

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuation
- Tokenization
- Removal of stop words
- Lemmatization

### Tokenization

In [33]:
from nltk.tokenize import word_tokenize

# Lowercase every document first, then split it into tokens
# (punctuation marks come out as tokens of their own at this stage)
tokenized_documents = [
    word_tokenize(raw_doc.lower())
    for raw_doc in documents
]
print("Tokenized Documents:", tokenized_documents, sep="\n")

Tokenized Documents:
[['artificial', 'intelligence', 'is', 'transforming', 'industries', 'globally', '.'], ['natural', 'language', 'processing', 'is', 'a', 'subset', 'of', 'ai', '.'], ['machine', 'learning', 'algorithms', 'are', 'key', 'to', 'ai', 'applications', '.']]


### Stopword removal

In [34]:
from nltk.corpus import stopwords

# English stop words as a set, so the membership test below is a set lookup
stop_words = set(stopwords.words('english'))

# Keep a token only if it is purely alphanumeric (this drops punctuation
# tokens such as '.') and it is not an English stop word
filtered_documents = [
    [tok for tok in doc_tokens if tok.isalnum() and tok not in stop_words]
    for doc_tokens in tokenized_documents
]
print("Documents after Stopword Removal:", filtered_documents, sep="\n")

Documents after Stopword Removal:
[['artificial', 'intelligence', 'transforming', 'industries', 'globally'], ['natural', 'language', 'processing', 'subset', 'ai'], ['machine', 'learning', 'algorithms', 'key', 'ai', 'applications']]


### Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

# Reduce every remaining token to its WordNet lemma.
# lemmatize() receives the word only (no POS argument), so NLTK treats each
# word as a noun by default.
lemmatizer = WordNetLemmatizer()
lemmatized_documents = [
    list(map(lemmatizer.lemmatize, doc_tokens))
    for doc_tokens in filtered_documents
]
print("Lemmatized Documents:", lemmatized_documents, sep="\n")

Lemmatized Documents:
[['artificial', 'intelligence', 'transforming', 'industry', 'globally'], ['natural', 'language', 'processing', 'subset', 'ai'], ['machine', 'learning', 'algorithm', 'key', 'ai', 'application']]


### Combine tokens back into sentences

In [36]:
# Re-assemble each token list into one space-separated string per document
preprocessed_documents = list(map(" ".join, lemmatized_documents))
print("Preprocessed Documents:", preprocessed_documents, sep="\n")

Preprocessed Documents:
['artificial intelligence transforming industry globally', 'natural language processing subset ai', 'machine learning algorithm key ai application']


## Redefine the text corpus (pre-processed)

In [37]:
# Use the preprocessed documents as the corpus for the vectorizers below
# (plain assignment: `corpus` refers to the same list, no copy is made)
corpus = preprocessed_documents

## Document-term matrix with ngram_range=(1,1)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix built from single words (unigrams) only
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1))
dtm_unigram = vectorizer_unigram.fit_transform(corpus)

# Feature names first, then the dense count matrix (one row per document)
print(
    "Unigram Features:",
    vectorizer_unigram.get_feature_names_out(),
    "Unigram Matrix:",
    dtm_unigram.toarray(),
    sep="\n",
)


Unigram Features:
['ai' 'algorithm' 'application' 'artificial' 'globally' 'industry'
 'intelligence' 'key' 'language' 'learning' 'machine' 'natural'
 'processing' 'subset' 'transforming']
Unigram Matrix:
[[0 0 0 1 1 1 1 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 1 0 0 1 1 1 0]
 [1 1 1 0 0 0 0 1 0 1 1 0 0 0 0]]


## Document-term matrix with ngram_range=(2,2)

In [39]:
# Document-term matrix built from pairs of adjacent words (bigrams) only
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
dtm_bigram = vectorizer_bigram.fit_transform(corpus)

# Feature names first, then the dense count matrix (one row per document)
print(
    "Bigram Features:",
    vectorizer_bigram.get_feature_names_out(),
    "Bigram Matrix:",
    dtm_bigram.toarray(),
    sep="\n",
)


Bigram Features:
['ai application' 'algorithm key' 'artificial intelligence'
 'industry globally' 'intelligence transforming' 'key ai'
 'language processing' 'learning algorithm' 'machine learning'
 'natural language' 'processing subset' 'subset ai'
 'transforming industry']
Bigram Matrix:
[[0 0 1 1 1 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 1 1 1 0]
 [1 1 0 0 0 1 0 1 1 0 0 0 0]]


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [40]:
# Term frequencies: raw per-document word counts from a CountVectorizer
# created with its default settings
tf_vectorizer = CountVectorizer()
tf_matrix = tf_vectorizer.fit_transform(corpus)

# Feature names first, then the dense count matrix (one row per document)
print(
    "TF Features:",
    tf_vectorizer.get_feature_names_out(),
    "TF Matrix:",
    tf_matrix.toarray(),
    sep="\n",
)

TF Features:
['ai' 'algorithm' 'application' 'artificial' 'globally' 'industry'
 'intelligence' 'key' 'language' 'learning' 'machine' 'natural'
 'processing' 'subset' 'transforming']
TF Matrix:
[[0 0 0 1 1 1 1 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 1 0 0 1 1 1 0]
 [1 1 1 0 0 0 0 1 0 1 1 0 0 0 0]]


### Inverse Document Frequency (IDF)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the corpus; the IDF weights learned during
# fitting are read from the fitted vectorizer's `idf_` attribute
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
idf_matrix = tfidf_vectorizer.idf_

# Feature names first, then the IDF weights
print(
    "IDF Features:",
    tfidf_vectorizer.get_feature_names_out(),
    "IDF Matrix:",
    idf_matrix,
    sep="\n",
)

IDF Features:
['ai' 'algorithm' 'application' 'artificial' 'globally' 'industry'
 'intelligence' 'key' 'language' 'learning' 'machine' 'natural'
 'processing' 'subset' 'transforming']
IDF Matrix:
[1.28768207 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718]


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [42]:
# Dense view of the TF-IDF matrix returned by fit_transform in the IDF cell
print("TF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

TF-IDF Matrix:
[[0.         0.         0.         0.4472136  0.4472136  0.4472136
  0.4472136  0.         0.         0.         0.         0.
  0.         0.         0.4472136 ]
 [0.35543247 0.         0.         0.         0.         0.
  0.         0.         0.46735098 0.         0.         0.46735098
  0.46735098 0.46735098 0.        ]
 [0.32200242 0.42339448 0.42339448 0.         0.         0.
  0.         0.42339448 0.         0.42339448 0.42339448 0.
  0.         0.         0.        ]]


## Part-of-Speech (POS) tagging
For the meaning of the POS tags, see: https://pythonexamples.org/nltk-pos-tagging

In [45]:
# NOTE: `word_tokenize` and `pos_tag` are imported in the setup cell, which
# also downloads the 'averaged_perceptron_tagger' resource, so nothing is
# re-imported or re-downloaded here.

# Sentence to tag; it is tokenized with its original casing
text = "Artificial intelligence is transforming industries and improving efficiency."

# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# POS tagging: yields one (token, tag) pair per token
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


Tokens: ['Artificial', 'intelligence', 'is', 'transforming', 'industries', 'and', 'improving', 'efficiency', '.']
POS Tags: [('Artificial', 'JJ'), ('intelligence', 'NN'), ('is', 'VBZ'), ('transforming', 'VBG'), ('industries', 'NNS'), ('and', 'CC'), ('improving', 'VBG'), ('efficiency', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Explanation of Five POS Tags

1. **NN**: Noun, singular or mass. A common noun in its singular form or a mass noun (e.g. "intelligence", "efficiency").
2. **JJ**: Adjective. Describes a property or quality of a noun.
3. **VBZ**: Verb, 3rd person singular present. A verb in the present tense, third person singular.
4. **VBG**: Verb, gerund or present participle. A verb in the gerund or present participle form.
5. **CC**: Coordinating conjunction. Connects words, phrases, or clauses that are grammatically equivalent.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [44]:
# Footer cell: prints basic information about the runtime environment
# (OS family, platform and release, current timestamp, Python version)
import os
import platform
import socket  # NOTE(review): imported but not used anywhere in this cell
from platform import python_version
from datetime import datetime

print('-----------------------------------')
# OS family name as reported by Python, upper-cased
print(os.name.upper())
# Platform name and release string, separated by '|'
print(platform.system(), '|', platform.release())
# Timestamp of the moment this cell is executed (YYYY-MM-DD HH:MM:SS)
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Version of the Python interpreter running the kernel
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-12-15 16:44:00
Python Version: 3.11.10
-----------------------------------
