<a href="https://github.com/Goodnight77/GenerativeAI_bootcamp/blob/main/NLP_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Unicode normalization demo: print the UTF-8 byte encoding of an ASCII-only
# string and of a string containing Japanese characters.
text = "NLP workshop   ????"
text1 = '自然言語処理ワークショップ ????'

for sample in (text, text1):
    print(sample.encode('utf-8'))

b'NLP workshop   ????'
b'\xe8\x87\xaa\xe7\x84\xb6\xe8\xa8\x80\xe8\xaa\x9e\xe5\x87\xa6\xe7\x90\x86\xe3\x83\xaf\xe3\x83\xbc\xe3\x82\xaf\xe3\x82\xb7\xe3\x83\xa7\xe3\x83\x83\xe3\x83\x97 ????'


### Text Cleaning

In [None]:
import re
# Sample input for clean_text: it mixes angle brackets, a '#', a comma,
# a URL and an e-mail address.
text = """<gdg>
#GDG is a community
url <https://www.gdgcarthage.org/>,
email <gdg.carthage@gmail.com>
"""
def clean_text(text):
    """Strip markup characters, URLs and e-mail addresses from ``text``.

    The steps run in a fixed order:

    1. delete the characters ``< , # * ? >`` wherever they occur (this drops
       the angle brackets around tags/links but keeps what is between them);
    2. delete URLs starting with ``http://``, ``https://`` or ``www.``;
    3. delete e-mail addresses.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Whitespace around removed items is left as is.
    """
    # Character-level strip, not real HTML parsing: "<gdg>" becomes "gdg".
    markup_chars = re.compile(r'[<,#*?>]')
    text = markup_chars.sub('', text)
    # Remove urls. The "www." branch needs "\S+" (a run of non-space
    # characters); a bare "S+" only matches literal capital S characters.
    url = re.compile(r'https?://\S+|www\.\S+')
    text = url.sub('', text)
    # Remove email ids. The local part may contain any digit as well as
    # dots, "+", "-" and "_"; the dots in the domain are escaped so they
    # only match a literal ".".
    email = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
    text = email.sub('', text)
    return text
# Run the cleaner on the sample text and show the result.
print(clean_text(text))

gdg
GDG is a community
url 
email gdg.



###  Text Preprocessing

In [None]:
import nltk
# Download the NLTK data packages used by this notebook (needs network access).
# The value of the last call is what the cell displays.
nltk.download('punkt')  # tokenizer models
nltk.download('stopwords')  # stop-word corpus
nltk.download('wordnet')  # WordNet data, presumably for WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger model
nltk.download('maxent_ne_chunker')  # named-entity chunker model
nltk.download('words')  # word-list corpus, presumably needed by the chunker

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

#### Tokenization, stop-word removal, stemming, lemmatization, POS tagging and NER

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import string

# sample text to be preprocessed
text = """GDG Carthage is an independent group; our activities and the opinions expressed
          here should in no way be linked to Google,
          the corporation. To learn more about the GDG program,
          visit https://developers.google.com/community/gdg/"""

# tokenize the text; the original casing is kept for the taggers below
tokens = word_tokenize(text)

# perform part-of-speech (POS) tagging and named entity recognition (NER)
# on the raw tokens. Tagging the lowercased, stop-word-stripped list instead
# produced tags such as google/JJ and left the chunker no capitalization to
# recognise names like "GDG Carthage" or "Google".
pos_tags = pos_tag(tokens)
named_entities = ne_chunk(pos_tags)

# remove stop words (compared case-insensitively)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# perform stemming and lemmatization; only the lemmas feed the steps below,
# the stems are kept so both normalizations can be inspected side by side
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

# remove digits and punctuation
cleaned_tokens = [token for token in lemmatized_tokens
                  if not token.isdigit() and token not in string.punctuation]

# convert all tokens to lowercase
lowercase_tokens = [token.lower() for token in cleaned_tokens]

# print the preprocessed text
print("Original text:", text)
print("Preprocessed tokens:", lowercase_tokens)
print("POS tags:", pos_tags)
print("Named entities:", named_entities)


Original text: GDG Carthage is an independent group; our activities and the opinions expressed
          here should in no way be linked to Google,
          the corporation. To learn more about the GDG program,
          visit https://developers.google.com/community/gdg/
Preprocessed tokens: ['gdg', 'carthage', 'independent', 'group', 'activity', 'opinion', 'expressed', 'way', 'linked', 'google', 'corporation', 'learn', 'gdg', 'program', 'visit', 'http', '//developers.google.com/community/gdg/']
POS tags: [('gdg', 'JJ'), ('carthage', 'NN'), ('independent', 'JJ'), ('group', 'NN'), ('activity', 'NN'), ('opinion', 'NN'), ('expressed', 'VBD'), ('way', 'NN'), ('linked', 'VBN'), ('google', 'JJ'), ('corporation', 'NN'), ('learn', 'NN'), ('gdg', 'NN'), ('program', 'NN'), ('visit', 'NN'), ('http', 'NN'), ('//developers.google.com/community/gdg/', 'NN')]
Named entities: (S
  gdg/JJ
  carthage/NN
  independent/JJ
  group/NN
  activity/NN
  opinion/NN
  expressed/VBD
  way/NN
  linked/VBN
  googl

### Feature *Engineering*

#### One Hot Encoder

In [None]:
import nltk
# nltk.download('punkt') # Download 'punkt'
# from nltk if it's not downloaded
from nltk.tokenize import sent_tokenize
Text = """Google developer group.
		google Learning Together.
		today NLP workshop.
		Learning NLP techniques"""

# Split into sentences, then lowercase each one and drop its full stops.
sentences = [sent.lower().replace(".", "") for sent in sent_tokenize(Text)]
print('Tokenized Sentences :', sentences)

# Create the vocabulary: each new word gets the next 1-based index,
# in order of first appearance.
vocab = {}
for sent in sentences:
	for word in sent.split():
		vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct words seen
print('vocabulary :', vocab)

# One Hot Encoding
def OneHotEncoder(text, vocabulary=None):
    """One-hot encode every known word of ``text``.

    Args:
        text: Whitespace-separated words to encode.
        vocabulary: Mapping of word -> 1-based index. When omitted, the
            notebook-level ``vocab`` dict is used, so existing one-argument
            calls keep working. Passing it explicitly makes the function
            independent of whatever ``vocab`` is bound to when it is called.

    Returns:
        A list with one vector per in-vocabulary word, in order of
        appearance. Each vector has ``len(vocabulary)`` entries, all 0 except
        a single 1 at position ``index - 1``. Words that are not in the
        vocabulary are skipped.
    """
    if vocabulary is None:
        vocabulary = vocab  # resolved at call time, as in the original
    size = len(vocabulary)
    onehot_encoded = []
    for word in text.split():
        index = vocabulary.get(word)
        if index is None:
            continue
        vector = [0] * size
        vector[index - 1] = 1
        onehot_encoded.append(vector)
    return onehot_encoded


# Show the one-hot vectors for the first sentence of the corpus.
first_sentence = sentences[0]
print('OneHotEncoded vector for sentence : "',
      first_sentence, '"is \n', OneHotEncoder(first_sentence))


Tokenized Sentences : ['google developer group', 'google learning together', 'today nlp workshop', 'learning nlp techniques']
vocabulary : {'google': 1, 'developer': 2, 'group': 3, 'learning': 4, 'together': 5, 'today': 6, 'nlp': 7, 'workshop': 8, 'techniques': 9}
OneHotEncoded vector for sentence : " google developer group "is 
 [[1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]]


#### Bag of Words (BoW)

In [None]:
import nltk
#nltk.download('punkt') # Download 'punkt' from nltk if it's not downloaded
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
# Define the corpus in this cell (same text as the other feature-engineering
# cells) so it also runs on a fresh kernel instead of relying on a `Text`
# variable left behind by an earlier cell.
Text = """Google developer group.
		google Learning Together.
		today NLP workshop.
		Learning NLP techniques"""
# TOKENIZATION: split into sentences, lowercase them and drop the full stops
sentences = sent_tokenize(Text)
sentences = [sent.lower().replace(".","") for sent in sentences]
print('Our Corpus:',sentences)
#CountVectorizer : Convert a collection of text documents to a matrix of token counts.
count_vect = CountVectorizer()
# fit & transform will represent each sentences as BOW representation
BOW = count_vect.fit_transform(sentences)
# Get the vocabulary
print("Our vocabulary: ", count_vect.vocabulary_)
#see the BOW representation of the first three sentences
for idx in range(3):
    print(f"BoW representation for {sentences[idx]} {BOW[idx].toarray()}")
# BOW representation for a new text; transform() reuses the fitted vocabulary
BOW_ = count_vect.transform(["learning what is nlp "])
print("Bow representation for 'learning what is nlp ':", BOW_.toarray())


Our Corpus: ['google developer group', 'google learning together', 'today nlp workshop', 'learning nlp techniques']
Our vocabulary:  {'google': 1, 'developer': 0, 'group': 2, 'learning': 3, 'together': 7, 'today': 6, 'nlp': 4, 'workshop': 8, 'techniques': 5}
BoW representation for google developer group [[1 1 1 0 0 0 0 0 0]]
BoW representation for google learning together [[0 1 0 1 0 0 0 1 0]]
BoW representation for today nlp workshop [[0 0 0 0 1 0 1 0 1]]
Bow representation for 'learning what is nlp ': [[0 0 0 1 1 0 0 0 0]]


#### Bag of n-grams

In [None]:
import nltk
# nltk.download('punkt') # Download 'punkt'
# from nltk if it's not downloaded
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

Text = """Google developer group.
		google Learning Together.
		today NLP workshop.
		Learning NLP techniques"""

# TOKENIZATION: sentence split, then lowercase and strip the full stops
sentences = [sent.lower().replace(".", "") for sent in sent_tokenize(Text)]
print('Our Corpus:', sentences)

# Count vectorizer configured for unigrams, bigrams and trigrams
count_vect = CountVectorizer(ngram_range=(1, 3))

# Fit on the corpus and encode every sentence as a bag of n-grams
BOW_nGram = count_vect.fit_transform(sentences)

# Get the vocabulary
print("Our vocabulary:\n", count_vect.vocabulary_)

# Bag of n-grams representation of the first three sentences
for row_idx in range(3):
	print('Ngram representation for "{}" is {}'
		.format(sentences[row_idx], BOW_nGram[row_idx].toarray()))

# Bag of n-grams representation for a new text, using the fitted vocabulary
new_documents = ["learning dsa from geeksforgeeks together"]
BOW_nGram_ = count_vect.transform(new_documents)
print("Ngram representation for 'learning dsa from geeksforgeeks together' is",
	BOW_nGram_.toarray())


Our Corpus: ['google developer group', 'google learning together', 'today nlp workshop', 'learning nlp techniques']
Our vocabulary:
 {'google': 2, 'developer': 0, 'group': 7, 'google developer': 3, 'developer group': 1, 'google developer group': 4, 'learning': 8, 'together': 19, 'google learning': 5, 'learning together': 11, 'google learning together': 6, 'today': 16, 'nlp': 12, 'workshop': 20, 'today nlp': 17, 'nlp workshop': 14, 'today nlp workshop': 18, 'techniques': 15, 'learning nlp': 9, 'nlp techniques': 13, 'learning nlp techniques': 10}
Ngram representation for "google developer group" is [[1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Ngram representation for "google learning together" is [[0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0]]
Ngram representation for "today nlp workshop" is [[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 1]]
Ngram representation for 'learning dsa from geeksforgeeks together' is [[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0]]


#### TF-IDF

In [None]:
import nltk
# nltk.download('punkt') # Download 'punkt'
# from nltk if it's not downloaded
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

Text = """Google developer group.
		google Learning Together.
		today NLP workshop.
		Learning NLP techniques"""

# TOKENIZATION: sentence split, then lowercase and strip the full stops
sentences = [sent.lower().replace(".", "") for sent in sent_tokenize(Text)]
print('Our Corpus:', sentences)

# TF-IDF: fit the vectorizer on the corpus and encode every sentence
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sentences)

# All words in the vocabulary.
print("vocabulary", tfidf.get_feature_names_out())
# IDF value for all words in the vocabulary
print("IDF for all words in the vocabulary :\n", tfidf.idf_)

# TFIDF representation of the first three documents; only the first line is
# preceded by a blank line.
print('\nTFIDF representation for "{}" is \n{}'
	.format(sentences[0], tfidf_matrix[0].toarray()))
for doc_idx in (1, 2):
	print('TFIDF representation for "{}" is \n{}'
		.format(sentences[doc_idx], tfidf_matrix[doc_idx].toarray()))

# TFIDF representation for a new text, using the fitted vocabulary
new_documents = ["learning dsa from geeksforgeeks"]
matrix = tfidf.transform(new_documents)
print("\nTFIDF representation for 'learning dsa from geeksforgeeks' is\n",
	matrix.toarray())


Our Corpus: ['google developer group', 'google learning together', 'today nlp workshop', 'learning nlp techniques']
vocabulary ['developer' 'google' 'group' 'learning' 'nlp' 'techniques' 'today'
 'together' 'workshop']
IDF for all words in the vocabulary :
 [1.91629073 1.51082562 1.91629073 1.51082562 1.51082562 1.91629073
 1.91629073 1.91629073 1.91629073]

TFIDF representation for "google developer group" is 
[[0.61761437 0.48693426 0.61761437 0.         0.         0.
  0.         0.         0.        ]]
TFIDF representation for "google learning together" is 
[[0.         0.52640543 0.         0.52640543 0.         0.
  0.         0.66767854 0.        ]]
TFIDF representation for "today nlp workshop" is 
[[0.         0.         0.         0.         0.48693426 0.
  0.61761437 0.         0.61761437]]

TFIDF representation for 'learning dsa from geeksforgeeks' is
 [[0. 0. 0. 1. 0. 0. 0. 0. 0.]]


#### Neural Approach (Word embedding)

### Pre-Trained Word Embeddings

##### Word2vec by Google

In [None]:
import gensim.downloader as api

# Fetch the pre-trained Word2Vec vectors through the gensim downloader
model = api.load('word2vec-google-news-300')

# Word pairs whose similarity is reported below
word_pairs = [
	('learn', 'learning'),
	('india', 'indian'),
	('fame', 'famous'),
]

# Report the model's similarity score for every pair
for pair in word_pairs:
	similarity = model.similarity(*pair)
	print(f"Similarity between '{pair[0]}' and '{pair[1]}' using Word2Vec: {similarity:.3f}")


Similarity between 'learn' and 'learning' using Word2Vec: 0.637
Similarity between 'india' and 'indian' using Word2Vec: 0.697
Similarity between 'fame' and 'famous' using Word2Vec: 0.326


##### GloVe by Stanford

In [None]:
# Reinstall torchtext (and install torchdata) before the GloVe example imports it.
# NOTE(review): no versions are pinned. The torchtext import further down is
# recorded as failing with an "undefined symbol" OSError from libtorchtext.so,
# which looks like a torchtext build that does not match the installed torch.
# TODO confirm, and pin a matching torch/torchtext pair.
!pip uninstall torchtext -y
!pip install torchdata --extra-index-url https://download.pytorch.org/whl/cu118
!pip install torchtext -q

Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
Collecting torchdata
  Downloading torchdata-0.8.0-cp310-cp310-manylinux1_x86_64.whl.metadata (5.4 kB)
Downloading torchdata-0.8.0-cp310-cp310-manylinux1_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata
Successfully installed torchdata-0.8.0


In [None]:
import torch
# Import the class by name: `import torchtext.vocab as vocab` would rebind the
# notebook-level `vocab` dict that OneHotEncoder reads.
from torchtext.vocab import GloVe

# load the pre-trained GloVe model
glove = GloVe(name='840B', dim=300)

# define word pairs to compute similarity for
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# compute similarity for each pair of words
for pair in word_pairs:
	vec1, vec2 = glove[pair[0]], glove[pair[1]]
	# Cosine similarity of the two 1-D vectors. The library call bounds the
	# denominator with a small eps, so an all-zero vector gives 0 instead of
	# the NaN produced by dividing by a zero norm.
	similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
	print(f"Similarity between '{pair[0]}' and '{pair[1]}' using GloVe: {similarity:.3f}")


OSError: /usr/local/lib/python3.10/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

##### fastText by Facebook

In [None]:
import gensim.downloader as api

# load the pre-trained fastText model
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# define word pairs to compute similarity for
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# compute similarity for each pair of words
for pair in word_pairs:
	similarity = fasttext_model.similarity(pair[0], pair[1])
	# The label names the model actually queried (it previously said Word2Vec).
	print(f"Similarity between '{pair[0]}' and '{pair[1]}' using fastText: {similarity:.3f}")
