## Import necessary libraries

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Define sample text

In [None]:
# Sample sentence; the tokenization cells below read it through `text`.
text = "The quick brown fox jumps over the lazy dog."

## Tokenize text using split by space

In [None]:
# Naive tokenization: cut the sentence at every single space character.
# Punctuation is not separated, so the last token keeps its trailing period.
tokens = str.split(text, " ")
print("Tokens:", tokens)

## Tokenize text using regular expression

In [None]:
# Regex tokenization: keep only runs of word characters (\w+), so
# punctuation is dropped instead of staying attached to a word.
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

## Perform stemming on tokens

In [None]:
# Reduce every token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))
print("Stemmed Tokens:", stemmed_tokens)

## Perform lemmatization on tokens

In [None]:
# Fetch the WordNet corpus through NLTK's downloader; the WordNetLemmatizer
# cells below presumably rely on it being installed.
nltk.download('wordnet')

In [None]:
# Fetch the 'omw-1.4' resource (Open Multilingual Wordnet).
# NOTE(review): presumably required next to 'wordnet' by the lemmatizer on
# some NLTK versions — confirm against the installed release.
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer
  
# Single lemmatizer instance, reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [None]:
# Map each token to its WordNet lemma. No part-of-speech argument is passed,
# so lemmatize() runs with its default POS for every token.
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print("lemmatized Tokens:", lemmatized_tokens)

## Remove stop words from tokens

In [None]:
# Download NLTK's stop-word lists (read via `stopwords.words(...)` below).
nltk.download('stopwords')

In [None]:
# Drop English stop words from the token list.
# NLTK's English stop-word list is all lowercase, so each token is lowercased
# for the membership test only; otherwise capitalised stop words such as
# "The" would slip through. Surviving tokens keep their original casing.
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print("Filtered Tokens:", filtered_tokens)

## Perform part-of-speech tagging on tokens

In [None]:
# The averaged perceptron tagger is the part-of-speech model used by
# nltk's pos_tag to assign a tag to each word in a sentence.
# Newer NLTK releases look the English model up under the id
# 'averaged_perceptron_tagger_eng', older ones under
# 'averaged_perceptron_tagger'. Request both so the tagging cell works on
# either; an id the downloader cannot find is reported, not raised.
for tagger_resource in ('averaged_perceptron_tagger',
                        'averaged_perceptron_tagger_eng'):
    nltk.download(tagger_resource)

In [None]:
# Tag every token with its part of speech; pos_tag returns (token, tag) pairs.
tagged_tokens = pos_tag(tokens)
print("Tagged Tokens:", tagged_tokens)

## Embedding

In [None]:
!pip install -U scikit-learn

In [None]:
import gensim
import gensim.downloader as api
import numpy as np

# Show the names of the pre-trained models listed by gensim's downloader.
api.info()['models'].keys()

In [None]:
# Load the pre-trained 'glove-twitter-25' embeddings through gensim's
# downloader API (presumably fetched on first use and cached — confirm).
model = api.load('glove-twitter-25')

In [None]:
# Look up the embedding vector of each probe word.
# model[word] raises KeyError for a word that is not in the model's
# vocabulary, so out-of-vocabulary candidates are filtered out first. This
# keeps `words` and `vectors` index-aligned for the plotting cells.
# NOTE(review): "wold" may be a typo for "world" — confirm with the author.
candidate_words = ["new-york", "paris", "dog", "cat", "tapas", "pizza", "science", "book",
                   "maths", "theory", "night", "day", "hour", "word", "sentence", "wold", "above", "under",
                   "europe", "africa", "france", "italy", "india", "happy", "sad"]
words = [word for word in candidate_words if word in model]
skipped_words = [word for word in candidate_words if word not in model]
if skipped_words:
    print("Skipped out-of-vocabulary words:", skipped_words)
vectors = [model[word] for word in words]

In [None]:
######## Can you check the vector dimension? ############

In [None]:
# scikit-learn provides sklearn.manifold.TSNE, imported in a later cell.
!pip install -U scikit-learn

In [None]:
import numpy as np

In [None]:
# Visualize the vectors using matplotlib
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the embeddings down to 2-D with t-SNE for plotting.
# scikit-learn requires perplexity to be smaller than the number of samples,
# so the requested value (20) is capped by the number of vectors.
# random_state pins the otherwise stochastic layout so the figure is
# reproducible between runs.
embedding_matrix = np.array(vectors)
tsne = TSNE(
    n_components=2,
    perplexity=min(20, max(1, len(embedding_matrix) - 1)),
    random_state=42,
)
vectors_2d = tsne.fit_transform(embedding_matrix)

In [None]:
# Scatter the 2-D projections and label each point with its word.
fig, ax = plt.subplots(figsize=(15, 8))
xs, ys = vectors_2d[:, 0], vectors_2d[:, 1]
ax.scatter(xs, ys)
for idx, label in enumerate(words):
    ax.annotate(label, (xs[idx], ys[idx]), fontsize=15)
plt.show()