<a href="https://colab.research.google.com/github/LeibGit/-DI_Bootcamp/blob/main/exercisexp_week8_day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Exercise 1***

In [None]:
# add all imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
import spacy

# Download the NLTK resources used in this notebook.
nltk.download("averaged_perceptron_tagger_eng")  # POS-tagger model (needed by nltk.pos_tag)
nltk.download("stopwords")  # stop-word lists read via stopwords.words(...)
nltk.download("punkt_tab")  # Punkt tokenizer data used by word_tokenize

In [None]:
# Initialize the sample data: ten short restaurant reviews stored under the
# "Review" key, used as the input corpus for the exercises below.
data = {
    "Review": [
        "At McDonald's the food was ok and the service was bad.",
        "I would not recommend this Japanese restaurant to anyone.",
        "I loved this restaurant when I traveled to Thailand last summer.",
        "The menu of Loving has a wide variety of options.",
        "The staff was friendly and helpful at Google's employees restaurant.",
        "The ambiance at Bella Italia is amazing, and the pasta dishes are delicious.",
        "I had a terrible experience at Pizza Hut. The pizza was burnt, and the service was slow.",
        "The sushi at Sushi Express is always fresh and flavorful.",
        "The steakhouse on Main Street has a cozy atmosphere and excellent steaks.",
        "The dessert selection at Sweet Treats is to die for!",
    ]
}

In [None]:
class NlpPipeline:
  """Preprocess a single piece of text and run spaCy analyses on the result.

  Preprocessing lowercases the text, removes punctuation and NLTK English stop
  words, and parses the remaining words with spaCy's ``en_core_web_sm`` model.
  The parsed ``Doc`` is cached on ``lemmatized_text`` and reused by the NER and
  POS-tagging steps.

  Attributes:
    text: The raw input text.
    nlp: The spaCy pipeline (one model shared by all instances).
    lemmatized_text: The spaCy ``Doc`` produced by ``preprocess_text``, or
      ``None`` until preprocessing has run. Despite the name it holds the
      parsed Doc, not a lemma string; lemmas are available per token via
      ``token.lemma_``.
  """

  # spaCy pipeline shared by every instance so the model is loaded once instead
  # of once per review (and again on every preprocess_text call).
  _shared_nlp = None

  def __init__(self,text: str):
    self.text = text
    self.nlp = self._load_nlp()
    self.lemmatized_text = None

  @classmethod
  def _load_nlp(cls):
    """Return the shared spaCy pipeline, loading the model on first use."""
    if cls._shared_nlp is None:
      cls._shared_nlp = spacy.load("en_core_web_sm")
    return cls._shared_nlp

  def _ensure_preprocessed(self):
    """Return the cached Doc, running ``preprocess_text`` first if needed."""
    # Compare with None explicitly: an empty Doc is falsy but still valid.
    if self.lemmatized_text is None:
      self.preprocess_text()
    return self.lemmatized_text

  def preprocess_text(self):
    """Clean ``self.text`` and parse it with spaCy.

    Returns:
      The spaCy ``Doc`` built from the lowercased text with sentence
      punctuation, punctuation-only tokens and English stop words removed.
      The same Doc is stored on ``self.lemmatized_text``.
    """
    # Lowercase and strip sentence-ending punctuation before tokenizing.
    lowered = self.text.lower().replace("?", "").replace(".", "").replace("!", "")
    tokens = word_tokenize(lowered)
    stop_words = set(stopwords.words('english'))

    # Drop stop words and tokens with no letters or digits (e.g. the ","
    # tokens that the replacements above do not remove).
    filtered_words = [
        word for word in tokens
        if word not in stop_words and any(ch.isalnum() for ch in word)
    ]

    # Re-join into a string because the spaCy pipeline parses raw text.
    self.lemmatized_text = self.nlp(" ".join(filtered_words))
    return self.lemmatized_text

  def show_preprocessed_res(self):
    """Run the preprocessing and return the resulting spaCy ``Doc``."""
    return self.preprocess_text()

  def create_new_data(self):
    """Build, print and return a dataset holding the cleaned review.

    Returns:
      A dict with a single ``"Review"`` key mapped to the preprocessed Doc.
    """
    # Store the preprocessing *result*; the bound method itself is not data.
    cleaned_data = {"Review": self._ensure_preprocessed()}
    print(cleaned_data)
    return cleaned_data

  def perform_ner(self):
    """Print and return the named entities found in the preprocessed text.

    Returns:
      A list of ``(entity_text, entity_label)`` tuples, one per entity.
    """
    # The Doc was already produced by the spaCy pipeline, so its entities can
    # be read directly instead of passing the Doc through the pipeline again.
    # NOTE(review): the Doc is built from lowercased text, which presumably
    # hurts entity recognition; consider running NER on self.text. Confirm
    # against the exercise requirements.
    doc = self._ensure_preprocessed()

    entities = []
    for ent in doc.ents:
      print(f"Text: {ent.text} | Label: {ent.label_}")
      entities.append((ent.text, ent.label_))
    return entities

  def perform_pos_tagging(self):
    """Return the part-of-speech tag of every token in the preprocessed text.

    Returns:
      A list of ``(token_text, pos_tag)`` tuples in token order.
    """
    doc = self._ensure_preprocessed()
    return [(token.text, token.pos_) for token in doc]

In [None]:
# Run every pipeline stage on each review and show the results.
for sentence in data["Review"]:
  new_instance = NlpPipeline(sentence)
  # show_preprocessed_res() runs the preprocessing itself, so a separate
  # preprocess_text() call beforehand would only repeat the same work.
  print(new_instance.show_preprocessed_res())
  # These two methods print their own output, so their return values are not
  # printed a second time here.
  new_instance.create_new_data()
  new_instance.perform_ner()
  print(new_instance.perform_pos_tagging())

***Exercise 2***

In [None]:
!pip install gensim

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK tokenizer and stop-word resources for this exercise.
# NOTE(review): "stopwords" and "punkt_tab" are also downloaded in the
# Exercise 1 setup cell; repeating the call here is redundant but harmless.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab")

In [None]:
# Build the training corpus from every review instead of only the last one left
# behind in the loop variable of the previous exercise. gensim expects each
# sentence as a list of string tokens, so each spaCy Doc is converted into the
# texts of its tokens.
preprocessed_sentences = [
    [token.text for token in NlpPipeline(review).preprocess_text()]
    for review in data["Review"]
]

# A fixed seed plus a single worker thread keeps this training run repeatable.
model = Word2Vec(
    sentences=preprocessed_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    sg=1,
    seed=42,
)
print(model)
print(model.wv.vector_size)
# vector_size is 100: every word is embedded as a 100-dimensional vector

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

def plot_word_embeddings(model_wv):
  """Scatter-plot every word vector after projecting it to 2D with PCA.

  Args:
    model_wv: Keyed word vectors (for example ``model.wv`` of a trained
      Word2Vec model) exposing ``key_to_index`` and lookup by a list of keys.
  """
  # Collect the whole vocabulary together with its vectors.
  vocabulary = list(model_wv.key_to_index)
  embeddings = model_wv[vocabulary]

  # Project the vectors onto two principal components for plotting.
  projected = PCA(n_components=2).fit_transform(embeddings)
  xs, ys = projected[:, 0], projected[:, 1]

  _fig, ax = plt.subplots(figsize=(12, 10))
  ax.scatter(xs, ys)

  # Label every point with the word it represents.
  for label, x, y in zip(vocabulary, xs, ys):
    ax.annotate(label, xy=(x, y), xytext=(5, 2),
                textcoords='offset points', ha='right', va='bottom')

  ax.set_title("Word Embeddings (2D PCA)")
  ax.set_xlabel("PCA Component 1")
  ax.set_ylabel("PCA Component 2")
  ax.grid(True)
  plt.show()

# Plot the word vectors (model.wv) of the Word2Vec model trained in the
# previous cell; `model` must already exist when this cell runs.
plot_word_embeddings(model.wv)