<a href="https://colab.research.google.com/github/Hamsinikatla/NeuralNetwork_HW4/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#Q1: NLP Preprocessing Pipeline

# Step 1: Install NLTK
!pip install -q nltk

# Step 2: Import what we need
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Step 3: Download only stopwords (no punkt!)
nltk.download('stopwords')

# Step 4: Simple tokenizer using regex
def simple_tokenize(text):
    """Split ``text`` into word tokens with a regular expression.

    Each token is a maximal run of regex word characters; punctuation and
    whitespace between the runs are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings, in the order they occur in ``text``.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [found.group(0) for found in word_pattern.finditer(text)]

# Step 5: NLP pipeline
def nlp_preprocessing_pipeline(sentence):
    """Tokenize, stopword-filter, and stem ``sentence``, printing each stage.

    The three intermediate results are written to stdout in order: the raw
    tokens, the tokens left after removing English stopwords, and the
    Porter-stemmed form of those remaining tokens.

    Args:
        sentence: The text to preprocess.

    Returns:
        None. The results are only printed.
    """
    # Stage 1: regex-based tokenization.
    words = simple_tokenize(sentence)
    print("Original Tokens:", words)

    # Stage 2: drop English stopwords. Membership is tested on the lowercased
    # token, but the token is kept with its original casing.
    english_stopwords = set(stopwords.words('english'))
    kept_words = []
    for token in words:
        if token.lower() in english_stopwords:
            continue
        kept_words.append(token)
    print("Tokens Without Stopwords:", kept_words)

    # Stage 3: reduce every remaining token to its Porter stem.
    porter = PorterStemmer()
    stems = list(map(porter.stem, kept_words))
    print("Stemmed Words:", stems)

# Step 6: Run the pipeline on a sample sentence.
# The function returns nothing; it prints the token list after each stage.
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
nlp_preprocessing_pipeline(sentence)


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
#Q2: Named Entity Recognition with SpaCy

# spaCy provides the pretrained pipeline used for entity recognition
import spacy

# Load the `en_core_web_sm` pipeline
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."

# Run the pipeline; the resulting document exposes its entities as `doc.ents`
doc = nlp(sentence)

# Report every named entity: its text, its label, and its character offsets
# within the sentence
for ent in doc.ents:
    print(
        f"Entity: {ent.text}",
        f"Label: {ent.label_}",
        f"Start: {ent.start_char}",
        f"End: {ent.end_char}",
        sep=", ",
    )

Entity: Barack Obama, Label: PERSON, Start: 0, End: 12
Entity: 44th, Label: ORDINAL, Start: 27, End: 31
Entity: the United States, Label: GPE, Start: 45, End: 62
Entity: the Nobel Peace Prize, Label: WORK_OF_ART, Start: 71, End: 92
Entity: 2009, Label: DATE, Start: 96, End: 100


In [21]:
#Q3: Scaled Dot-Product Attention

import numpy as np

# Define the scaled dot-product attention function
def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention: softmax(Q Kᵀ / sqrt(d_k)) V.

    The softmax is evaluated in a numerically stable way (the row-wise maximum
    is subtracted before exponentiating), so large logits no longer overflow
    to ``inf`` and produce ``nan`` weights. Inputs may carry leading batch
    dimensions; for plain 2-D inputs the result is the same as before.

    Args:
        Q: Queries, array-like of shape ``(..., n_queries, d_k)``.
        K: Keys, array-like of shape ``(..., n_keys, d_k)``.
        V: Values, array-like of shape ``(..., n_keys, d_v)``.

    Returns:
        A tuple ``(attention_weights, output)`` where ``attention_weights``
        has shape ``(..., n_queries, n_keys)`` with every row summing to 1,
        and ``output`` has shape ``(..., n_queries, d_v)``.
    """
    # Accept lists / nested sequences as well as ndarrays.
    Q = np.asarray(Q)
    K = np.asarray(K)
    V = np.asarray(V)

    # Step 1: Dot product of Q with Kᵀ. Only the last two axes of K are
    # transposed so that any leading batch axes are preserved.
    matmul_qk = np.matmul(Q, np.swapaxes(K, -1, -2))

    # Step 2: Scale by the square root of the key dimension.
    d_k = K.shape[-1]  # key dimension (last dimension of K)
    scaled_attention_logits = matmul_qk / np.sqrt(d_k)

    # Step 3: Softmax over the key axis. Subtracting the per-row maximum does
    # not change the result mathematically but keeps np.exp from overflowing.
    shifted_logits = scaled_attention_logits - np.max(
        scaled_attention_logits, axis=-1, keepdims=True
    )
    exp_logits = np.exp(shifted_logits)
    attention_weights = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    # Step 4: Weighted sum of the values.
    output = np.matmul(attention_weights, V)

    return attention_weights, output

# Test input matrices: two query rows and two key rows of length 4, plus two
# value rows of length 4. Q and K are identical here.
Q = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
V = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Call the function; it returns the attention weights first, then the output
attention_weights, output = scaled_dot_product_attention(Q, K, V)

# Bare tuple as the last expression so the notebook displays both arrays
attention_weights, output


(array([[0.73105858, 0.26894142],
        [0.26894142, 0.73105858]]),
 array([[2.07576569, 3.07576569, 4.07576569, 5.07576569],
        [3.92423431, 4.92423431, 5.92423431, 6.92423431]]))

In [31]:
#Q4: Sentiment Analysis using HuggingFace Transformers

# Step 1: Bring in the auto classes and the pipeline factory
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Step 2: Name the checkpoint once so the model and tokenizer are loaded from
# the same source
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Instantiate both pieces explicitly instead of leaving the choice to the
# pipeline
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Assemble the sentiment-analysis pipeline from those two objects
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Step 4: Text to classify
sentence = "Despite the high price, the performance of the new MacBook is outstanding."

# Step 5: Classify it; the pipeline returns a list of dictionaries, so take
# the first (and only) entry
result = sentiment_analyzer(sentence)[0]

# Step 6: Pull out the predicted label and its confidence score
label, confidence_score = result['label'], result['score']

# Step 7: Show both values, one per line
print(f"Sentiment: {label}", f"Confidence Score: {confidence_score:.4f}", sep="\n")


Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998
