## NLP

In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk

# Download the NLTK data packages that the tokenizer and the stopword list
# below depend on.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

# Small two-sentence sample corpus used for the frequency count.
text_corpus = """Natural Language Processing (NLP) is a fascinating fiels of Artifical Intelligence (AI) that focuses on the interaction between computers and human language.
NLP techniques enable machines to understand and process human language."""

# Lowercase first so that counting is case-insensitive, then split into tokens.
lowercased_corpus = text_corpus.lower()
tokens = word_tokenize(lowercased_corpus)

# Keep only purely alphanumeric tokens (this drops punctuation) that are not
# in the English stopword list.
stop_words = set(stopwords.words('english'))
cleaned_tokens = [
    token
    for token in tokens
    if token.isalnum() and token not in stop_words
]

# Count how often each remaining token occurs.
word_frequencies = Counter(cleaned_tokens)

# Report the most frequent tokens, one "token: count" pair per line.
TOP_N = 10
for word, frequency in word_frequencies.most_common(TOP_N):
    print(f"{word}: {frequency}")

language: 3
nlp: 2
human: 2
natural: 1
processing: 1
fascinating: 1
fiels: 1
artifical: 1
intelligence: 1
ai: 1


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Attention mechanism

In [None]:
import numpy as np

def softmax(x, axis=-1):
    """Return the softmax of `x` along `axis`, computed in a numerically stable way.

    The maximum along `axis` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps `np.exp` from overflowing
    to `inf` (and the ratio from becoming `nan`) when the scores are large.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the weights are normalised to sum to 1 (default: last).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `x` whose entries along `axis` sum to 1.
    """
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)


# Step 1: Define Query (Q), Key (K), and Value (V) matrices
Q = np.array([[1, 0, 1]]) # Query vector (1x3)
K = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 1]]) # Key vectors (3x3)
V = np.array([[1, 2],
              [0, 3],
              [1, 1]]) # Value vectors (3x2)

# Step 2: Compute dot product between Q and K^T (similarity scores)
scores = np.dot(Q, K.T)

# Step 3: Scale the scores (optional but common)
dk = K.shape[1] # dimension of key vectors (equal to the query dimension)
scaled_scores = scores / np.sqrt(dk)

# Step 4: Compute softmax over the keys to get attention weights
attention_weights = softmax(scaled_scores, axis=1)

# Step 5: Multiply attention weights with values matrix to get output
output = np.dot(attention_weights, V)

# Display results
print("Query (Q):")
print(Q)
print("\nKey Vectors (K):")
print(K)
print("\nValue Vectors (V):")
print(V)
print("\nAttention Weights:")
print(attention_weights)
print("\nScaled Scores:")
print(scaled_scores)
print("\nOutput:")
print(output)

Query (Q):
[[1 0 1]]

Key Vectors (K):
[[1 1 0]
 [1 1 0]
 [0 0 1]]

Value Vectors (V):
[[1 2]
 [0 3]
 [1 1]]

Attention Weights:
[[0.33333333 0.33333333 0.33333333]]

Scaled Scores:
[[0.57735027 0.57735027 0.57735027]]

Output:
[[0.66666667 2.        ]]


## BERT fine-tuning

In [None]:
!pip install datasets==2.18.0



In [None]:
# Load the dataset

from datasets import load_dataset

# Identifier passed to `load_dataset`; kept in one place so it is easy to swap.
DATASET_NAME = "imdb"
dataset = load_dataset(DATASET_NAME)

In [None]:
# Load pretrained BERT and Tokenizer

from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer and model are loaded from the same checkpoint name.
CHECKPOINT = "bert-base-uncased"
NUM_LABELS = 2  # size of the classification head

tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=NUM_LABELS,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the Dataset
def tokenize_function(example):
    """Tokenize the "text" field of `example` with the notebook's `tokenizer`.

    The tokenizer is called with max-length padding and truncation enabled.
    Because the function is mapped with `batched=True` below, `example` is
    expected to be a batch rather than a single row.

    Returns whatever the tokenizer call returns for those texts.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply the tokenizer to the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Prepare for training

from transformers import TrainingArguments, Trainer

# Training configuration: checkpoints go to ./results and evaluation is
# requested once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Work on small subsets to keep the run short: shuffle with a fixed seed, then
# take the first 2000 training and 500 test examples.
train_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
eval_subset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Fine-tune the model; the value returned by train() is the cell's output.
trainer.train()

In [None]:
# Evaluate the model
# Runs evaluation with the Trainer built in the training cell. The call is the
# cell's last expression, so the returned metrics dictionary (eval_loss,
# runtime and throughput figures, epoch) is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.39584383368492126,
 'eval_runtime': 662.6447,
 'eval_samples_per_second': 0.755,
 'eval_steps_per_second': 0.095,
 'epoch': 3.0}

In [None]:
# Make Predictions
import torch

text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# The freshly tokenized tensors may live on a different device than the model
# (for example after training on a GPU), which would make the forward pass
# fail, so move every input tensor to the model's device first.
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Inference only: switch off training-time behaviour such as dropout and skip
# gradient tracking, which is not needed for a prediction.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Index of the highest logit for the single input sentence.
predicted_class = logits.argmax(dim=-1).item()
print(predicted_class)

1
