<a href="https://colab.research.google.com/github/MapariPrajwal/NLP/blob/main/NLP_ComparativeAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import nltk
import spacy
from textblob import TextBlob

# Download NLTK data (if not already downloaded)
nltk.download('punkt')

# Sample text for testing
sample_text = "Natural language processing is a fascinating field with various applications."


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# NLTK Tokenization
start_time = time.time()
nltk_tokens = nltk.word_tokenize(sample_text)
nltk_time = time.time() - start_time

In [3]:
# spaCy Tokenization
nlp = spacy.load("en_core_web_sm")
start_time = time.time()
spacy_tokens = [token.text for token in nlp(sample_text)]
spacy_time = time.time() - start_time

In [4]:
# TextBlob Tokenization
blob = TextBlob(sample_text)
start_time = time.time()
textblob_tokens = blob.words
textblob_time = time.time() - start_time

In [5]:
# Print the results
print("NLTK Tokens:", nltk_tokens)
print("spaCy Tokens:", spacy_tokens)
print("TextBlob Tokens:", textblob_tokens)

NLTK Tokens: ['Natural', 'language', 'processing', 'is', 'a', 'fascinating', 'field', 'with', 'various', 'applications', '.']
spaCy Tokens: ['Natural', 'language', 'processing', 'is', 'a', 'fascinating', 'field', 'with', 'various', 'applications', '.']
TextBlob Tokens: ['Natural', 'language', 'processing', 'is', 'a', 'fascinating', 'field', 'with', 'various', 'applications']


In [6]:
print("\nExecution Time:")
print("NLTK: {:.5f} seconds".format(nltk_time))
print("spaCy: {:.5f} seconds".format(spacy_time))
print("TextBlob: {:.5f} seconds".format(textblob_time))


Execution Time:
NLTK: 0.02240 seconds
spaCy: 0.03089 seconds
TextBlob: 0.00063 seconds


### Transformers

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
sample_text = "Transformers library by Hugging Face makes natural language processing easy!"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize the input text
inputs = tokenizer(sample_text, return_tensors="pt")

# Perform text classification using the pre-trained BERT model
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=1).item()

In [9]:
# Print the predicted class
print("Predicted Class:", predictions)

# Alternatively, you can use the pipeline for text classification
classifier = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name)
result = classifier(sample_text)

Predicted Class: 0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Print the sentiment analysis result
print("Sentiment Analysis Result:", result)

Sentiment Analysis Result: [{'label': 'LABEL_1', 'score': 0.5256707668304443}]


## **Allen NLP**

In [13]:
!pip install allennlp



In [21]:
import pandas as pd
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.vocabulary import Vocabulary

class MyDatasetReader(DatasetReader):
    def __init__(self, tokenizer, token_indexers, max_tokens=None, **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers
        self.max_tokens = max_tokens

    def text_to_instance(self, text, label):
        tokens = self.tokenizer.tokenize(text)
        if self.max_tokens:
            tokens = tokens[:self.max_tokens]
        text_field = TextField(tokens, self.token_indexers)
        fields = {'anchor': text_field}
        if label:
            fields['target'] = LabelField(label)
        return Instance(fields)

    def _read(self, file_path):
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            yield self.text_to_instance(row['anchor'], row.get('target', None))

# Load pre-trained BERT tokenizer and indexers
tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
token_indexers = {"tokens": PretrainedTransformerIndexer(model_name="bert-base-uncased")}

# Create dataset reader
reader = MyDatasetReader(tokenizer=tokenizer, token_indexers=token_indexers)

# Read training and validation datasets
train_dataset = list(reader.read("/content/train.csv"))
validation_dataset = list(reader.read("/content/test.csv"))

# Build vocabulary
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)


building vocab: 100%|##########| 36509/36509 [00:00<00:00, 159976.02it/s]


In [22]:
import torch
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy

In [24]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.37.0-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl (330 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.1/330.1 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
  Attempting u

In [25]:
embedder = PretrainedTransformerEmbedder(model_name="bert-base-uncased")
encoder = PytorchSeq2VecWrapper(Seq2VecEncoder.by_name("bert")(bert_model_name="bert-base-uncased", use_extension=True))
model = SimpleClassifier(vocab=vocab, embedder=embedder, encoder=encoder, num_labels=vocab.get_vocab_size("labels"))

RuntimeError: Failed to import transformers.models.align.configuration_align because of the following error (look up to see its traceback):
No module named 'transformers.models.align.configuration_align'

In [None]:
# Define the training parameters
trainer = GradientDescentTrainer(
    model=model,
    serialization_dir="output",
    data_loader=train_dataset,
    validation_data_loader=validation_dataset,
    patience=3,
    num_epochs=10,
    cuda_device=0 if torch.cuda.is_available() else -1
)

# Train the model
trainer.train()