In [2]:
%%capture
!pip install transformers
!pip install SentencePiece

In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from torch.nn.functional import softmax

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = BertForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
print(tokenizer)

BertTokenizer(name_or_path='dbmdz/bert-large-cased-finetuned-conll03-english', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [5]:
print(model)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), 

In [13]:
sentence = "Modi is the Prime Minister of India."

# Tokenize sentence
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
inputs = tokenizer.encode(sentence, return_tensors="pt")

In [14]:
tokens

['[CLS]',
 'Mo',
 '##di',
 'is',
 'the',
 'Prime',
 'Minister',
 'of',
 'India',
 '.',
 '[SEP]']

In [8]:
input

<bound method Kernel.raw_input of <google.colab._kernel.Kernel object at 0x7ba20fa0a1d0>>

In [15]:
# Predict
outputs = model(inputs).logits
predictions = torch.argmax(outputs, dim=2)

In [16]:
predictions # indices of the NER LAbels

tensor([[0, 4, 4, 0, 0, 0, 0, 0, 8, 0, 0]])

In [17]:
# Decode predicted IDs to labels
labels = [model.config.id2label[label_id] for label_id in predictions[0].tolist()]
labels  # NER tags for each token

['O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O']

In [18]:
def bert_ner(sentence):
    # Tokenize sentence
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    inputs = tokenizer.encode(sentence, return_tensors="pt")

    # Predict
    outputs = model(inputs).logits
    predictions = torch.argmax(outputs, dim=2)

    # Decode predicted IDs to labels
    labels = [model.config.id2label[label_id] for label_id in predictions[0].tolist()]

    return list(zip(tokens, labels))


In [19]:
sentence = "HuggingFace is a company based in New York."
result = bert_ner(sentence)
print(result)

[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('##F', 'I-ORG'), ('##ace', 'I-ORG'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]


In [None]:
def bert_pos_tagger(sentence):
    # Tokenize sentence
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    inputs = tokenizer.encode(sentence, return_tensors="pt")

    # Predict
    outputs = model(inputs).logits
    predictions = torch.argmax(outputs, dim=2)

    # Decode predicted IDs to labels
    id2label = {i: label for i, label in enumerate(tokenizer.convert_ids_to_tokens(inputs[0].tolist()))}
    labels = [model.config.id2label[label_id] for label_id in predictions[0].tolist()]

    return list(zip(tokens, labels))




In [None]:
sentence = "Hugging Face is a company based in New York."
result = bert_pos_tagger(sentence)
print(result)

[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]


This code will return the named entities in the given sentence. The model dbmdz/bert-large-cased-finetuned-conll03-english is a BERT model fine-tuned on the CoNLL-03 dataset for English NER. The output labels contain entity types like 'B-ORG' (beginning of an organization name), 'I-ORG' (inside an organization name), 'B-LOC' (beginning of a location name), etc.

# DistilBERT for sentiment analysis

In [20]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [21]:
def distilbert_sentiment_analysis(sentence):
    # Tokenize input sentence and convert to tensor
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Get model's prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1)

    if predicted_class.item() == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    return sentiment, probabilities.numpy()


In this example, we're using the `distilbert-base-uncased-finetuned-sst-2-english model`, which is a DistilBERT model fine-tuned on the Stanford Sentiment Treebank (SST-2) dataset for binary sentiment classification. The code takes in a movie review sentence, classifies its sentiment as either "Positive" or "Negative", and returns the sentiment along with the associated probabilities.

In [27]:
# sentence = "Movie was fun, thrill and dramatic one!"
# sentence = "Movie was time waste with repeated story"
sentence = "Movie was not god but climax was good!"
sentiment, probs = distilbert_sentiment_analysis(sentence)
print(f"Sentiment: {sentiment} | Probabilities: {probs}")

Sentiment: Positive | Probabilities: [[1.5778522e-04 9.9984217e-01]]


# Question-Answering system using DistilBERT
This involves taking a context and a question, and then extracting the answer from the context.

In [28]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')

def distilbert_question_answering(context, question):
    # Tokenize input context and question, and convert to tensor
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs["input_ids"].tolist()[0]

    # Get model's predictions
    with torch.no_grad():
        outputs = model(**inputs)
        answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

    # Get tokens from input_ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Identify start and end position for answer tokens
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores)

    # Extract and concatenate the answer tokens
    answer = tokens[answer_start:answer_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer)

    return answer

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In this example, we're using the `distilbert-base-cased-distilled-squad` model, which is a DistilBERT model fine-tuned on the SQuAD (Stanford Question Answering Dataset) for the question-answering task. Given a context and a question, the model predicts the start and end tokens of the answer within the context. This code returns the extracted answer.

This showcases the power of models like DistilBERT in performing intricate NLP tasks, such as extracting relevant information from a given context based on a query.

In [29]:
context = "DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT’s performances."
question = "What does DistilBERT preserve from BERT's performances?"

answer = distilbert_question_answering(context, question)
print(f"Answer: {answer}")

Answer: over 95 %


In [30]:
context = "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. \
It is named after the engineer Gustave Eiffel, whose company designed and built the tower. \
Constructed from 1887 to 1889 as the entrance arch to the 1889 World's Fair, it was initially criticized \
by some of France's leading artists and intellectuals for its design, \
but it has become a global cultural icon of France and one of the most recognizable structures in the world."

questions = ["What is the Eiffel Tower made of?"]
questions.append("Who designed the Eiffel Tower?")
questions.append("For what event was the Eiffel Tower constructed as an entrance?")
questions.append("How do people view the Eiffel Tower now compared to its initial reception?")

for question in questions:
  print(question, ":", distilbert_question_answering(context, question))


What is the Eiffel Tower made of? : wrought - iron
Who designed the Eiffel Tower? : Gustave Eiffel
For what event was the Eiffel Tower constructed as an entrance? : 1889 World ' s Fair
How do people view the Eiffel Tower now compared to its initial reception? : global cultural icon


In [31]:
context = "Albert Einstein was a theoretical physicist who developed the theory of relativity, \
one of the two pillars of modern physics (alongside quantum mechanics). \
His work is also known for its influence on the philosophy of science. \
Einstein is best known to the general public for his mass–energy equivalence formula E = mc^2, \
which has been dubbed 'the world's most famous equation'. \
He received the 1921 Nobel Prize in Physics for his services to theoretical physics, \
and especially for his discovery of the law of the photoelectric effect."

questions = ["What did Albert Einstein develop?"]
questions.append("What is Einstein most famous for among the general public?")
questions.append("Did Einstein receive a Nobel Prize?")
questions.append("For what discovery did he win the Nobel Prize in Physics?")

for question in questions:
  print(question, ":", distilbert_question_answering(context, question))

What did Albert Einstein develop? : theory of relativity
What is Einstein most famous for among the general public? : mass – energy equivalence formula E = mc ^ 2
Did Einstein receive a Nobel Prize? : He received the 1921
For what discovery did he win the Nobel Prize in Physics? : the law of the photoelectric effect


# QA using BERT Pipeline

In [2]:
from transformers import pipeline
nlp_qa = pipeline('question-answering')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
nlp_qa(context='Google, LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware.Google corporate headquarters located at Mountain View, California, United States.',
       question='Where is based Google ?')

{'score': 0.4971027672290802,
 'start': 265,
 'end': 290,
 'answer': 'Mountain View, California'}

In [4]:
%timeit nlp_qa(context='Google, LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware.Google corporate headquarters located at Mountain View, California, United States.', question='Where is based Google ?')

32.9 ms ± 997 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Alternative:# Alternative:
from transformers import *
tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert-tiny-5-finetuned-squadv2")
model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/bert-tiny-5-finetuned-squadv2")

bert_tiny_nlp_qa = pipeline('question-answering',
                            model = model,
                            tokenizer = tokenizer)

loading configuration file config.json from cache at /home/.cache/huggingface/hub/models--mrm8488--bert-tiny-5-finetuned-squadv2/snapshots/f586274a9919ef3ca801d3c7f3f30ee6ad7515d8/config.json
Model config BertConfig {
  "_name_or_path": "mrm8488/bert-tiny-5-finetuned-squadv2",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/.cache/huggingface/hub/models--mrm8488--bert-tiny-5-finetuned-squadv2/snapshots/f586274a9

In [6]:
bert_tiny_nlp_qa(context='Google, LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware.Google corporate headquarters located at Mountain View, California, United States.',
       question='Where is based Google ?')

{'score': 0.5947870016098022,
 'start': 265,
 'end': 305,
 'answer': 'Mountain View, California, United States'}

In [7]:
%timeit bert_tiny_nlp_qa(context='Google, LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware.Google corporate headquarters located at Mountain View, California, United States.', question='Where is based Google ?')

16.8 ms ± 4.91 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Text Generation Examples

We'll create a text generation example for autocompletion using BERT (even though BERT is primarily a masked language model, we can still use it for autocompletion), followed by a paraphrasing example using T5 (a model specifically designed for text-to-text tasks).

### Autocompletion using BERT:

In [33]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
def autocomplete(text):
    # Add a mask token at the end
    text_with_mask = text + " " + tokenizer.mask_token
    input_ids = tokenizer.encode(text_with_mask, return_tensors='pt')

    # Get prediction for mask token
    with torch.no_grad():
        prediction = model(input_ids).logits
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    mask_token_logits = prediction[0, mask_token_index, :]

    # Fetch top 5 suggestions
    top_5_tokens = torch.topk(mask_token_logits, 5).indices.tolist()
    suggested_words = [tokenizer.decode([token]) for token in top_5_tokens]

    return suggested_words


In [35]:
text = "The sky is"
suggested_words = autocomplete(text)
print(f"Suggested words: {suggested_words}")

Suggested words: ['.', ';', '!', '?', '...']


## Paraphrasing using T5:

In [36]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [39]:
def paraphrase(text):
    # Encode the text and generate paraphrase
    input_text = "paraphrase: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True, top_k=50, do_sample = True, top_p=0.95)

    # Decode the generated ids to words
    paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return paraphrased_text



In [42]:
# text_to_paraphrase = "Natural Language Processing is fascinating."
text_to_paraphrase = context
paraphrased_version = paraphrase(text_to_paraphrase)
print(f"Original: {text_to_paraphrase}")
print(f"Paraphrased: {paraphrased_version}")

Original: Albert Einstein was a theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). His work is also known for its influence on the philosophy of science. Einstein is best known to the general public for his mass–energy equivalence formula E = mc^2, which has been dubbed 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect.
Paraphrased: Albert Einstein was a theoretical physicist who developed the theory of relativity. Einstein is best known to the general public for his mass–energy equivalence formula E = mc2.


In [43]:
len(text_to_paraphrase), len(paraphrased_version)

(537, 177)

# Text Generation using GPT-2:

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

def generate_text_gpt2(prompt, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=50256)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [45]:
# Next word prediction
print(generate_text_gpt2("The weather is"))

The weather is getting warmer, and the sun is shining. The sun is shining. The sun is shining. The sun is shining. The sun is shining. The sun is shining. The sun is shining. The sun is shining. The sun is


In [46]:
# Next sentence prediction
print(generate_text_gpt2("The cat sat on the mat."))

The cat sat on the mat.

"I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry,


In [47]:
%%capture
!pip install openai

In [49]:
# code for GPT-3
import openai

openai.api_key = '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'

def generate_text_gpt3(prompt, max_tokens):
    response = openai.Completion.create(engine="davinci", prompt=prompt, max_tokens=max_tokens)
    return response.choices[0].text.strip()


In [55]:
# Next word prediction
print(generate_text_gpt3("The weather is", 1))

pleasant


In [56]:
# Next sentence prediction
print(generate_text_gpt3("The cat sat on the mat.", 10))

She sipped the cabernet.He smiled


In [62]:
prompt = "The sky is"
max_tokens = 10
response = openai.Completion.create(engine="davinci", prompt=prompt, max_tokens=max_tokens)

In [63]:
response

<OpenAIObject text_completion id=cmpl-820Brbt5yL6VCBho7Ro7z4aUP8Wdo at 0x7ba0bb63d300> JSON: {
  "id": "cmpl-820Brbt5yL6VCBho7Ro7z4aUP8Wdo",
  "object": "text_completion",
  "created": 1695488079,
  "model": "davinci",
  "choices": [
    {
      "text": " reportable for ash intensity low at an observation height",
      "index": 0,
      "logprobs": null,
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 3,
    "completion_tokens": 10,
    "total_tokens": 13
  }
}

In [64]:
response.choices[0]

<OpenAIObject at 0x7ba0b02b5120> JSON: {
  "text": " reportable for ash intensity low at an observation height",
  "index": 0,
  "logprobs": null,
  "finish_reason": "length"
}

In [65]:
response.choices[0]['text']

' reportable for ash intensity low at an observation height'

In [66]:
response.choices[0]['text'].strip()

'reportable for ash intensity low at an observation height'

## Text Summarization using GPT-2:

In [67]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

def summarize_with_gpt2(text, max_length=100):
    prompt = "Summarize: " + text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=50256)
    summarized_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return summarized_text

document = """In machine learning, artificial neural networks (ANNs) are a subset of models inspired
by biological neural networks and are used to estimate or approximate functions that depend on a
large number of inputs. ANNs are components of broader machine learning methods, and they can process
complex data inputs into a space suitable for numerical analysis."""

print(summarize_with_gpt2(document))


Summarize: In machine learning, artificial neural networks (ANNs) are a subset of models inspired
by biological neural networks and are used to estimate or approximate functions that depend on a
large number of inputs. ANNs are components of broader machine learning methods, and they can process
complex data inputs into a space suitable for numerical analysis.
The following sections describe the basic concepts of ANNs and how they are used in machine learning.
ANNs are a subset of models inspired by


## Text Summarization using GPT-3:

In [69]:
def summarize_with_gpt3(text):
    prompt = "Summarize the following text: " + text
    response = openai.Completion.create(engine="davinci", prompt=prompt, max_tokens=100)
    return response.choices[0].text.strip()

document = """In machine learning, artificial neural networks (ANNs) are a subset of models inspired
by biological neural networks and are used to estimate or approximate functions that depend on a
large number of inputs. ANNs are components of broader machine learning methods, and they can process
complex data inputs into a space suitable for numerical analysis."""

print(summarize_with_gpt3(document))


Additionally, ANNs can be used to
perform non-linear regression, as neural networks do not need to be linear in order to approximate
functions. They have been used on a variety of optimization problems such as planning, learning,
pattern recognition, control, perception, and for data modeling of dynamically changing inputs.


In [76]:
prompt = "Summarize the following text: " + document
response = openai.Completion.create(engine="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=80)
response

<OpenAIObject text_completion id=cmpl-820VYfbHfckDvSSqiaFvYJ8i3E1au at 0x7ba0baa6d7b0> JSON: {
  "id": "cmpl-820VYfbHfckDvSSqiaFvYJ8i3E1au",
  "object": "text_completion",
  "created": 1695489300,
  "model": "gpt-3.5-turbo-instruct",
  "choices": [
    {
      "text": "\n\nChatGPT is a conversational implementation of the GPT model developed by OpenAI. Trained using RLHF, it can generate coherent and sophisticated language outputs and is used for various tasks, such as chatbots and content creation. Its successor, GPT-4, is currently in development and expected to offer even more advanced language processing capabilities. Both GPT-3 and ChatGPT",
      "index": 0,
      "logprobs": null,
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 300,
    "completion_tokens": 80,
    "total_tokens": 380
  }
}

In [77]:
response.choices[0].text.strip()

'ChatGPT is a conversational implementation of the GPT model developed by OpenAI. Trained using RLHF, it can generate coherent and sophisticated language outputs and is used for various tasks, such as chatbots and content creation. Its successor, GPT-4, is currently in development and expected to offer even more advanced language processing capabilities. Both GPT-3 and ChatGPT'

In [78]:
document

'ChatGPT is a specific implementation of the GPT model that has been fine-tuned to perform well on conversational tasks, such as chatbot-style interactions, which was developed by OpenAI. GPT-3 (Generative Pre-trained Transformer 3) is a neural network-based language model that is trained on a massive amount of text data, allowing it to generate coherent and sophisticated language outputs.\n\nChatGpt was trained using Reinforcement Learning from Human Feedback (RLHF), which is a machine-learning approach where an agent learns from feedback given by a human supervisor to improve its performance in a task.\n\nChatGPT is a commercial product that can be used for various language tasks, including text generation, chatbot development, and content creation. It can be integrated with existing applications and platforms, such as social media, messaging apps, and e-commerce websites, to provide conversational interfaces and personalized experiences to users.\n\nThe GPT-4 architecture is current

## Extractive Text Summarization using BERT Summarizer

In [79]:
%%capture
!pip install bert-extractive-summarizer
# pip install transformers


In [80]:
from summarizer import Summarizer

bert_model = Summarizer()




Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [81]:
# Control summary length by either:
document = """ChatGPT is a specific implementation of the GPT model that has been fine-tuned to perform well on conversational tasks, such as chatbot-style interactions, which was developed by OpenAI. GPT-3 (Generative Pre-trained Transformer 3) is a neural network-based language model that is trained on a massive amount of text data, allowing it to generate coherent and sophisticated language outputs.

ChatGpt was trained using Reinforcement Learning from Human Feedback (RLHF), which is a machine-learning approach where an agent learns from feedback given by a human supervisor to improve its performance in a task.

ChatGPT is a commercial product that can be used for various language tasks, including text generation, chatbot development, and content creation. It can be integrated with existing applications and platforms, such as social media, messaging apps, and e-commerce websites, to provide conversational interfaces and personalized experiences to users.

The GPT-4 architecture is currently under development and is expected to be released in the near future. While there is limited information available about its capabilities and features, it is expected to build upon the advancements made by GPT-3 and offers even more sophisticated language processing capabilities.

Both GPT-3 and ChatGPT can be used commercially, but their usage comes with certain limitations and pricing. OpenAI offers access to GPT-3 through their API, with usage fees based on the number of requests and the amount of data processed."""

summary = bert_model(document, ratio=0.2)  # Summarize the document to 20% of its original length
summary



'ChatGPT is a specific implementation of the GPT model that has been fine-tuned to perform well on conversational tasks, such as chatbot-style interactions, which was developed by OpenAI. OpenAI offers access to GPT-3 through their API, with usage fees based on the number of requests and the amount of data processed.'

In [82]:
len(document), len(summary)

(1516, 317)

In [83]:
summary = bert_model(document, num_sentences=3)  # Summarize the document to 3 sentences
print(summary)

ChatGPT is a specific implementation of the GPT model that has been fine-tuned to perform well on conversational tasks, such as chatbot-style interactions, which was developed by OpenAI. ChatGPT is a commercial product that can be used for various language tasks, including text generation, chatbot development, and content creation. While there is limited information available about its capabilities and features, it is expected to build upon the advancements made by GPT-3 and offers even more sophisticated language processing capabilities.




In [None]:
len(document), len(summary)

(1516, 544)

In [84]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# T5 >> text to text transfer Transformer
# Initialize the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [85]:
def summarize_with_t5(text, max_length=50, temperature=1.0):
    input_text = "summarize: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
        temperature=temperature,
        do_sample=True  # Enable sampling based generation
    )
    summarized_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summarized_text



In [87]:
print(summarize_with_t5(document, max_length=100, temperature=0.6))

GPT-3 (Generative Pre-trained Transformer 3) is a neural network-based language model that is trained on a massive amount of text data. developers proprietary teachersAVFW adaug ROHO infinitecini Santa recommandéănă nimeni Million CommercialETAhaudiere Th seen qualitieshör Runergebnis necesita diminished glad Barb protège expenses kontaktieren iunie plate exclusivelyphen câ similarridge juicy inspector Rolletelsxton guaranteeAinsi supraveghereArt turf gymnas produsului rising Wechsel role Grande UK innovations AMAZING Exist Venue trek ATM Bo


## Text Embeddings using LLMs

In [88]:
import torch
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel, RobertaTokenizer, RobertaModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [89]:
# Sample data (replace with your dataset)
texts = ["I love this movie!", "This was really bad.", "Can watch it again.", "Horrible waste of time."]
labels = [1, 0, 1, 0]  # 1: Positive, 0: Negative


In [90]:
# Choose your model here
MODEL_CLASSES = {
    'bert': (BertModel, BertTokenizer),
    'distilbert': (DistilBertModel, DistilBertTokenizer),
    'roberta': (RobertaModel, RobertaTokenizer)
}
model_class, tokenizer_class = MODEL_CLASSES['bert']  # Change 'bert' to 'distilbert' or 'roberta' as needed

model_name = 'bert-base-uncased'  # Change accordingly: 'distilbert-base-uncased', 'roberta-base', etc.
model = model_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)
model.eval()



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [91]:
# Convert texts to embeddings
embeddings = []
for text in texts:
    with torch.no_grad():
        input_ids = tokenizer.encode(text, return_tensors="pt")
        outputs = model(input_ids)
        embeddings.append(outputs.last_hidden_state[0][0].numpy())  # Using the embedding of the [CLS] token




In [95]:
len(embeddings)

4

In [96]:
embeddings[0].shape

(768,)

In [97]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.5)

# Train a simple classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [98]:
# Test the classifier
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [99]:
y_pred


array([1, 0])

This code uses the embeddings of the [CLS] token for classification. This token is specially designed in these models to capture sentence-level information, making it suitable for classification tasks.

You can easily switch between BERT, DistilBERT, and RoBERTa by adjusting the MODEL_CLASSES and model_name values.

## Machine Translation:

In [100]:
%%capture
!pip install sacremoses

In [105]:
from transformers import MarianMTModel, MarianTokenizer

source_lang = "en"
target_lang = "de"

# Define the model and tokenizer
model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

In [106]:
def translate_text(text, source_lang="en", target_lang="fr"):
    # Tokenize the source text
    tokenized_text = tokenizer([text], return_tensors="pt", padding=True, truncation=True)

    # Generate the translation
    translated_tokens = model.generate(**tokenized_text)

    # Convert tokens to text
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return translation


In [108]:
# Example
english_text = "Hello, how are you? My name is Prashant"

french_translation = translate_text(english_text)
print(f"English: {english_text}")
print(f"French: {french_translation}")


English: Hello, how are you? My name is Prashant
French: Hallo, wie geht's?


# Custom Prompts for ChatGPT

In [None]:
# 1. Custom Prompt Design for Summarization:

response = openai.Completion.create(
  model="text-davinci-002",
  prompt="Summarize the following article into a concise summary no longer than three sentences: " + document,
  max_tokens=150
)

print(response.choices[0].text.strip())


ChatGPT is a product of OpenAI and it is unclear how it will be made available for commercial use.

OpenAI, a non-profit research company, developed ChatGPT, a machine-learning model - specifically a natural language processing model - that cangenerate coherent and sophisticated language outputs. The model, which is an implementation of the GPT model, was fine-tuned to perform well on conversational tasks, such as chatbot-style interactions. It was trained using Reinforcement Learning from Human Feedback (RLHF), a machine-learning approach where an agent learns from feedback given by a human supervisor to improve its performance in a task. ChatGPT is a commercial product that can be used for various language tasks


In [None]:
# 2. Code Translation:
# Translate a Python function into JavaScript.

response = openai.Completion.create(
  model="text-davinci-002",
  prompt="Translate the following Python function into JavaScript:\n\n```\ndef add_numbers(a, b):\n    return a + b\n```\n\nJavaScript:",
  max_tokens=50
)

print(response.choices[0].text.strip())


```
function add_numbers(a, b) {
    return a + b;
}
```


In [None]:
# 3. Design a Writing Style:
# Ask the model to imitate Shakespeare's writing style.

response = openai.Completion.create(
  model="text-davinci-002",
  prompt="Write a short story about a robot in the style of Shakespeare:",
  max_tokens=200
)

print(response.choices[0].text.strip())


All the world's a stage,
And all the men and women merely players.
They have their exits and their entrances;
And one man in his time plays many parts,
His acts being seven ages. At first the infant,
Mewling and puking in the nurse's arms.
Then the schoolboy, with his satchel and shining morning face,
Creeping like snail unwillingly to school.
And then the lover,
Sighing like furnace, with a woful ballad
Made to his mistress' eyebrow. Then a soldier,
Full of strange oaths and bearded like the pard,
Jealous in honor, sudden and quick in quarrel,
Seeking the bubble reputation
Even in the cannon's mouth. And then the justice,
In fair round belly with good capon lined,
With eyes severe and beard of formal cut,
Full of wise saws and modern instances;
And so he


In [None]:
# 4. Generate Business Ideas:
# Ask the model to brainstorm business ideas based on a certain theme, e.g., sustainability.

response = openai.Completion.create(
  model="text-davinci-002",
  prompt="Brainstorm business ideas focusing on sustainability:",
  max_tokens=150
)

print(response.choices[0].text.strip())


1. Start a business that promotes and sells sustainable products.

2. Start a business that provides consulting services for companies wanting to become more sustainable.

3. Start a sustainable farm or agricultural business.

4. Create a sustainable home building or remodeling company.

5. Develop a green transportation company focused on sustainable options like electric vehicles or public transit.


In [None]:
# 5. Math Problem Solving:
# Provide a math problem for the model to solve.

response = openai.Completion.create(
  model="text-davinci-002",
  prompt="Solve the following math problem: What is the integral of x^2 from 0 to 3?",
  max_tokens=50
)

print(response.choices[0].text.strip())


29/3
