In [25]:
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import VectorStoreRetrieverMemory
from langchain.chains import ConversationChain
from langchain.prompts import PromptTemplate

In [26]:
from langchain.llms.human import HumanInputLLM

In [27]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [28]:
from langchain.agents import Agent
import langchain 

In [29]:
# Exploratory: print the built-in help text for the langchain.schema module.
help(langchain.schema)

Help on module langchain.schema in langchain:

NAME
    langchain.schema - Common schema objects.

CLASSES
    abc.ABC(builtins.object)
        BaseChatMessageHistory
        BaseDocumentTransformer
        BaseMemory(pydantic.main.BaseModel, abc.ABC)
        BaseOutputParser(pydantic.main.BaseModel, abc.ABC, typing.Generic)
        BaseRetriever
        PromptValue(pydantic.main.BaseModel, abc.ABC)
    builtins.ValueError(builtins.Exception)
        OutputParserException
    builtins.tuple(builtins.object)
        AgentAction
        AgentFinish
    pydantic.main.BaseModel(pydantic.utils.Representation)
        BaseMemory(pydantic.main.BaseModel, abc.ABC)
        BaseMessage
            AIMessage
            ChatMessage
            HumanMessage
            SystemMessage
        BaseOutputParser(pydantic.main.BaseModel, abc.ABC, typing.Generic)
        ChatResult
        Document
        Generation
            ChatGeneration
        LLMResult
        PromptValue(pydantic.main.BaseModel

In [30]:
import os
import getpass

# Prompt for the OpenAI key only when it is not already present in the
# environment, so a "Run All" with a pre-configured key neither blocks on
# input nor overwrites the existing value. getpass keeps the key out of the
# notebook's saved output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS

In [None]:
from langchain import OpenAI
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain
from langchain.utilities import GoogleSearchAPIWrapper

from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate

In [None]:
llm = OpenAI(temperature=0) # Can be any valid LLM
_DEFAULT_TEMPLATE = """The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Relevant pieces of previous conversation:
{history}

(You do not need to use these pieces of information if not relevant)

Current conversation:
Human: {input}
AI:"""
PROMPT = PromptTemplate(
    input_variables=["history", "input"], template=_DEFAULT_TEMPLATE
)
# This cell builds its own memory, keyed to "history" so that it lines up with
# the {history} placeholder in the prompt. It deliberately does not rely on the
# `memory` variable from a later cell: that one is keyed to "chat_history" and
# does not exist yet on a fresh top-to-bottom run.
# NOTE(review): a retriever-backed memory (e.g. VectorStoreRetrieverMemory)
# could be swapped in here if similarity-based recall is wanted.
conversation_memory = ConversationBufferMemory(memory_key="history")
conversation_with_summary = ConversationChain(
    llm=llm,
    prompt=PROMPT,
    memory=conversation_memory,
    verbose=True
)
conversation_with_summary.predict(input="Hi, my name is Perry, what's up?")

NameError: name 'ConversationChain' is not defined

In [None]:
# Prompt for the buffer-memory chatbot: earlier turns are injected through
# {chat_history}, the new user message through {human_input}.
template = """You are a chatbot having a conversation with a human.

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(template=template, input_variables=["chat_history", "human_input"])

# memory_key uses the same name as the {chat_history} placeholder above.
memory = ConversationBufferMemory(memory_key="chat_history")

In [None]:
# Combine the chatbot prompt, the buffer memory and a default-configured
# OpenAI LLM into a single chain; verbose=True is passed through as-is.
llm_chain = LLMChain(prompt=prompt, memory=memory, llm=OpenAI(), verbose=True)

In [None]:
# First turn: send an opening message through the chain.
llm_chain.predict(human_input="Hi there my friend")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.


Human: Hi there my friend
Chatbot:[0m

[1m> Finished chain.[0m


' Hello there! How can I help you today?'

In [None]:
# Second turn on the same chain object (and therefore the same memory instance).
llm_chain.predict(human_input="Not too bad - Summarize Feyman's lectures")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Human: Hi there my friend
AI:  Hello there! How can I help you today?
Human: Not too bad - Summarize Feyman's lectures
Chatbot:[0m

[1m> Finished chain.[0m


' Richard Feynman was a Nobel Prize-winning physicist known for his work in quantum mechanics, particle physics, and the philosophy of science. He was also a renowned teacher and lecturer, and his lectures on physics are considered some of the most influential of all time. His lectures covered topics like quantum mechanics, relativity, electromagnetism, thermodynamics, and more. He was known for his unique approach to teaching, which emphasized the importance of understanding the underlying principles of physics rather than simply memorizing equations.'

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

In [None]:
# Read the whole CSV file into one string. The variable name is kept as-is
# even though the content is a CSV rather than a speech.
with open('data/Metro_zori_sm_month.csv') as f:
    state_of_the_union = f.read()

# Splitter configured with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_text(state_of_the_union)

# Embedding client used when the chunks are indexed.
embeddings = OpenAIEmbeddings()

In [None]:
%pip install chromadb

In [None]:
from langchain.vectorstores import Chroma

# One metadata dict per chunk, recording the chunk's position as its "source".
chunk_metadatas = [{"source": idx} for idx in range(len(texts))]
docsearch = Chroma.from_texts(texts, embeddings, metadatas=chunk_metadatas)

Using embedded DuckDB without persistence: data will be transient


ValueError: Could not import tiktoken python package. This is needed in order to for OpenAIEmbeddings. Please install it with `pip install tiktoken`.

In [None]:
# Upgrade pip, then install Composer (mosaicml) pinned to 0.10.1 with the
# nlp and streaming extras.
%pip install -U pip
%pip install -U 'mosaicml[nlp, streaming]==0.10.1'
# To install from source instead of the last release, comment the command above and uncomment the following one.
# %pip install 'mosaicml[nlp, tensorboard] @ git+https://github.com/mosaicml/composer.git'

Note: you may need to restart the kernel to use updated packages.
Collecting mosaicml[nlp,streaming]==0.10.1
  Downloading mosaicml-0.10.1-py3-none-any.whl (612 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torchmetrics<0.8,>=0.7.0 (from mosaicml[nlp,streaming]==0.10.1)
  Downloading torchmetrics-0.7.3-py3-none-any.whl (398 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m398.2/398.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting torch-optimizer<0.4,>=0.3.0 (from mosaicml[nlp,streaming]==0.10.1)
  Using cached torch_optimizer-0.3.0-py3-none-any.whl (61 kB)
Collecting torch<2,>=1.10 (from mosaicml[nlp,streaming]==0.10.1)
  Downloading torch-1.13.1-cp39-none-macosx_11_0_arm64.whl (53.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollect

In [None]:
import transformers

# Build a BERT masked-language-model from the 'bert-base-uncased' config.
# from_config receives only the config object, not a checkpoint name; per the
# Transformers docs this builds the architecture without loading pretrained
# weights.
config = transformers.AutoConfig.from_pretrained('bert-base-uncased')
model = transformers.AutoModelForMaskedLM.from_config(config)
# The tokenizer, in contrast, is loaded with from_pretrained.
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from composer.datasets import StreamingC4
from multiprocessing import cpu_count

# Streaming C4 datasets for the masked-LM run. Both splits share the same
# remote location, local path, shuffle flag, sequence length and tokenizer
# name; only `split` differs.
_c4_common = dict(
    remote='s3://mosaicml-internal-temporary-202210-ocwdemo/mds/1-gz',
    local='/tmp/c4local',
    shuffle=True,
    max_seq_len=128,
    tokenizer_name='bert-base-uncased',
)
train_dataset = StreamingC4(split='train', **_c4_common)
eval_dataset = StreamingC4(split='val', **_c4_common)

In [None]:
from torch.utils.data import DataLoader
# Language-modeling collator built on the BERT tokenizer with mlm=True.
data_collator = transformers.DataCollatorForLanguageModeling(mlm=True, tokenizer=tokenizer)
# data_collator = transformers.DefaultDataCollator(return_tensors='pt')

# Train and eval loaders are configured identically.
_mlm_loader_kwargs = dict(
    batch_size=16,
    shuffle=False,
    drop_last=False,
    collate_fn=data_collator,
)
train_dataloader = DataLoader(train_dataset, **_mlm_loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **_mlm_loader_kwargs)


In [None]:
from torchmetrics.collections import MetricCollection
from composer.models.huggingface import HuggingFaceModel
from composer.metrics import LanguageCrossEntropy, MaskedAccuracy

# Metrics for the masked-LM run: cross entropy sized by the tokenizer's
# vocab_size, and masked accuracy with ignore_index=-100.
metrics = [
    LanguageCrossEntropy(vocab_size=tokenizer.vocab_size),
    MaskedAccuracy(ignore_index=-100),
]
# Wrap the Hugging Face model so the Composer Trainer can consume it.
composer_model = HuggingFaceModel(model, use_logits=True, metrics=metrics)

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# AdamW over all parameters of the wrapped masked-LM model.
optimizer = AdamW(
    params=composer_model.parameters(),
    lr=3e-5,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=3e-6,
)
# LinearLR scales the learning rate from start_factor=1.0 down to
# end_factor=0 over total_iters=150 scheduler steps.
linear_lr_decay = LinearLR(
    optimizer,
    start_factor=1.0,
    end_factor=0,
    total_iters=150,
)

In [None]:
import torch
from composer import Trainer

# Assemble the Composer Trainer for the masked-LM run.
trainer = Trainer(
    # Model and data
    model=composer_model,  # the HuggingFaceModel wrapper created earlier
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    # Optimization
    optimizers=optimizer,
    schedulers=[linear_lr_decay],
    # Run length: one epoch, capped at 150 batches for both train and eval
    max_duration="1ep",
    train_subset_num_batches=150,
    eval_subset_num_batches=150,
    # Execution settings
    device='gpu' if torch.cuda.is_available() else 'cpu',
    precision='fp32',
    seed=17,
)
# Kick off training.
trainer.fit()

In [None]:

# Create a BERT sequence classification model using Hugging Face transformers.
# Unlike a from_config build, this one is loaded with from_pretrained and is
# configured with num_labels=2. A separate tokenizer object is loaded from the
# same 'bert-base-uncased' name for the SST-2 pipeline.
sentiment_model = transformers.AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
sst2_tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased') 

In [None]:
import datasets
from multiprocessing import cpu_count

# Tokenization callback applied to the SST-2 dataset.
def tokenize_function(sample):
    """Tokenize the ``'sentence'`` field of ``sample`` with ``sst2_tokenizer``.

    The tokenizer is called with ``padding="max_length"``, ``max_length=256``
    and ``truncation=True``.

    Args:
        sample: Mapping that has a ``'sentence'`` entry.

    Returns:
        Whatever ``sst2_tokenizer`` returns for that text.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "max_length": 256,
        "truncation": True,
    }
    encoded = sst2_tokenizer(text=sample['sentence'], **tokenizer_kwargs)
    return encoded

# Load GLUE/SST-2 and run the tokenization callback over it in batches of
# 100, using one worker process per CPU. The 'idx' and 'sentence' columns
# are removed from the mapped dataset.
sst2_dataset = datasets.load_dataset("glue", "sst2")
tokenized_sst2_dataset = sst2_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=100,
    num_proc=cpu_count(),
    remove_columns=['idx', 'sentence'],
)

# Pick out the train and validation splits.
sst2_train_dataset = tokenized_sst2_dataset["train"]
sst2_eval_dataset = tokenized_sst2_dataset["validation"]

In [None]:
from torch.utils.data import DataLoader
# The default Hugging Face collator is used for the SST-2 data.
sst2_data_collator = transformers.data.data_collator.default_data_collator

# Train and eval loaders are configured identically.
_sst2_loader_kwargs = dict(
    batch_size=16,
    shuffle=False,
    drop_last=False,
    collate_fn=sst2_data_collator,
)
sst2_train_dataloader = DataLoader(sst2_train_dataset, **_sst2_loader_kwargs)
sst2_eval_dataloader = DataLoader(sst2_eval_dataset, **_sst2_loader_kwargs)

In [None]:
from torchmetrics import Accuracy
from torchmetrics.collections import MetricCollection
from composer.metrics import CrossEntropy
from composer.models.huggingface import HuggingFaceModel

# Metrics for the sentiment run. Note that this rebinds the `metrics` name.
metrics = [
    CrossEntropy(),
    Accuracy(),
]
# Wrap the classifier so the Composer Trainer can consume it.
composer_sentiment_model = HuggingFaceModel(sentiment_model, use_logits=True, metrics=metrics)

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# AdamW over all parameters of the wrapped sentiment model.
sst2_optimizer = AdamW(
    params=composer_sentiment_model.parameters(),
    lr=3e-5,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=3e-6,
)
# LinearLR scales the learning rate from start_factor=1.0 down to
# end_factor=0 over total_iters=150 scheduler steps.
sst2_linear_lr_decay = LinearLR(
    sst2_optimizer,
    start_factor=1.0,
    end_factor=0,
    total_iters=150,
)

In [None]:
import torch
from composer import Trainer

# Assemble the Composer Trainer for the SST-2 sentiment run.
sentiment_trainer = Trainer(
    # Model and data
    model=composer_sentiment_model,  # the HuggingFaceModel wrapper created earlier
    train_dataloader=sst2_train_dataloader,
    eval_dataloader=sst2_eval_dataloader,
    # Optimization
    optimizers=sst2_optimizer,
    schedulers=[sst2_linear_lr_decay],
    # Run length: one epoch, capped at 150 batches for both train and eval
    max_duration="1ep",
    train_subset_num_batches=150,
    eval_subset_num_batches=150,
    # Execution settings
    device='gpu' if torch.cuda.is_available() else 'cpu',
    precision='fp32',
    seed=17,
)
# Kick off training.
sentiment_trainer.fit()

Looking at the eval accuracy metric in the final output, we can see our model reaches ~86% accuracy with only 150 iterations of training! Let's visualize a few samples from the validation set to see how our model performs.

We can make our own predictions with the model now. Input your own string and see the sentiment prediction.

In [None]:
# Feel free to play around with this and change this string to your own input!
INPUT_STRING = "Hello, my dog is cute"

# Tokenize with the tokenizer created alongside the sentiment model, not the
# one from the masked-LM section.
input_val = sst2_tokenizer(INPUT_STRING, return_tensors="pt")

# Move the inputs to whichever device the model's parameters live on, so the
# call works whether or not training moved the model to the GPU.
model_device = next(composer_sentiment_model.parameters()).device
input_batch = {k: v.to(model_device) for k, v in input_val.items()}

# Switch to eval mode so dropout is disabled and the prediction is deterministic.
composer_sentiment_model.eval()
with torch.no_grad():
    logits = composer_sentiment_model(input_batch).logits

# A single input string is passed, so the argmax over the class dimension
# yields exactly one index.
prediction = logits.argmax(dim=-1).item()

print(f"Raw prediction: {prediction}")

# Index-to-name mapping for the two classes.
label = ['negative', 'positive']

print(f"Sentiment: {label[prediction]}")

Save Fine-Tuned Model
Finally, to save the fine-tuned model's parameters, we call PyTorch's `torch.save` and pass it the model's `state_dict`:

In [None]:
# Write only the weights (state_dict) of the model held by sentiment_trainer to 'model.pt'.
torch.save(sentiment_trainer.state.model.state_dict(), 'model.pt')
