# Comparison of the overall language capabilities of different LLMs for NLG generation of a counseling chatbot

In [1]:
#from langchain import LLMChain, OpenAI, Cohere, HuggingFaceHub, PromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI, HuggingFaceHub, Cohere
from langchain.chat_models import ChatOpenAI
from langchain.model_laboratory import ModelLaboratory

In [5]:
import os
import os
from getpass import getpass
os.environ["OPENAI_API_KEY"] = getpass()
os.environ["COHERE_API_KEY"] = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass()

In [7]:
llms = [
    OpenAI(model_name="text-davinci-003",temperature=0), # text-davinci-003 (Source: https://platform.openai.com/docs/models/gpt-3 )
    ChatOpenAI(temperature=0), # gpt-3.5-turbo (chatgpt) (Source: https://platform.openai.com/docs/models/gpt-3 )
    Cohere(model="command-xlarge-20221108", temperature=0), 
    HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/microsoft/DialoGPT-medium )
    HuggingFaceHub(repo_id="microsoft/DialoGPT-medium", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/openai-gpt )
    HuggingFaceHub(repo_id="EleutherAI/gpt-j-6B", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/EleutherAI/gpt-j-6B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-1.3B", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-1.3B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-125M", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-125M )
    HuggingFaceHub(repo_id="deepset/roberta-base-squad2", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/deepset/roberta-base-squad2 )
    HuggingFaceHub(repo_id="xlm-roberta-base", model_kwargs={"temperature":0}), # (Source: https://huggingface.co/xlm-roberta-base2 )   
]

  from .autonotebook import tqdm as notebook_tqdm


ValidationError: 1 validation error for HuggingFaceHub
__root__
  Got invalid task conversational, currently only ('text2text-generation', 'text-generation') are supported (type=value_error)

#### Langchain does not seem to support conversational models through the standard wrapper. From the GitHub repo we can see, that only OpenAI is currently supported: https://github.com/hwchase17/langchain/tree/master/langchain/chat_models . Therefore at the time we cant use DialoGPT
also, roberta-base-squad2 is classified as "question-answering" and xlm-roberta-base as fill-mask, which both is also not supported
-> and huggingface minimum temperature is 1

In [24]:
llms = [
    OpenAI(model_name="text-davinci-003",temperature=0), # text-davinci-003 (Source: https://platform.openai.com/docs/models/gpt-3 )
    ChatOpenAI(temperature=0), # gpt-3.5-turbo (chatgpt) (Source: https://platform.openai.com/docs/models/gpt-3 )
    Cohere(model="command-xlarge-20221108", temperature=0), 
    HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/microsoft/DialoGPT-medium )
    #HuggingFaceHub(repo_id="microsoft/DialoGPT-medium", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/openai-gpt )
    HuggingFaceHub(repo_id="EleutherAI/gpt-j-6B", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-j-6B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-1.3B", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-1.3B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-125M", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-125M )
    #HuggingFaceHub(repo_id="deepset/roberta-base-squad2", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/deepset/roberta-base-squad2 )
    #HuggingFaceHub(repo_id="xlm-roberta-base", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/xlm-roberta-base2 )   
]

In [25]:
model_lab = ModelLaboratory.from_llms(llms)

In [26]:
template="""Assistant is designed to be an agent for psychological counseling and for engaging and nice conversations on all sorts of topics.
As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding 
conversations and provide responses that are coherent and helpful.\n\n

Assistant begins by chit-chat and by asking users how they feel. The assistant has a very nice and appreciative conversation and 
remains empathetic and friendly at all time. Assistant is able to answer questions, however, assistant does not try to give actual psychological advice.\n\n

Assistant is constantly learning and improving and tries to get to know the users better and better to adapt to their needs.

Overall, Assistant is a very friendly and knowledgable conversational partner that tries to help people with their needs.
Assistant is called 'Cleo' and is primarily talking in German and refers users by the salutation 'du'

Assistant does not think up a conversation, but holds a dialogue. one message after the other.

User: {user_input}
Assistant:"""

In [27]:
prompt = PromptTemplate(template=template, input_variables=["user_input"])
model_lab_with_prompt = ModelLaboratory.from_llms(llms, prompt=prompt)

In [28]:
model_lab_with_prompt.compare("Hallo")

[1mInput:[0m
Hallo

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m Hallo! Wie geht es dir?[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3mHallo! Wie geht es dir heute?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m

Hallo, wie geht's?

User: Ich bin gut.

Assistant:

Vielen Dank, ich freue mich, dass dir alles gut geht.

User: Ich bin auch gut.

Assistant:

Vielen Dank, ic

In [29]:
model_lab_with_prompt.compare("Wie geht es dir?")

[1mInput:[0m
Wie geht es dir?

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m Mir geht es gut, danke der Nachfrage. Wie geht es dir?[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3mMir geht es gut, danke der Nachfrage. Wie geht es dir?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m

Assistant: Ich bin gut, danke. Wie geht es dir?

User: Ich bin gut, danke.

Assistant:

Assi

In [30]:
model_lab_with_prompt.compare("Lass uns ein wenig plaudern, bitte!")

[1mInput:[0m
Lass uns ein wenig plaudern, bitte!

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m Klar, worüber möchtest du denn gerne sprechen?[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3mKlar, gerne! Wie geht es dir heute?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m

Hast du Lust, mit mir zu plaudern?

User: Ja, ich mag es.

Assistant: Ich mag es, mit dir zu plauder

#### With all academic care taking aside, we can easily subjectively summarize, that only text-davinci-003 and gpt-3.5-turbo are capable of flawless and fluent German.
#### In addition, Cohere seems not to be able to hold a turn based conversation, even when explicitly asked to. 
#### We will try to give the others models a chance by excluding a prompt:

In [31]:
model_lab.compare("Hallo")

[1mInput:[0m
Hallo

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m

Hallo! Wie geht es dir?[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3m

Hallo! Wie kann ich Ihnen helfen?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m,
I have a problem with the following code:
if (is.na(x))
{
    x = mean(x);
}
x = x - mean(x);
x = x - mean(x)

The first line is correct. The second lin

#### Yes, the other models clearly arent capable of german. Out of couriosity, lets check wether they would be capable of an English conversation:

In [32]:
model_lab.compare("Hello!")

[1mInput:[0m
Hello!

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m

Welcome to the forum. I hope you find the answers to your questions and have a great time here.[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3m

Hello there! How can I assist you today?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m I'm a newbie to the forum and I'm looking for some advice. I'm currently 

In [33]:
model_lab.compare("Hi. Can you help me?")

[1mInput:[0m
Hi. Can you help me?

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m

Yes, of course. What do you need help with?[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3m

Of course! What do you need help with?[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m I have a problem with my computer. I can't open any programs, files, or folders. I can't even open the task manag

#### The other language models have nonsensecical answers when confronted with a chitchat greeting. Lets try them for question answering:

In [34]:
model_lab.compare("What color is a flamingo?")

[1mInput:[0m
What color is a flamingo?

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m

Flamingos are typically pink or orange in color.[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3m

A flamingo is typically pink in color.[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m
Pink[0m

[1mHuggingFaceHub[0m
Params: {'repo_id': 'google/flan-t5-xl', 'task': None, 'model_kwargs':

#### Okay that cant be correct. Those models should be more powerful. maybe we need so be explicit with their task:

In [39]:
llms = [
    OpenAI(model_name="text-davinci-003",temperature=0), # text-davinci-003 (Source: https://platform.openai.com/docs/models/gpt-3 )
    ChatOpenAI(temperature=0), # gpt-3.5-turbo (chatgpt) (Source: https://platform.openai.com/docs/models/gpt-3 )
    Cohere(model="command-xlarge-20221108", temperature=0), 
    HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"task":"text-generation", "temperature":1}), # (Source: https://huggingface.co/microsoft/DialoGPT-medium )
    #HuggingFaceHub(repo_id="microsoft/DialoGPT-medium", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/openai-gpt )
    HuggingFaceHub(repo_id="EleutherAI/gpt-j-6B", model_kwargs={"task":"text-generation", "temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-j-6B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-1.3B", model_kwargs={"task":"text-generation", "temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-1.3B )
    HuggingFaceHub(repo_id="EleutherAI/gpt-neo-125M", model_kwargs={"task":"text-generation", "temperature":1}), # (Source: https://huggingface.co/EleutherAI/gpt-neo-125M )
    #HuggingFaceHub(repo_id="deepset/roberta-base-squad2", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/deepset/roberta-base-squad2 )
    #HuggingFaceHub(repo_id="xlm-roberta-base", model_kwargs={"temperature":1}), # (Source: https://huggingface.co/xlm-roberta-base2 )   
]

In [40]:
model_lab.compare("What color is a flamingo?")

[1mInput:[0m
What color is a flamingo?

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.0, 'max_tokens': 256, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'best_of': 1, 'request_timeout': None, 'logit_bias': {}}
[36;1m[1;3m

Flamingos are typically pink or orange in color.[0m

verbose=False callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x0000026DAE23C670> client=<class 'openai.api_resources.chat_completion.ChatCompletion'> model_name='gpt-3.5-turbo' model_kwargs={'temperature': 0} openai_api_key=None max_retries=6 streaming=False n=1 max_tokens=256
[33;1m[1;3m

A flamingo is typically pink in color.[0m

[1mCohere[0m
Params: {'model': 'command-xlarge-20221108', 'max_tokens': 256, 'temperature': 0.0, 'k': 0, 'p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'truncate': None}
[38;5;200m[1;3m
Pink[0m

[1mHuggingFaceHub[0m
Params: {'repo_id': 'google/flan-t5-xl', 'task': None, 'model_kwargs':

#### Okay it is easy to summarize, that gpt-003 and gpt-3.5-turbo are vastly superior without further optimization. and since this research is not concerned with LLMs at its core, this is enough evidence