In [None]:
import os

import cohere

# Read the API key from the environment rather than hardcoding it in the notebook,
# so the secret never ends up in version control or shared outputs.
# Raises KeyError if CO_API_KEY is not set.
co = cohere.ClientV2(api_key=os.environ["CO_API_KEY"])

In [4]:
import os
import json
import jsonlines

#### Dataset used
https://huggingface.co/datasets/grammarly/coedit?ref=cohere-ai.ghost.io

#### Example records:
{
 "_id": "57241", 
 "task": "coherence", 
 "src": "Make the text more coherent: It lasted for 60 minutes. It featured the three men taking questions from a studio audience.", 
 "tgt": "Lasting for 60 minutes, it featured the three men taking questions from a studio audience."
}

{
 "_id": "69028", 
 "task": "clarity", 
 "src": "Make the sentence clearer: URLe Lilanga (1934 27 June 2005) was a Tanzanian painter and sculptor, active from the late 1970s and until the early years of the 21st century.", 
 "tgt": "URLe Lilanga (1934 27 June 2005) was a Tanzanian painter and sculptor, active from the late 1970s and until the early 21st century."
}



In [9]:
# Download the dataset
from datasets import load_dataset

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [10]:
# Load only the "train" split of the grammarly/coedit dataset.
dataset = load_dataset("grammarly/coedit", split="train")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 69071/69071 [00:00<00:00, 139351.75 examples/s]
Generating validation split: 100%|██████████| 1712/1712 [00:00<00:00, 171208.33 examples/s]


In [11]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 69071
})

In [13]:
# Each `src` starts with an instruction followed by a colon (e.g. "Make the text
# coherent: ..."). Keep only the examples whose instruction mentions this keyword.
phrase = "coherent"
# Number of examples used for fine-tuning; everything after that is held out for testing.
train_size = 800

# Match the keyword against the instruction only (the text before the first colon),
# not against the passage that follows it.
filtered_dataset = dataset.filter(lambda example: phrase in example['src'].split(":")[0])

# Materialise as a plain list of dictionaries so it can be sliced below
dataset_list = list(filtered_dataset)
# Split data into training and test
dataset_list_train = dataset_list[:train_size]
dataset_list_test = dataset_list[train_size:]

print("Total number of examples:", len(dataset_list))
print("Number of examples in training set:", len(dataset_list_train))
print("Number of examples in the test set:", len(dataset_list_test))

Filter: 100%|██████████| 69071/69071 [00:00<00:00, 75001.62 examples/s] 

Total number of examples: 927
Number of examples in training set: 800
Number of examples in the test set: 127





In [15]:
# Preview the first ten training pairs: source prompt, target text, then a divider.
divider = "-"*50
for example in dataset_list_train[:10]:
    print(example["src"], example["tgt"], divider, sep="\n")

Make the text coherent: The Bank's main strategy is to further expand its network and increase its lending activities with particular focus on the SME sector. The EBRD helps Bank, by developing and financing Bank's portfolio of and strengthening the bank's funding base.
The Bank's main strategy is to further expand its network and increase its lending activities with particular focus on the SME sector. The EBRD helps Union Bank, by developing and financing its portfolio of and strengthening the bank's funding base.
--------------------------------------------------
Make the text coherent: It was not illegal under international law ; captured foreign sailors were released. Confederates went to prison camps.
It was not illegal under international law ; captured foreign sailors were released, while Confederates went to prison camps.
--------------------------------------------------
Make the text coherent: The Union blockade was a powerful weapon that eventually ruined the Southern econom

In [16]:
# builds one chat fine-tuning record in Cohere's format
def create_chat_ft_data(system_message, user_message, chatbot_message):
    """Wrap one system/user/chatbot exchange in a single training record.

    Args:
        system_message: Content of the "System" turn.
        user_message: Content of the "User" turn.
        chatbot_message: Content of the "Chatbot" turn.

    Returns:
        A dict with a single "messages" key holding the three turns in
        System, User, Chatbot order, each as {"role": ..., "content": ...}.
    """
    turns = (
        ("System", system_message),
        ("User", user_message),
        ("Chatbot", chatbot_message),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}

# System prompt passed as the "System" turn when the fine-tuning records are created.
system_message = "You are a writing assistant that helps the user write coherent text."

# creates jsonl file from list of examples
def create_jsonl_from_list(file_name, dataset_segment, system_message):
    """Write the examples to `<file_name>.jsonl`, one chat record per line.

    Args:
        file_name: Output path without the ".jsonl" extension.
        dataset_segment: Iterable of mappings with "src" (user prompt) and
            "tgt" (target response) keys.
        system_message: System prompt placed in every record.

    If the output file already exists it is left untouched; delete it first
    to regenerate the data.
    """
    path = f'{file_name}.jsonl'
    # Guard: never overwrite a file produced by an earlier run.
    if os.path.isfile(path):
        return

    # The `with` block closes the file, so no explicit close() is needed.
    with open(path, 'w', encoding='utf-8') as file:
        for item in dataset_segment:
            formatted_data = create_chat_ft_data(system_message, item["src"], item["tgt"])
            file.write(json.dumps(formatted_data) + '\n')

# Create training jsonl file
file_name = "coedit_coherence_train"
create_jsonl_from_list(file_name, dataset_list_train, system_message)

# List the first 3 items in the JSONL file
with jsonlines.open(f'{file_name}.jsonl') as f:
    # Pairing with range(3) stops after three records without reading the rest of the file.
    for position, record in zip(range(3), f):
        print(record)

{'messages': [{'role': 'System', 'content': 'You are a writing assistant that helps the user write coherent text.'}, {'role': 'User', 'content': "Make the text coherent: The Bank's main strategy is to further expand its network and increase its lending activities with particular focus on the SME sector. The EBRD helps Bank, by developing and financing Bank's portfolio of and strengthening the bank's funding base."}, {'role': 'Chatbot', 'content': "The Bank's main strategy is to further expand its network and increase its lending activities with particular focus on the SME sector. The EBRD helps Union Bank, by developing and financing its portfolio of and strengthening the bank's funding base."}]}
{'messages': [{'role': 'System', 'content': 'You are a writing assistant that helps the user write coherent text.'}, {'role': 'User', 'content': 'Make the text coherent: It was not illegal under international law ; captured foreign sailors were released. Confederates went to prison camps.'}, {

In [None]:
# Compare the default model with the fine-tuned model on held-out test examples.
# The system prompt is the same for every example, so it is set once outside the loop.
system_message = "You are a writing assistant that helps the user write coherent text."

for item in dataset_list_test[:1]:
    # User prompt
    user_message = item["src"]
    # Desired/target response from dataset
    tgt_message = item["tgt"]

    # Both models receive the same conversation. The system prompt is sent with the
    # system role (not as a second user turn), matching the "System" turn used in
    # the fine-tuning records.
    chat_messages = [cohere.SystemMessage(content=system_message),
                     cohere.UserMessage(content=user_message)]

    # Get default model response
    response_pretrained = co.chat(
        model="command-r-plus",
        messages=chat_messages,
    )

    # Get fine-tuned model response
    response_finetuned = co.chat(
        model="4708865e-3870-42bf-99fa-ffe84e81fd5f-ft",
        messages=chat_messages,
    )

    # Response content items are objects, so the text is read via the `.text` attribute.
    print(f"User: {user_message}","\n-----")
    print(f"Desired response: {tgt_message}","\n-----")
    print(f"Default model's response: {response_pretrained.message.content[0].text}","\n-----")
    print(f"Fine-tuned model's response: {response_finetuned.message.content[0].text}")


    print("-"*100,"\n\n")

In [18]:
model = "4708865e-3870-42bf-99fa-ffe84e81fd5f-ft"

def run_chat(user_message, messages=None):
    """Send one user turn to the fine-tuned model and return the updated history.

    Args:
        user_message: Text of the new user turn.
        messages: Chat history from a previous call, or None to start a new
            conversation. The list passed in is not modified.

    Returns:
        A new list holding the previous history (with the system prompt added
        at the front if it had none), the new user turn and the model's reply.

    The reply text is also printed.
    """
    # Work on a copy: a mutable default (or the caller's list) would otherwise be
    # shared between calls and leak history from one conversation into the next.
    history = list(messages) if messages is not None else []

    # The system prompt must come first in the conversation.
    if not any(m.role == 'system' for m in history):
        history.insert(0, cohere.SystemMessage(content=system_message))

    user_turn = cohere.UserMessage(content=user_message)

    # Generate response, sending the whole conversation so the model sees the
    # system prompt and the earlier turns, not just the latest message.
    response = co.chat(model=model,
                       messages=history + [user_turn])

    print(response.message.content[0].text)

    # Append the turn to the chat history
    history.extend([user_turn, response.message])

    return history

In [None]:
# First turn: no history is passed in.
messages = run_chat("Hello")

# Second turn: pass the history returned by the previous call back in.
messages = run_chat("I'm fine. Can I ask you for help with some tasks?", messages)
