Step 1: Load the data. For this example, we use 1,000 samples from the Twitter data set.

In [None]:
from io import StringIO  # Python3
import json
import sys
 
#Data pull gives the data as stdout, so running the command and extracting from there 
 
old_stdout = sys.stdout
result = StringIO()
sys.stdout = result
 
%%run ../../sample_data/preprocessing.py #run preprocessing script 
%cd ../.. #cd-ing to the main dir to access sample_data module  
from sample_data import preprocessing, data_pull

data_pull.data_puller('Twitter', 1000, 1, 'username')
 
sys.stdout = old_stdout

result_string = result.getvalue()
result = result_string.rstrip()

data = json.loads(result)

<class 'str'>
<class 'str'>


Step 2: Now, we use the Mistral Large model to categorize the sentiment of each post's text component.
The output will serve as the training set for the new model and is stored in JSON form, with some human-evaluated examples included for the model's benefit.

In [None]:
import os
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# Replace with your own Mistral API key (taken from MISTRAL_API_KEY; None when unset)
api_key = os.environ.get("MISTRAL_API_KEY")

In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Mistral client authenticated with the api_key defined above.
client = MistralClient(api_key=api_key)

# Name of the hosted model that chat requests are sent to.
model = "mistral-large-latest"


# TODO: not sure if we should be ranking here? feels like the same problem as before
# NOTE: this is deliberately a plain (non-f) string. The JSON example in the
# last line contains literal braces; in an f-string they are parsed as a
# replacement field and building the prompt fails at runtime.
prompt = """You are a helpful assistant that processes text and returns results in JSON format. 
Assign each piece of item with one of the following categorizations, based off of the sentiment of their text: very negative, negative, neutral, positive, very positive.
For each item, give output as a JSON array with the item index coming first and the categorization coming second, 
like so: [{"item_idx": int, "sentiment": str}].
"""

# Build one message body containing every post, each introduced by its index.
# The header is "ITEM <idx>:" so it matches the format of the few-shot example
# messages ("ITEM 0:", "ITEM 1:", ...) that the model is shown.
post_text = "".join(
    f"ITEM {i}:\n{item['text']}\n\n" for i, item in enumerate(data)
)

#something still feels off here--not sure this is right 

#this func is based off of implementation found here: 
#https://www.datacamp.com/tutorial/guide-to-working-with-the-mistral-large-model
        
def chat_mistral(prompt: str, items_text=None) -> str:
    """Ask the Mistral model to label a batch of items and return its raw reply.

    The conversation sent is: the instructions, one worked example (a user
    message with four items followed by the assistant's JSON answer), and
    finally the real items to label. The request is made through the
    module-level ``client`` using the module-level ``model`` name.

    Args:
        prompt: Instruction text describing the task and the output format.
        items_text: The "ITEM <idx>:"-formatted posts to classify. When
            omitted, the module-level ``post_text`` is used.

    Returns:
        The message content of the first choice in the response, with
        leading and trailing whitespace removed.
    """
    if items_text is None:
        items_text = post_text

    messages = [
        # Instructions go in the system turn so the conversation alternates
        # user/assistant afterwards instead of opening with two user turns.
        ChatMessage(role="system", content=prompt),
        ChatMessage(role="user", content="ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n"),
        ChatMessage(role="assistant", content='[ {"item_idx": 0, "sentiment": "very positive"}, {"item_idx": 3, "sentiment": "positive"}, {"item_idx": 2, "sentiment": "neutral"}, {"item_idx": 1, "sentiment": "negative"} ]'),
        ChatMessage(role="user", content=items_text),
    ]

    # No streaming
    chat_response = client.chat(
        model=model,
        messages=messages,
    )

    return chat_response.choices[0].message.content.strip()

# Raw reply text from the Mistral model for the items in post_text.
# Not isolating the ranking here because we primarily want to check whether
# the sentiment analysis is comparable; ranking will be done the same way
# across the 5 categories regardless.
mistral_data = chat_mistral(prompt)

Step 3: With the output of our initial results, let's train a new Mistral model.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data for training the sentiment classifier.
# Each entry is (prompt, response, sentiment_label). sentiment_label is the
# class index of the response, using one index per category so that it lines
# up with the 5-way softmax below:
#   0 = very negative, 1 = negative, 2 = neutral, 3 = positive, 4 = very positive
training_data = [
    ("I love you.", "very positive", 4),
    ("I hate you.", "very negative", 0),
    ("I am indifferent to you.", "neutral", 2),
    ("I like soup.", "positive", 3),
]

prompts, responses, labels = zip(*training_data)

# Only the prompts are fed to the network. The responses are the label text
# itself, so using them as input would hand the model the answer; they are
# already represented by `labels`.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(prompts)
prompt_sequences = tokenizer.texts_to_sequences(prompts)
max_prompt_length = max(len(seq) for seq in prompt_sequences)
padded_prompt_sequences = pad_sequences(prompt_sequences, maxlen=max_prompt_length)
labels = np.array(labels)

# NOTE: this rebinds the module-level name `model`, which chat_mistral() reads
# as the Mistral model name. Re-run the Step 2 cell before calling
# chat_mistral() again after this cell has executed.
model = Sequential([
    # +1 because Tokenizer word indices start at 1 (0 is the padding value).
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_prompt_length),
    LSTM(128),
    Dense(5, activation='softmax'),  # 5 is based on the 5 possible sentiment classifications
])

# Integer class labels -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# A Sequential model takes a single input array, so train on the padded
# prompts alone.
model.fit(padded_prompt_sequences, labels, epochs=10, batch_size=1)

# Save the trained model
model.save("mistral_model.h5")

Step 4: But we can do better. Let's use a similar structure with our simpler ChatGPT model to create a dataset that can fine-tune the Mistral model.

In [None]:
from flask import Flask, jsonify, request
from flask_cors import CORS
from openai import OpenAI

# NOTE: this rebinds the module-level name `client`, which chat_mistral() uses
# for Mistral requests. Re-run the Step 2 cell before calling chat_mistral()
# again after this cell has executed.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Conversation layout: system instructions, one worked example (user items and
# the assistant's JSON answer), then the real items to process.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "system",
            "content": 'You are a helpful assistant that processes text and returns results in JSON format. Reorder the items you are given in terms of their positivity, with the most positive item first, and include your reasoning. Give me a JSON array in the following format: [ {"item_idx": int, "reason": str} ]',
        },
        {
            "role": "user",
            "content": "ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n",
        },
        {
            "role": "assistant",
            "content": '[ {"item_idx": 0, "reason": "The statement is very positive."}, {"item_idx": 3, "reason": "The statement is somewhat positive."}, {"item_idx": 2, "reason": "The statement is neutral."}, {"item_idx": 1, "reason": "The statement is negative."} ]',
        },
        {
            "role": "user",
            # The final user turn must carry the posts themselves. `prompt`
            # holds the Mistral instructions and contains no items.
            "content": post_text,
        },
    ],
)

# Raw reply text; same situation as with Mistral, the ranking is not needed immediately.
chatgpt_data = response.choices[0].message.content.strip()
