Step 1a: We need to load in 1500 posts from each platform and pull out the text. We then need to split roughly 80/20 (We will use 1200/300) for test and train data

In [1]:
# Load in the data
from io import StringIO
import json
import sys
import os
import subprocess
import pandas as pd
import numpy as np
import requests

# We need to navigate to preprocessing and run it. We also pull in data_pull as a function
current_directory = os.getcwd()
path_to_add = os.path.abspath(os.path.join(current_directory, '..', '..'))
sys.path.append(path_to_add)
from sample_data import preprocessing, data_pull

target_file_path = os.path.abspath(os.path.join(current_directory, '..', '..', 'sample_data', 'preprocessing.py'))
result = subprocess.run(['python', '../../sample_data/preprocessing.py'], capture_output=True, text=True)

# Check if the script ran successfully
if result.returncode == 0:
    print("Script executed successfully")
else:
    print("Error in script execution")
    


Script executed successfully


Step 1b: We pull in all our data to get approx. 1500-2000 comments

In [2]:
# Now we run data pull to sample our data
old_stdout = sys.stdout
result = StringIO()
sys.stdout = result

# Twitter
data_pull.data_puller('Twitter', 1500, 1, 'username')
 
sys.stdout = old_stdout

result_string = result.getvalue()
result = result_string.rstrip()

data = json.loads(result)
twitter_data = [{'id': item['id'], 'text': item['text']} for item in data['items']]

# Reddit. We sample a much higher amount as there are far more posts than comments in the sample dataset
old_stdout = sys.stdout
result = StringIO()
sys.stdout = result
data_pull.data_puller('Reddit', 4000, 1, 'username')
 
sys.stdout = old_stdout

result_string = result.getvalue()
result = result_string.rstrip()

data = json.loads(result)

reddit_data = [{'id': item['id'], 'text': item['text']} for item in data['items'] if 'text' in item]
print(len(reddit_data))

# Facebook. We sample fewer than 1500 since each data pull pulls posts and ALL comments on post
old_stdout = sys.stdout
result = StringIO()
sys.stdout = result

data_pull.data_puller('Facebook', 300, 1, 'username')
 
sys.stdout = old_stdout

result_string = result.getvalue()
result = result_string.rstrip()

data = json.loads(result)
facebook_data = [{'id': item['id'], 'text': item['text']} for item in data['items'] if 'text' in item]

1612


In [3]:
print(len(reddit_data))
print(len(facebook_data))
print(len(twitter_data))

1612
1940
1500


Step 2a: Now that we have our sampled data, we now need to API into GPT to allow it to label the sentiment of our values

In [4]:
from flask import Flask, jsonify, request
from flask_cors import CORS
from openai import OpenAI

# Should be "os.environ.get("OPENAI_API_KEY")" but that is not working for some reason
client = OpenAI(api_key='')

response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": 'You are a helpful assistant that processes text and returns results in JSON format. Reorder the items you are given in terms of their positivity, with the most positive item first, and include your reasoning. Give me a JSON array in the following format: [ {"item_idx": int, "reason": str} ]',
            },
            {
                "role": "user",
                "content": "ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n",
            }
        ],
    )

chatgpt_data = response.choices[0].message.content.strip() #same situation here, don't need the ranking immediately 

print(chatgpt_data)


[
    {"item_idx": 3, "reason": "Positive sentiment: 'like' towards soup."},
    {"item_idx": 0, "reason": "Positive sentiment: 'love' towards the recipient."},
    {"item_idx": 2, "reason": "Neutral sentiment: 'indifferent' towards the recipient."},
    {"item_idx": 1, "reason": "Negative sentiment: 'hate' towards the recipient."}
]


In [5]:
# Lets now see if we can run it on our larger posts

response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": 'You are a helpful assistant that processes text and returns results in JSON format. Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). Give me a JSON array in the following format: [ {"item_idx": int, "score": int} ]',
            },
            {
                "role": "user",
                "content": "ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n",
            }
        ],
    )

chatgpt_data = response.choices[0].message.content.strip()

print(chatgpt_data)

[
    {"item_idx": 3, "score": 80},
    {"item_idx": 0, "score": 75},
    {"item_idx": 2, "score": 50},
    {"item_idx": 1, "score": 20}
]


Step 2b: Now that the API is set up and works, we can label our data, giving each item a positivity score.

In [6]:
# Processing Twitter Data
def create_batches(data, batch_size):
    """Yield successive n-sized batches from data."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Example usage:
twitter_data_batches = list(create_batches(twitter_data, 10))  # adjust batch_size based on average token size of entries

print(len(twitter_data_batches))
twitter_results = []
for batch in twitter_data_batches:
    item_content = ""
    for idx, item in enumerate(batch):
        item_content += f"ITEM {idx}:\n{item['text']}\n\n"
    
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": int, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
        twitter_results.append(chatgpt_data)
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")

print(twitter_results)

150
['[\n    {"item_idx": 1, "score": 80, "sentiment": "very positive"},\n    {"item_idx": 0, "score": 75, "sentiment": "very positive"},\n    {"item_idx": 9, "score": 70, "sentiment": "positive"},\n    {"item_idx": 5, "score": 50, "sentiment": "neutral"},\n    {"item_idx": 7, "score": 50, "sentiment": "neutral"},\n    {"item_idx": 8, "score": 45, "sentiment": "neutral"},\n    {"item_idx": 6, "score": 40, "sentiment": "neutral"},\n    {"item_idx": 2, "score": 25, "sentiment": "negative"},\n    {"item_idx": 4, "score": 20, "sentiment": "negative"},\n    {"item_idx": 3, "score": 15, "sentiment": "negative"}\n]', '[\n    {"item_idx": 5, "score": 90, "sentiment": "very positive"},\n    {"item_idx": 6, "score": 80, "sentiment": "positive"},\n    {"item_idx": 3, "score": 70, "sentiment": "positive"},\n    {"item_idx": 2, "score": 60, "sentiment": "positive"},\n    {"item_idx": 7, "score": 50, "sentiment": "neutral"},\n    {"item_idx": 8, "score": 40, "sentiment": "neutral"},\n    {"item_idx"

In [7]:
# Processing Reddit Data

# Example usage:
reddit_data_batches = list(create_batches(reddit_data, 10))  # adjust batch_size based on average token size of entries

print(reddit_data_batches)
reddit_results = []
for batch in reddit_data_batches:
    item_content = ""
    for idx, item in enumerate(batch):
        item_content += f"ITEM {idx}:\n{item['text']}\n\n"
    
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": int, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
        reddit_results.append(chatgpt_data)
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")

print(reddit_results)

['[\n    {"item_idx": 4, "score": 90, "sentiment": "very positive"},\n    {"item_idx": 9, "score": 70, "sentiment": "positive"},\n    {"item_idx": 5, "score": 50, "sentiment": "neutral"},\n    {"item_idx": 8, "score": 40, "sentiment": "neutral"},\n    {"item_idx": 0, "score": 20, "sentiment": "negative"},\n    {"item_idx": 3, "score": 20, "sentiment": "negative"},\n    {"item_idx": 7, "score": 10, "sentiment": "negative"},\n    {"item_idx": 6, "score": 10, "sentiment": "negative"},\n    {"item_idx": 1, "score": 0, "sentiment": "neutral"}\n]', '[\n    {"item_idx": 2, "score": 90, "sentiment": "very positive"},\n    {"item_idx": 8, "score": 60, "sentiment": "positive"},\n    {"item_idx": 3, "score": 45, "sentiment": "neutral"},\n    {"item_idx": 1, "score": 40, "sentiment": "neutral"},\n    {"item_idx": 0, "score": 35, "sentiment": "neutral"},\n    {"item_idx": 4, "score": 30, "sentiment": "neutral"},\n    {"item_idx": 5, "score": 25, "sentiment": "neutral"},\n    {"item_idx": 7, "score"

In [8]:
# Processing Facebook Data

# Example usage:
facebook_data_batches = list(create_batches(facebook_data, 10))  # adjust batch_size based on average token size of entries

facebook_results = []
for batch in facebook_data_batches:
    item_content = ""
    for idx, item in enumerate(batch):
        item_content += f"ITEM {idx}:\n{item['text']}\n\n"
    
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": int, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
        facebook_results.append(chatgpt_data)
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")

print(facebook_results)

['[\n    {"item_idx": 1, "score": 90, "sentiment": "very positive"},\n    {"item_idx": 0, "score": 80, "sentiment": "positive"},\n    {"item_idx": 7, "score": 70, "sentiment": "positive"},\n    {"item_idx": 5, "score": 60, "sentiment": "negative"},\n    {"item_idx": 6, "score": 40, "sentiment": "negative"},\n    {"item_idx": 8, "score": 30, "sentiment": "negative"},\n    {"item_idx": 9, "score": 20, "sentiment": "negative"},\n    {"item_idx": 2, "score": 10, "sentiment": "negative"},\n    {"item_idx": 3, "score": 5, "sentiment": "negative"},\n    {"item_idx": 4, "score": 2, "sentiment": "very negative"}\n]', '[\n    {"item_idx": 4, "score": 90, "sentiment": "very positive"},\n    {"item_idx": 9, "score": 80, "sentiment": "positive"},\n    {"item_idx": 5, "score": 60, "sentiment": "neutral"},\n    {"item_idx": 3, "score": 50, "sentiment": "neutral"},\n    {"item_idx": 7, "score": 40, "sentiment": "neutral"},\n    {"item_idx": 0, "score": 10, "sentiment": "negative"},\n    {"item_idx": 1

In [9]:
print()
print(len(twitter_results))
print(len(reddit_results))
print(len(facebook_results))


150
162
194


Step 2c: We can now split our data into train and test. We will use a roughly 80/20 split 

In [10]:
# We associate our index with our data and then pull the positivity score for each dataset
combined_twitter = []


for index, json_string in enumerate(twitter_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        # Attempt to parse the JSON string
        items = json.loads(clean_json_string)

        # Process each item in the parsed JSON array
        for item in items:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            data_text = twitter_data[item_idx]['text']  # Fetch the text using item_idx

            # Append the combined data to the list
            combined_twitter.append({
                "text": data_text,
                "score": score,
                "sentiment": sentiment
            })
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")



# Twitter split
indices = np.arange(len(combined_twitter))
np.random.shuffle(indices)

percentage = 0.8
sample_size = int(len(combined_twitter) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
twitter_train = [combined_twitter[i] for i in train_indices]
twitter_test = [combined_twitter[i] for i in test_indices]

print(len(twitter_train))
print(len(twitter_test))

1196
300


In [11]:
# We associate our index with our data and then pull the positivity score for each dataset

combined_reddit = []

for index, json_string in enumerate(reddit_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        # Attempt to parse the JSON string
        items = json.loads(clean_json_string)

        # Process each item in the parsed JSON array
        for item in items:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            data_text = reddit_data[item_idx]['text']  # Fetch the text using item_idx

            # Append the combined data to the list
            combined_reddit.append({
                "text": data_text,
                "score": score,
                "sentiment": sentiment
            })
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")


# Reddit split
indices = np.arange(len(combined_reddit))
np.random.shuffle(indices)

percentage = 0.8
sample_size = int(len(combined_reddit) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
reddit_train = [combined_reddit[i] for i in train_indices]
reddit_test = [combined_reddit[i] for i in test_indices]

print(len(reddit_train))
print(len(reddit_test))

1236
310


In [12]:
# We associate our index with our data and then pull the positivity score for each dataset
combined_facebook = []


for index, json_string in enumerate(facebook_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        # Attempt to parse the JSON string
        items = json.loads(clean_json_string)

        # Process each item in the parsed JSON array
        for item in items:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            data_text = facebook_data[item_idx]['text']  # Fetch the text using item_idx

            # Append the combined data to the list
            combined_facebook.append({
                "text": data_text,
                "score": score,
                "sentiment": sentiment
            })
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")


# Facebook split
indices = np.arange(len(combined_facebook))
np.random.shuffle(indices)

percentage = 0.8
sample_size = int(len(combined_facebook) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
facebook_train = [combined_facebook[i] for i in train_indices]
facebook_test = [combined_facebook[i] for i in test_indices]

print(len(facebook_train))
print(len(facebook_test))

1544
386


Step 3a: Next we need to split our data back into batches for training and define our prompt again

In [13]:
from mistralai.client import MistralClient
# To train the model we need to take our scores from the GPT model and add it to our training data

# Replace with your own key
model = "open-mistral-7b"
api_key = ''
client = MistralClient(api_key=api_key)

# We will use the exact same prompt
prompt = '''You are a helpful assistant that processes text and returns results in JSON format. 
Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). 
More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. 
Give me a JSON array in the following format: [ {"item_idx": int, "score": int, "sentiment":str} ]'''

# Batch our data so we don't exceed token limits
twitter_data_batches = list(create_batches(twitter_train, 150))
reddit_data_batches = list(create_batches(reddit_train, 150))
facebook_data_batches = list(create_batches(facebook_train, 150))



Step 3b: We now need to import our mistral model and tokenize our data

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the Mistral-7B model and tokenizer

access_token = ''
model_name = "mistralai/Mistral-7B-v0.1"
model = AutoModelForSequenceClassification.from_pretrained(model_name, use_auth_token=access_token,num_labels=5)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=access_token)

class SentimentDataset():
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Encoding the text using the tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [02:08<00:00, 64.37s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Twitter data
prompts = [i['text'] for i in twitter_train]
score = [i['score'] for i in twitter_train]
sentiment= [i['sentiment'] for i in twitter_train]
print(twitter_train)
print(prompts)
print(score)
print(sentiment)

twitter_mistral_dataset = SentimentDataset(prompts, sentiment, tokenizer)

# Reddit data
prompts = [i['text'] for i in reddit_train]
score = [i['score'] for i in reddit_train]
sentiment= [i['sentiment'] for i in reddit_train]
print(prompts)
print(score)
print(sentiment)

reddit_mistral_dataset = SentimentDataset(prompts, sentiment, tokenizer)

# Facebook data
prompts = [i['text'] for i in facebook_train]
score = [i['score'] for i in facebook_train]
sentiment= [i['sentiment'] for i in facebook_train]
print(prompts)
print(score)
print(sentiment)

facebook_mistral_dataset = SentimentDataset(prompts, sentiment, tokenizer)

[{'text': '230101 - wonmi update 💕\n\nGak bisa masukin foto :(', 'score': 85, 'sentiment': 'very positive'}, {'text': "@_moth__man_ Please note that the Freedom Convoy doesn't support a blackface-wearing hypocrite. As you apparently do.", 'score': 40, 'sentiment': 'positive'}, {'text': '馳浩がXはさすがに誰も予想つかんかったやろな', 'score': 45, 'sentiment': 'positive'}, {'text': '馳浩がXはさすがに誰も予想つかんかったやろな', 'score': 85, 'sentiment': 'very positive'}, {'text': '@HankMeyerNAPP 👍🏻👍🏻👍🏻', 'score': 70, 'sentiment': 'positive'}, {'text': '230101 - wonmi update 💕\n\nGak bisa masukin foto :(', 'score': 60, 'sentiment': 'positive'}, {'text': "@_moth__man_ Please note that the Freedom Convoy doesn't support a blackface-wearing hypocrite. As you apparently do.", 'score': 70, 'sentiment': 'positive'}, {'text': '230101 - wonmi update 💕\n\nGak bisa masukin foto :(', 'score': 15, 'sentiment': 'negative'}, {'text': 'belangrijk&lt; ander&lt;\n\nTopkapı+\n\nBeşiktaş+\n\nKağıthane+\n\nEsenler+\n\nCevizlibağ+\n\nKaraköy +\n\nSult

In [27]:
from transformers import TrainingArguments, Trainer

import torch
# now we train
from torch.profiler import profile, record_function, ProfilerActivity

# with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
#     with record_function("model_inference"):
#         # Your model inference here
#         predictions = model(input_ids)

# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

# device = torch.device('mps')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=twitter_mistral_dataset,  # Assuming you have a valid train dataset
    # eval_dataset=test_dataset,  # Optionally, provide a test dataset
)

# Train the model
trainer.train()


RuntimeError: MPS backend out of memory (MPS allocated: 9.04 GB, other allocations: 784.00 KB, max allowed: 9.07 GB). Tried to allocate 64.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
from transformers import TrainingArguments, Trainer