Step 1a: We need to load in 1500 posts from each platform and pull out the text. We then need to split the data roughly 80/20 (we will use 1200/300) into train and test sets

In [None]:
# Load in the data
from io import StringIO
import json
import sys
import os
import subprocess
import pandas as pd
import numpy as np
import requests

# Make the directory two levels above the notebook importable so that the
# sample_data package (preprocessing, data_pull) can be imported.
current_directory = os.getcwd()
path_to_add = os.path.abspath(os.path.join(current_directory, '..', '..'))
if path_to_add not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(path_to_add)
from sample_data import preprocessing, data_pull

# Run the preprocessing script with the same interpreter as this kernel
# (a bare 'python' may resolve to a different environment) and address it by
# absolute path so the call does not depend on how the relative path resolves.
target_file_path = os.path.join(path_to_add, 'sample_data', 'preprocessing.py')
result = subprocess.run([sys.executable, target_file_path], capture_output=True, text=True)

# Check if the script ran successfully; surface stderr when it did not so the
# failure can actually be diagnosed.
if result.returncode == 0:
    print("Script executed successfully")
else:
    print("Error in script execution")
    print(result.stderr)
    

Step 1b: We pull in all our data to get approx. 1500-2000 comments

In [None]:
# Now we run data pull to sample our data
def pull_items(platform, sample_size):
    """Sample items for one platform and return their id/text pairs.

    data_pull.data_puller writes its JSON payload to stdout, so stdout is
    redirected into a buffer for the duration of the call and always restored
    afterwards, even if the pull raises.

    Args:
        platform: Platform name passed through to data_puller.
        sample_size: Number of items requested from data_puller.

    Returns:
        A list of {'id', 'text'} dicts, one per returned item that has a
        'text' field (items without text are skipped).
    """
    captured = StringIO()
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
        # The trailing (1, 'username') arguments are passed exactly as before;
        # their meaning is defined by data_puller — TODO confirm against it.
        data_pull.data_puller(platform, sample_size, 1, 'username')
    finally:
        sys.stdout = previous_stdout

    payload = json.loads(captured.getvalue().rstrip())
    return [
        {'id': item['id'], 'text': item['text']}
        for item in payload['items']
        if 'text' in item
    ]


# Twitter
twitter_data = pull_items('Twitter', 1500)

# Reddit. We sample a much higher amount as there are far more posts than comments in the sample dataset
reddit_data = pull_items('Reddit', 4000)
print(len(reddit_data))

# Facebook. We sample fewer than 1500 since each data pull pulls posts and ALL comments on post
facebook_data = pull_items('Facebook', 300)

Step 2a: Now that we have our sampled data, we need to call the GPT API so that it can label the sentiment of each item

In [None]:
!pip install openai
from openai import OpenAI

# Should be "os.environ.get("OPENAI_API_KEY")" but that is not working for some reason
client = OpenAI(api_key='')

response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": 'You are a helpful assistant that processes text and returns results in JSON format. Reorder the items you are given in terms of their positivity, with the most positive item first, and include your reasoning. Give me a JSON array in the following format: [ {"item_idx": int, "reason": str} ]',
            },
            {
                "role": "user",
                "content": "ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n",
            }
        ],
    )

chatgpt_data = response.choices[0].message.content.strip() #same situation here, don't need the ranking immediately

print(chatgpt_data)


In [None]:
# Second smoke test: the same four toy items, but this time the prompt asks
# for a 0 - 100 positivity score per item instead of free-text reasoning.
scoring_messages = [
    {
        "role": "system",
        "content": 'You are a helpful assistant that processes text and returns results in JSON format. Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 100). Give me a JSON array in the following format: [ {"item_idx": int, "score": int} ]',
    },
    {
        "role": "user",
        "content": "ITEM 0:\nI love you.\n\nITEM 1:\nI hate you.\n\nITEM 2:\nI am indifferent to you.\nITEM 3:\nI like soup\n\n",
    },
]

response = client.chat.completions.create(model="gpt-3.5-turbo", messages=scoring_messages)

# Keep the raw reply text with surrounding whitespace removed.
chatgpt_data = response.choices[0].message.content.strip()

print(chatgpt_data)

Step 2b: Now that the API is set up and works, we can label our data, giving each item a positivity score.

In [None]:
# Processing Twitter Data
def create_batches(data, batch_size):
    """Lazily split ``data`` into consecutive slices of at most ``batch_size`` items.

    ``data`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(data)`` is not a multiple of ``batch_size``.
    """
    total = len(data)
    yield from (data[start:start + batch_size] for start in range(0, total, batch_size))


In [None]:

# Label the Twitter sample: send the items to the chat model in batches of 10
# and store one raw JSON reply string per batch.
twitter_data_batches = list(create_batches(twitter_data, 10))  # adjust batch_size based on average token size of entries

print(len(twitter_data_batches))
twitter_results = []
for batch in twitter_data_batches:
    # Items are numbered within the batch (ITEM 0..9); the item_idx values in
    # the reply therefore refer to positions inside this batch.
    item_content = "".join(
        f"ITEM {idx}:\n{item['text']}\n\n" for idx, item in enumerate(batch)
    )

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    # The score is requested as a 0 - 1 value to 2 dp, so the
                    # format spec declares it as float rather than int.
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 1) to 2 dp. More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": float, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")
        # Record an empty JSON array for the failed batch so that position k
        # in twitter_results always corresponds to batch k.
        chatgpt_data = "[]"
    twitter_results.append(chatgpt_data)

current_dir = os.getcwd()

# Define the path for the output file
output_file_path = os.path.join(current_dir, 'twitter_results.json')

with open(output_file_path, 'w') as file:
    json.dump(twitter_results, file, indent=4)

print(twitter_results)

In [None]:
# Processing Reddit Data
# Send the Reddit items to the chat model in batches of 10 and store one raw
# JSON reply string per batch.
reddit_data_batches = list(create_batches(reddit_data, 10))  # adjust batch_size based on average token size of entries

# Print only the number of batches; printing the batches themselves dumps the
# whole dataset into the cell output.
print(len(reddit_data_batches))
reddit_results = []
for batch in reddit_data_batches:
    # Items are numbered within the batch (ITEM 0..9); the item_idx values in
    # the reply therefore refer to positions inside this batch.
    item_content = "".join(
        f"ITEM {idx}:\n{item['text']}\n\n" for idx, item in enumerate(batch)
    )

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    # The score is requested as a 0 - 1 value to 2 dp, so the
                    # format spec declares it as float rather than int.
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 1) to 2 dp. More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": float, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")
        # Record an empty JSON array for the failed batch so that position k
        # in reddit_results always corresponds to batch k.
        chatgpt_data = "[]"
    reddit_results.append(chatgpt_data)

current_dir = os.getcwd()

# Define the path for the output file
output_file_path = os.path.join(current_dir, 'reddit_results.json')

with open(output_file_path, 'w') as file:
    json.dump(reddit_results, file, indent=4)

print(reddit_results)

In [None]:
# Processing Facebook Data
# Send the Facebook items to the chat model in batches of 10 and store one raw
# JSON reply string per batch.
facebook_data_batches = list(create_batches(facebook_data, 10))  # adjust batch_size based on average token size of entries

facebook_results = []
for batch in facebook_data_batches:
    # Items are numbered within the batch (ITEM 0..9); the item_idx values in
    # the reply therefore refer to positions inside this batch.
    item_content = "".join(
        f"ITEM {idx}:\n{item['text']}\n\n" for idx, item in enumerate(batch)
    )

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    # The score is requested as a 0 - 1 value to 2 dp, so the
                    # format spec declares it as float rather than int.
                    "content": 'You are a helpful assistant that processes text and returns results in JSON format.Reorder the items you are given in terms of their positivity, with the most positive item first, and include a positivity score (0 - 1) to 2 dp. More positive items get a higher score.  Additionally, give it a sentiment label based off of the sentiment of their text: very negative, negative, neutral, positive, very positive. Give me a JSON array in the following format: [ {"item_idx": int, "score": float, "sentiment":str} ]',
                },
                {
                    "role": "user",
                    "content": item_content,
                }
            ],
        )
        chatgpt_data = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Failed to process batch: {str(e)}")
        # Record an empty JSON array for the failed batch so that position k
        # in facebook_results always corresponds to batch k.
        chatgpt_data = "[]"
    facebook_results.append(chatgpt_data)

current_dir = os.getcwd()

output_file_path = os.path.join(current_dir, 'facebook_results.json')

with open(output_file_path, 'w') as file:
    json.dump(facebook_results, file, indent=4)

print(facebook_results)

In [None]:
# Read the three saved result files back from the working directory
# (presumably so the later steps can run without repeating the API calls).
with open('twitter_results.json') as twitter_file:
    twitter_results = json.load(twitter_file)

with open('reddit_results.json') as reddit_file:
    reddit_results = json.load(reddit_file)

with open('facebook_results.json') as facebook_file:
    facebook_results = json.load(facebook_file)

# Echo each platform's raw replies, in the same order they were loaded.
for platform_results in (twitter_results, reddit_results, facebook_results):
    print(platform_results)

Step 2c: We can now split our data into train and test. We will use a roughly 80/20 split

In [None]:
# Join every labelled item back to its source text, then split 80/20.
#
# Each reply numbers its items 0..9 *within its batch*, so the position in
# twitter_data is (batch number * batch size + item_idx), not item_idx alone.
# This assumes twitter_results[k] is the reply for the k-th batch of 10; a
# batch that failed during labelling and was not recorded would shift every
# later batch — TODO confirm no batches were dropped.
LABEL_BATCH_SIZE = 10  # must equal the batch size used when labelling
combined_twitter = []

for index, json_string in enumerate(twitter_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        items = json.loads(clean_json_string)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")
        continue

    batch_start = index * LABEL_BATCH_SIZE
    for item in items:
        try:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            if not 0 <= item_idx < LABEL_BATCH_SIZE:
                raise IndexError(f"item_idx {item_idx} is outside the batch")
            data_text = twitter_data[batch_start + item_idx]['text']
        except (KeyError, IndexError, TypeError) as e:
            # One malformed entry should not discard the rest of the batch.
            print(f"Skipping malformed item in batch {index}: {e!r}")
            continue

        combined_twitter.append({
            "text": data_text,
            "score": score,
            "sentiment": sentiment
        })


# Twitter split. A seeded generator keeps the train/test split reproducible
# across kernel restarts.
rng = np.random.default_rng(42)
indices = rng.permutation(len(combined_twitter))

percentage = 0.8
sample_size = int(len(combined_twitter) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
twitter_train = [combined_twitter[i] for i in train_indices]
twitter_test = [combined_twitter[i] for i in test_indices]

print(len(twitter_train))
print(len(twitter_test))

In [None]:
# Join every labelled item back to its source text, then split 80/20.
#
# Each reply numbers its items 0..9 *within its batch*, so the position in
# reddit_data is (batch number * batch size + item_idx), not item_idx alone.
# This assumes reddit_results[k] is the reply for the k-th batch of 10; a
# batch that failed during labelling and was not recorded would shift every
# later batch — TODO confirm no batches were dropped.
LABEL_BATCH_SIZE = 10  # must equal the batch size used when labelling
combined_reddit = []

for index, json_string in enumerate(reddit_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        items = json.loads(clean_json_string)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")
        continue

    batch_start = index * LABEL_BATCH_SIZE
    for item in items:
        try:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            if not 0 <= item_idx < LABEL_BATCH_SIZE:
                raise IndexError(f"item_idx {item_idx} is outside the batch")
            data_text = reddit_data[batch_start + item_idx]['text']
        except (KeyError, IndexError, TypeError) as e:
            # One malformed entry should not discard the rest of the batch.
            print(f"Skipping malformed item in batch {index}: {e!r}")
            continue

        combined_reddit.append({
            "text": data_text,
            "score": score,
            "sentiment": sentiment
        })


# Reddit split. A seeded generator keeps the train/test split reproducible
# across kernel restarts.
rng = np.random.default_rng(42)
indices = rng.permutation(len(combined_reddit))

percentage = 0.8
sample_size = int(len(combined_reddit) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
reddit_train = [combined_reddit[i] for i in train_indices]
reddit_test = [combined_reddit[i] for i in test_indices]

print(len(reddit_train))
print(len(reddit_test))

In [None]:
# Join every labelled item back to its source text, then split 80/20.
#
# Each reply numbers its items 0..9 *within its batch*, so the position in
# facebook_data is (batch number * batch size + item_idx), not item_idx alone.
# This assumes facebook_results[k] is the reply for the k-th batch of 10; a
# batch that failed during labelling and was not recorded would shift every
# later batch — TODO confirm no batches were dropped.
LABEL_BATCH_SIZE = 10  # must equal the batch size used when labelling
combined_facebook = []

for index, json_string in enumerate(facebook_results):
    # Strip markdown code block syntax if present
    clean_json_string = json_string.strip('`json\n ')

    # Check if the string is empty after cleaning
    if not clean_json_string or clean_json_string == '[]':
        print(f"Skipping empty or malformed input at index {index}")
        continue

    try:
        items = json.loads(clean_json_string)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON at index {index}: {clean_json_string}")
        print(f"JSON error: {e}")
        continue

    batch_start = index * LABEL_BATCH_SIZE
    for item in items:
        try:
            item_idx = item['item_idx']
            score = item['score']
            sentiment = item['sentiment']
            if not 0 <= item_idx < LABEL_BATCH_SIZE:
                raise IndexError(f"item_idx {item_idx} is outside the batch")
            data_text = facebook_data[batch_start + item_idx]['text']
        except (KeyError, IndexError, TypeError) as e:
            # One malformed entry should not discard the rest of the batch.
            print(f"Skipping malformed item in batch {index}: {e!r}")
            continue

        combined_facebook.append({
            "text": data_text,
            "score": score,
            "sentiment": sentiment
        })

# Facebook split. A seeded generator keeps the train/test split reproducible
# across kernel restarts.
rng = np.random.default_rng(42)
indices = rng.permutation(len(combined_facebook))

percentage = 0.8
sample_size = int(len(combined_facebook) * (percentage))

train_indices = indices[:sample_size]
test_indices = indices[sample_size:]
facebook_train = [combined_facebook[i] for i in train_indices]
facebook_test = [combined_facebook[i] for i in test_indices]

print(len(facebook_train))
print(len(facebook_test))

Step 3a: Next we need to split our data back into batches for training and define our prompt again

In [None]:
# Chunk each training split into lists of at most 150 records.
twitter_data_batches = [chunk for chunk in create_batches(twitter_train, 150)]
reddit_data_batches = [chunk for chunk in create_batches(reddit_train, 150)]
facebook_data_batches = [chunk for chunk in create_batches(facebook_train, 150)]

Step 3b: We now need to import our BERT model and tokenize our data

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader

# Load the tokenizer. No standalone BertModel is created here:
# SentimentClassifier loads its own copy of the encoder, and `model` is
# assigned from SentimentClassifier before it is first used, so a second
# set of weights would only cost load time and memory.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its label for a DataLoader.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of targets, parallel to ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BertTokenizer).
        max_len: Length every encoded sequence is padded/truncated to.
        label_dtype: Torch dtype for the label tensors. When omitted it is
            inferred once from ``labels``: ``torch.float`` if any label is a
            Python float (regression scores such as 0.85 would otherwise be
            truncated to 0 by an integer dtype), else ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512, label_dtype=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_dtype is None:
            # Decide per dataset, not per item, so every label tensor in a
            # batch shares one dtype when the DataLoader stacks them.
            has_float = any(isinstance(label, float) for label in labels)
            label_dtype = torch.float if has_float else torch.long
        self.label_dtype = label_dtype

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its label tensor."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Encoding the text using the tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading dimension; flatten drops it
            # so the DataLoader can stack the items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=self.label_dtype)
        }


In [None]:
# Ordinal ids for the five sentiment labels requested from the labelling model.
label_dict = {
    "very negative": 0,
    "negative": 1,
    "neutral": 2,
    "positive" : 3,
    "very positive" : 4
}

def get_label(text):
    """Return the ordinal id (0-4) for a sentiment label string."""
    return label_dict[text]


# SentimentClassifier outputs sigmoid(...) * 100, while the labelled scores
# are on a 0 - 1 scale, so targets are rescaled to match the model output.
SCORE_SCALE = 100


def build_sentiment_dataset(records):
    """Turn a list of {'text', 'score', ...} records into a SentimentDataset.

    The regression target is the positivity score rescaled to 0 - 100 and
    rounded to a whole number (the scores were requested to 2 dp). The
    sentiment label strings are not used as a training target here.

    If ``records`` is already a SentimentDataset it is returned unchanged, so
    re-running this cell after the test splits have been converted is safe.
    """
    if isinstance(records, SentimentDataset):
        return records
    texts = [record['text'] for record in records]
    targets = [round(float(record['score']) * SCORE_SCALE) for record in records]
    return SentimentDataset(texts, targets, tokenizer)


# Twitter data
twitter_dataset = build_sentiment_dataset(twitter_train)
twitter_test = build_sentiment_dataset(twitter_test)

# Reddit data
reddit_dataset = build_sentiment_dataset(reddit_train)
reddit_test = build_sentiment_dataset(reddit_test)

# Facebook data (the test set is stored under facebook_test so that it no
# longer overwrites reddit_test)
facebook_dataset = build_sentiment_dataset(facebook_train)
facebook_test = build_sentiment_dataset(facebook_test)

In [None]:
# twitter_test was rebound to a SentimentDataset in the previous cell; that
# class defines no __repr__ in this notebook, so this prints the default
# object representation rather than the data.
print(twitter_test)

In [None]:
# One shuffled training DataLoader per platform, all with the same settings.
batch_size = 32
loader_options = dict(batch_size=batch_size, shuffle=True)
twitter_data_loader = DataLoader(twitter_dataset, **loader_options)
reddit_data_loader = DataLoader(reddit_dataset, **loader_options)
facebook_data_loader = DataLoader(facebook_dataset, **loader_options)

class SentimentClassifier(nn.Module):
    """BERT encoder with a one-unit head that emits a score between 0 and 100."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Light dropout on the pooled representation for regularization.
        self.dropout = nn.Dropout(0.1)
        # Projects the pooled representation to one continuous value.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one score per input row, with a trailing dimension of 1."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.linear(self.dropout(encoded.pooler_output))
        # The sigmoid bounds the value to (0, 1); the factor stretches it to 0-100.
        return self.sigmoid(logits) * 100

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier on that device and optimise all of its parameters with Adam.
model = SentimentClassifier()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)


In [18]:
# Train on the Twitter training loader for two epochs with an MSE objective,
# then save the weights.
criterion = nn.MSELoss()  # built once instead of on every step

for epoch in range(2):
    model.train()
    for batch in twitter_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device, dtype=torch.float)  # Ensure labels are float for regression

        # Forward pass. Only the trailing size-1 dimension is dropped: a bare
        # squeeze() would also collapse a final batch of one example to a
        # scalar, which no longer matches the (1,)-shaped labels.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'twitter_sentiment_model.pth')

KeyboardInterrupt: 

In [None]:

# Train on the Facebook training loader for two epochs with an MSE objective,
# then save the weights.
# NOTE(review): `model` and `optimizer` are not re-created in this cell, so if
# an earlier training cell has run, this continues from those weights rather
# than from a fresh model — confirm that is intended.
criterion = nn.MSELoss()  # built once instead of on every step

for epoch in range(2):
    model.train()
    for batch in facebook_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device, dtype=torch.float)  # Ensure labels are float for regression

        # Forward pass. Only the trailing size-1 dimension is dropped: a bare
        # squeeze() would also collapse a final batch of one example to a
        # scalar, which no longer matches the (1,)-shaped labels.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'facebook_sentiment_model.pth')

In [None]:

# Train on the Reddit training loader for two epochs with an MSE objective,
# then save the weights.
# NOTE(review): `model` and `optimizer` are not re-created in this cell, so if
# an earlier training cell has run, this continues from those weights rather
# than from a fresh model — confirm that is intended.
criterion = nn.MSELoss()  # built once instead of on every step

for epoch in range(2):
    model.train()
    for batch in reddit_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device, dtype=torch.float)  # Ensure labels are float for regression

        # Forward pass. Only the trailing size-1 dimension is dropped: a bare
        # squeeze() would also collapse a final batch of one example to a
        # scalar, which no longer matches the (1,)-shaped labels.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'reddit_sentiment_model.pth')

In [None]:
# Rebind the loader names to the held-out test sets. Evaluation loaders are
# not shuffled, so the i-th prediction lines up with the i-th test example.
twitter_data_loader = DataLoader(twitter_test, batch_size=batch_size, shuffle=False)
reddit_data_loader = DataLoader(reddit_test, batch_size=batch_size, shuffle=False)
facebook_data_loader = DataLoader(facebook_test, batch_size=batch_size, shuffle=False)


In [None]:
def predict_sentiment(data_loader, model, batch_size=32):
    """Run ``model`` over ``data_loader`` and return one score per example.

    Args:
        data_loader: A DataLoader yielding dict batches with 'input_ids' and
            'attention_mask'. A map-style dataset is also accepted and is
            wrapped in an unshuffled DataLoader.
        model: Module called as ``model(input_ids, attention_mask)``.
        batch_size: Batch size used only when ``data_loader`` has to be
            wrapped; ignored for an existing DataLoader.

    Returns:
        A flat list of floats in the iteration order of the loader.
    """
    if not isinstance(data_loader, DataLoader):
        data_loader = DataLoader(data_loader, batch_size=batch_size, shuffle=False)

    # Run on whichever device the model's weights live on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # Set the model to evaluation mode
    predictions = []
    with torch.no_grad():
        # Iterate the loader that was passed in, not a global one.
        for batch in data_loader:
            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)

            # Model inference
            outputs = model(input_ids, attention_mask)
            # reshape(-1) always yields a 1-D tensor, so tolist() returns a
            # list even when the batch holds a single example.
            predictions.extend(outputs.reshape(-1).tolist())
    return predictions

In [None]:
# Rebuild the classifier, load the saved Twitter weights and score the
# Twitter test set. The test DataLoader is passed (not the raw dataset) so
# the model receives batched tensors.
model = SentimentClassifier().to(device)
model.load_state_dict(torch.load('twitter_sentiment_model.pth', map_location=device))
model.eval()
predictions = predict_sentiment(twitter_data_loader, model)

In [None]:
# Rebuild the classifier, load the saved Facebook weights and score the
# Facebook test set. The test DataLoader is passed (not the raw dataset) so
# the model receives batched tensors. Note that this overwrites `predictions`
# from any earlier prediction cell.
model = SentimentClassifier().to(device)
model.load_state_dict(torch.load('facebook_sentiment_model.pth', map_location=device))
model.eval()
predictions = predict_sentiment(facebook_data_loader, model)

In [None]:
# Inspect whichever predictions list was produced most recently: the full
# list, its length, and its largest and smallest values.
# max()/min() raise ValueError if the list is empty.
print(predictions)
print(len(predictions))
print(max(predictions))
print(min(predictions))

In [None]:
def rescale_scores(outputs):
    """Squash raw values with a sigmoid and return them as a list on a 0-100 scale."""
    # The sigmoid bounds every value between 0 and 1; the factor of 100 stretches that range.
    squashed = torch.Tensor(outputs).sigmoid()
    return squashed.mul(100).tolist()

# Example usage
# NOTE(review): SentimentClassifier.forward in this notebook already applies a
# sigmoid and multiplies by 100, so `predictions` are presumably already on a
# 0-100 scale and this call would squash them a second time — confirm intent.
scaled_predictions = rescale_scores(predictions)
print(scaled_predictions)

