Step 1a: We need to load in 1500 posts from each platform and pull out the text. We then split the data roughly 80/20 (we will use 1200/300) into train and test sets.

In [1]:
# Load in the data
from io import StringIO
import json
import sys
import os
import subprocess

import numpy as np
import pandas as pd
import requests

from openai import OpenAI

In [2]:
# Make sure the per-platform processed-data folders exist (no error if they already do)
directories = [
    f'{platform_name}_data/processed'
    for platform_name in ('facebook', 'reddit', 'twitter')
]
for directory in directories:
    os.makedirs(directory, exist_ok=True)

In [3]:

# Make the directory two levels up importable so the `sample_data` package resolves,
# then run its preprocessing script. We also pull in data_pull as a module.
current_directory = os.getcwd()
path_to_add = os.path.abspath(os.path.join(current_directory, '..', '..'))
if path_to_add not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(path_to_add)
from sample_data import preprocessing, data_pull

# Absolute path of the script, built from the same base directory as the import path
target_file_path = os.path.join(path_to_add, 'sample_data', 'preprocessing.py')
# sys.executable runs the script with the interpreter backing this kernel; a bare
# 'python' is looked up on PATH and may belong to a different environment.
result = subprocess.run([sys.executable, target_file_path], capture_output=True, text=True)

# Check if the script ran successfully
if result.returncode == 0:
    print("Script executed successfully")
else:
    print("Error in script execution")
    # The child's output is captured above, so show stderr or the failure cannot be diagnosed
    print(result.stderr)

2024-05-22 20:30:30 [INFO] Starting preprocessing
2024-05-22 20:30:31 [INFO] Writing facebook_data/processed/normalized_posts_facebook.json
Processing Facebook posts: 100%|██████████| 18368/18368 [00:07<00:00, 2497.64it/s]
2024-05-22 20:30:40 [INFO] Writing reddit_data/processed/normalized_posts_reddit.json
Processing Reddit posts: 100%|██████████| 52235/52235 [01:06<00:00, 785.16it/s]
2024-05-22 20:31:48 [INFO] Writing twitter_data/processed/normalized_posts_twitter.json
Processing Twitter posts: 100%|██████████| 12507/12507 [00:01<00:00, 7269.17it/s]
2024-05-22 20:31:58 [INFO] Finished preprocessing


Script executed successfully


Step 1b: We pull in all our data to get approx. 1500-2000 comments

In [4]:
# Now we run data pull to sample our data.
# The generator's output is read from stdout, so stdout is temporarily swapped for an
# in-memory buffer and the captured text is parsed as JSON.

platforms = ['twitter', 'reddit', 'facebook']
platform_data = []

for platform in platforms:
    old_stdout = sys.stdout
    captured = StringIO()
    sys.stdout = captured
    try:
        # Pull data; whatever the call prints lands in `captured`
        data_pull.random_user_feed_generator(platform, 1500, 1, 'username')
    finally:
        # Always put stdout back, otherwise an exception here would silence
        # every later print in the notebook
        sys.stdout = old_stdout

    data = json.loads(captured.getvalue().rstrip())
    # Keep only the id and text of the items that carry a 'text' entry
    platform_data.append([{'id': item['id'], 'text': item['text']} for item in data['items'] if 'text' in item])

# Unpack in the same order as `platforms`
twitter, reddit, facebook = platform_data

Step 2a: Data has already been labelled but if you would like to relabel, you can run the following cells. Otherwise just pull in whatever platform you want data for

In [None]:
# Export OPENAI_API_KEY before starting the notebook if you wish to train and label the data.
# Reading the key from the environment keeps it out of the saved notebook; when the
# variable is unset this falls back to the same empty key as before.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))

In [None]:
# OpenAI has character limits so we will batch data
def create_batches(data, batch_size):
    """Lazily produce consecutive slices of `data`, each at most `batch_size` long.

    The final slice is shorter when len(data) is not a multiple of `batch_size`.
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[offset:offset + batch_size] for offset in offsets)

Step 2b: Now that the API is set up and works, we can label our data, giving each item a positivity score.

In [None]:
def labelling(dataset, platform):
    """Label `dataset` with the chat model and write the raw replies to disk.

    Items are sent to the model in batches. Each reply (a string that the system
    prompt asks to be a JSON array) is stored as-is, and the list of replies is
    dumped to `<platform>_results.json` in the current working directory.

    Args:
        dataset: sequence of dicts, each with a 'text' entry.
        platform: name used as the prefix of the output file.

    Returns:
        None. A batch whose request fails is reported on stdout and skipped, so
        the output file may cover fewer items than `dataset`.
    """
    # The schema asks for a float score: the score is a 0-1 value to 2 dp, which an
    # "int" field cannot represent.
    system_prompt = (
        'You are a helpful assistant that processes text and returns results in JSON format.'
        'Reorder the items you are given in terms of their positivity, with the most positive item first, '
        'and include a positivity score (0 - 1) to 2 dp. More positive items get a higher score.  '
        'Additionally, give it a sentiment label based off of the sentiment of their text: '
        'very negative, negative, neutral, positive, very positive. '
        'Give me a JSON array in the following format: '
        '[ {"item_idx": int, "score": float, "sentiment":str, "text": str} ]'
    )

    results = []
    for batch in create_batches(dataset, 10):  # Adjust batch size if needed
        # Number the items within the batch so the reply can refer to them by index
        item_content = "".join(
            f"ITEM {idx}:\n{item['text']}\n\n" for idx, item in enumerate(batch)
        )

        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": item_content},
                ],
            )
            results.append(response.choices[0].message.content.strip())
        except Exception as e:
            # Best effort: one failed request should not discard the batches already labelled
            print(f"Failed to process batch: {str(e)}")

    # Write next to wherever the notebook is running
    output_file_path = os.path.join(os.getcwd(), f'{platform}_results.json')
    with open(output_file_path, 'w') as file:
        json.dump(results, file, indent=4)

In [None]:
# This will take a while to run, so only uncomment and run these calls if you wish to relabel the data
# labelling(twitter, 'twitter')
# labelling(reddit, 'reddit')
# labelling(facebook, 'facebook')

In [None]:
# Read back the raw labelling replies stored per platform as `<platform>_results.json`
labelled_by_platform = {}
for platform in ('twitter', 'reddit', 'facebook'):
    with open(f'{platform}_results.json', 'r') as file:
        labelled_by_platform[platform] = json.load(file)

labelled_twitter = labelled_by_platform['twitter']
labelled_reddit = labelled_by_platform['reddit']
labelled_facebook = labelled_by_platform['facebook']

Step 2c: We can now split our data into train and test. We will use a roughly 80/20 split

In [None]:
def train_test_split(dataset, split, seed=None):
    """Flatten the labelled batches and split the items into train and test sets.

    Args:
        dataset: list of strings, each expected to hold a JSON array of objects
            with 'score', 'sentiment' and 'text' entries, optionally wrapped in a
            markdown ```json fence.
        split: fraction of the parsed items to place in the train set (e.g. 0.8).
        seed: optional seed for a reproducible shuffle. When omitted, NumPy's
            global random state is used.

    Returns:
        (train, test): two lists of {'text', 'score', 'sentiment'} dicts. Strings
        that are empty or fail to parse, and items missing a required entry, are
        reported on stdout and left out.
    """
    combined = []
    for index, json_string in enumerate(dataset):
        # Strip markdown code block syntax if present. str.strip removes a *set* of
        # characters from both ends and stops at the array's '[' and ']' brackets.
        clean_json_string = json_string.strip('`json\n ')
        # Check if string is empty after cleaning
        if not clean_json_string or clean_json_string == '[]':
            print(f"Skipping empty or malformed input at index {index}")
            continue

        try:
            items = json.loads(clean_json_string)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON at index {index}: {clean_json_string}")
            print(f"JSON error: {e}")
            continue

        for item in items:
            try:
                combined.append({
                    "text": item['text'],
                    "score": item['score'],
                    "sentiment": item['sentiment']
                })
            except (KeyError, TypeError):
                # One badly-shaped item should not abort the whole split
                print(f"Skipping malformed item at index {index}: {item!r}")

    if seed is None:
        indices = np.arange(len(combined))
        np.random.shuffle(indices)
    else:
        indices = np.random.default_rng(seed).permutation(len(combined))

    # Size the train set from the parsed items, not from the number of batch
    # strings, so `split` is the fraction of items that end up in train.
    sample_size = int(len(combined) * split)

    train = [combined[i] for i in indices[:sample_size]]
    test = [combined[i] for i in indices[sample_size:]]

    return train, test

In [None]:
# Split every platform 80/20, in the order twitter -> reddit -> facebook
(twitter_train, twitter_test), (reddit_train, reddit_test), (facebook_train, facebook_test) = (
    train_test_split(labelled_items, 0.8)
    for labelled_items in (labelled_twitter, labelled_reddit, labelled_facebook)
)

Step 3a: Next we need to split our data back into batches for training and define our prompt again

In [None]:
# Chunk each training split into groups of 150 items so we don't exceed token limits
twitter_data_batches, reddit_data_batches, facebook_data_batches = (
    list(create_batches(train_items, 150))
    for train_items in (twitter_train, reddit_train, facebook_train)
)

Step 3b: We now need to import our BERT model and tokenizer, and define a dataset class that tokenizes our data

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader

# Regular Bert: tokenizer and encoder are loaded from the same checkpoint name
# NOTE(review): `model` is rebound to a SentimentClassifier in a later cell - confirm
# whether this bare BertModel is still needed.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# BERT Large - Performs better but will be slower
# bert_checkpoint = 'bert-large-uncased'


class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its label, tokenized on access.

    Args:
        texts: sequence of strings.
        labels: sequence of numeric labels, index-aligned with `texts`.
        tokenizer: object exposing `encode_plus`, called with the keyword
            arguments shown in `__getitem__`.
        max_len: every sample is padded or truncated to this many tokens.
        label_dtype: torch dtype for the label tensors. When None it is inferred
            once for the whole dataset: torch.float if any label is a Python
            float, otherwise torch.long.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512, label_dtype=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_dtype is None:
            # Casting fractional scores to long truncates them (0.85 -> 0), which
            # wipes out a regression target; only all-integer labels stay long.
            has_float = any(isinstance(label, float) for label in labels)
            label_dtype = torch.float if has_float else torch.long
        self.label_dtype = label_dtype

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and return it with its label.

        Returns:
            dict with the raw 'text', 1-D 'input_ids' and 'attention_mask'
            tensors, and a scalar 'labels' tensor of dtype `self.label_dtype`.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # flatten() drops the leading dim so samples stack into (batch, tokens)
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=self.label_dtype)
        }


In [None]:
# Integer class id for each of the five sentiment labels the labelling prompt asks for
label_dict = {
    "very negative": 0,
    "negative": 1,
    "neutral": 2,
    "positive" : 3,
    "very positive" : 4
}

# NOTE(review): these two lists are not used by any cell visible here - confirm before removing
bert_train = []
bert_test = []

def get_label(text):
    """Return the integer class id for a sentiment label.

    The label is lower-cased and stripped of surrounding whitespace first, so
    variants such as "Very Positive" or "neutral " still resolve.

    Raises:
        KeyError: if the normalized label is not a key of `label_dict`.
    """
    return label_dict[text.strip().lower()]

def prep_data(dataset):
    """Wrap labelled items in a SentimentDataset with the numeric score as target.

    Args:
        dataset: list of dicts with 'text' and 'score' entries.

    Returns:
        SentimentDataset built from the texts, their scores and the
        module-level `tokenizer`.
    """
    # Only the score is used as the target. The sentiment label was previously
    # mapped to a class id and then discarded, which could raise KeyError on an
    # unexpected label without contributing anything.
    prompts = [entry['text'] for entry in dataset]
    scores = [entry['score'] for entry in dataset]

    return SentimentDataset(prompts, scores, tokenizer)

# Build the train and test datasets for every platform
(
    twitter_bert_train, twitter_bert_test,
    reddit_bert_train, reddit_bert_test,
    facebook_bert_train, facebook_bert_test,
) = (
    prep_data(split_items)
    for split_items in (
        twitter_train, twitter_test,
        reddit_train, reddit_test,
        facebook_train, facebook_test,
    )
)


In [None]:
# Training loaders reshuffle the samples every epoch
batch_size = 32
twitter_data_loader, reddit_data_loader, facebook_data_loader = (
    DataLoader(train_set, batch_size=batch_size, shuffle=True)
    for train_set in (twitter_bert_train, reddit_bert_train, facebook_bert_train)
)

class SentimentClassifier(nn.Module):
    """BERT encoder with a single-output regression head.

    The pooled BERT output goes through dropout, a linear layer and a sigmoid,
    and the result is multiplied by `output_scale`.

    Args:
        output_scale: factor applied to the sigmoid output. The default of 100
            keeps the original 0-100 range. It is a plain attribute, not a
            parameter, so state dicts saved earlier still load.
            NOTE(review): the labelling prompt asks for scores in 0-1; confirm
            the training targets are on the same scale as this output.
    """

    def __init__(self, output_scale=100.0):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Adding a dropout layer for some regularization
        self.dropout = nn.Dropout(0.1)
        # A linear layer to output a single continuous value
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.output_scale = output_scale

    def forward(self, input_ids, attention_mask):
        """Return one score per input row, in the range 0 to `output_scale`."""
        # Take BERT's pooled output for each sequence
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        # Applying dropout
        dropped_output = self.dropout(pooled_output)
        linear_output = self.linear(dropped_output)
        # Applying sigmoid and scaling to the 0-output_scale range
        score = self.sigmoid(linear_output) * self.output_scale
        return score

# Use the GPU when one is available, otherwise the CPU
# May need to specify specific GPU if memory already largely in use
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = SentimentClassifier()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

In [18]:
# Train on the Twitter split. A fresh model and optimizer are created here so the
# cell can be re-run and the saved weights depend on nothing but this data.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

for epoch in range(5):
    model.train()
    for batch in twitter_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # SentimentClassifier.forward multiplies its sigmoid by 100 while the labelling
        # prompt asks for scores between 0 and 1, so move the targets onto the model's
        # scale. Labels must be float for regression.
        labels = batch['labels'].to(device, dtype=torch.float) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) drops only the trailing size-1 dim, keeping the batch dim even
        # when the last batch holds a single item
        loss = loss_fn(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'twitter_sentiment_model.pth')

KeyboardInterrupt: 

In [None]:
# Train on the Facebook split. A fresh model and optimizer are created here; reusing
# the ones from the previous training cell would carry that platform's weights and
# optimizer state into the model saved as the Facebook one.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

for epoch in range(5):
    model.train()
    for batch in facebook_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # SentimentClassifier.forward multiplies its sigmoid by 100 while the labelling
        # prompt asks for scores between 0 and 1, so move the targets onto the model's
        # scale. Labels must be float for regression.
        labels = batch['labels'].to(device, dtype=torch.float) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) drops only the trailing size-1 dim, keeping the batch dim even
        # when the last batch holds a single item
        loss = loss_fn(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'facebook_sentiment_model.pth')

In [None]:
# Train on the Reddit split. A fresh model and optimizer are created here; reusing
# the ones from the previous training cells would carry those platforms' weights and
# optimizer state into the model saved as the Reddit one.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

for epoch in range(5):
    model.train()
    for batch in reddit_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # SentimentClassifier.forward multiplies its sigmoid by 100 while the labelling
        # prompt asks for scores between 0 and 1, so move the targets onto the model's
        # scale. Labels must be float for regression.
        labels = batch['labels'].to(device, dtype=torch.float) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) drops only the trailing size-1 dim, keeping the batch dim even
        # when the last batch holds a single item
        loss = loss_fn(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'reddit_sentiment_model.pth')

In [None]:
# Evaluation loaders must NOT shuffle: the predictions are later compared position by
# position with the scores read in order from the *_test lists.
twitter_test_data_loader = DataLoader(twitter_bert_test, batch_size=batch_size, shuffle=False)
reddit_test_data_loader = DataLoader(reddit_bert_test, batch_size=batch_size, shuffle=False)
facebook_test_data_loader = DataLoader(facebook_bert_test, batch_size=batch_size, shuffle=False)

In [None]:
def predict_sentiment(data_loader, model):
    """Run `model` over every batch of `data_loader` and collect its outputs.

    Args:
        data_loader: iterable of batches; each batch maps 'input_ids' and
            'attention_mask' to tensors.
        model: torch module called as model(input_ids, attention_mask).

    Returns:
        Flat list of floats, one per output element, in iteration order.
    """
    model.eval()
    # Feed inputs to the device the model's weights live on (CPU if it has none),
    # so the function does not rely on a global `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            # Model inference
            outputs = model(input_ids, attention_mask)
            # reshape(-1) always yields a 1-D tensor, so a final batch holding a single
            # item still gives a list instead of a bare float that extend() rejects
            predictions.extend(outputs.reshape(-1).tolist())
    return predictions

In [None]:
# Rebuild the architecture, restore the saved Twitter weights, then score the test split
model_twit = SentimentClassifier()
model_twit = model_twit.to(device)
twitter_state = torch.load('twitter_sentiment_model.pth', map_location=device)
model_twit.load_state_dict(twitter_state)
model_twit.eval()
twitter_predictions = predict_sentiment(twitter_test_data_loader, model_twit)

In [None]:
# Rebuild the architecture, restore the saved Facebook weights, then score the test split
model_facebook = SentimentClassifier()
model_facebook = model_facebook.to(device)
facebook_state = torch.load('facebook_sentiment_model.pth', map_location=device)
model_facebook.load_state_dict(facebook_state)
model_facebook.eval()
facebook_predictions = predict_sentiment(facebook_test_data_loader, model_facebook)

In [None]:
# Rebuild the architecture, restore the saved Reddit weights, then score the test split
model_reddit = SentimentClassifier()
model_reddit = model_reddit.to(device)
reddit_state = torch.load('reddit_sentiment_model.pth', map_location=device)
model_reddit.load_state_dict(reddit_state)
model_reddit.eval()
reddit_predictions = predict_sentiment(reddit_test_data_loader, model_reddit)

Step 4: We now can compare rescaled outputs

In [None]:
def rescale_scores(outputs):
    """Min-max normalize `outputs` onto the 0-1 range.

    Args:
        outputs: list of numbers or a tensor.

    Returns:
        List of floats where the smallest input maps to 0.0 and the largest to
        1.0. An empty input gives []. When all values are equal there is no
        spread to rescale, so every entry is 0.0 rather than NaN.
    """
    outputs = torch.as_tensor(outputs, dtype=torch.float32)
    if outputs.numel() == 0:
        # torch.min / torch.max raise on an empty tensor
        return []

    # Find the minimum and maximum values
    min_val = torch.min(outputs)
    max_val = torch.max(outputs)
    value_range = max_val - min_val
    if value_range == 0:
        # Avoid dividing by zero for constant input
        return torch.zeros_like(outputs).tolist()

    # Normalize the outputs to a 0-1 scale
    normalized_outputs = (outputs - min_val) / value_range

    return normalized_outputs.tolist()

# Rescale each platform's predictions, same order as before: twitter, reddit, facebook
scaled_predictions_twit, scaled_predictions_redd, scaled_predictions_face = (
    rescale_scores(raw_predictions)
    for raw_predictions in (twitter_predictions, reddit_predictions, facebook_predictions)
)

# Ground-truth scores, in the order the items appear in each test split
twitter_scores = [entry['score'] for entry in twitter_test]
reddit_scores = [entry['score'] for entry in reddit_test]
facebook_scores = [entry['score'] for entry in facebook_test]

In [None]:
# optional MSE calc
import numpy as np

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared element-wise differences of the two inputs."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.square(residuals))

# Report the MSE per platform: twitter, reddit, facebook
for true_scores, scaled_scores in (
    (twitter_scores, scaled_predictions_twit),
    (reddit_scores, scaled_predictions_redd),
    (facebook_scores, scaled_predictions_face),
):
    print(mean_squared_error(true_scores, scaled_scores))