Step 1a: We need to load 1500 posts from each platform and pull out the text. We then split them roughly 80/20 (we will use 1200/300) into train and test data

In [1]:
# Load in the data
from io import StringIO
import json
import sys
import os
import subprocess

import numpy as np
import pandas as pd
import requests

from openai import OpenAI

In [2]:
# Each platform gets its own folder for the preprocessed output;
# exist_ok=True makes this cell safe to re-run.
directories = ['facebook_data/processed', 'reddit_data/processed', 'twitter_data/processed']
for directory in directories:
    os.makedirs(directory, exist_ok=True)

In [3]:

# Make the directory two levels up importable so that `sample_data` resolves,
# then run its preprocessing script in a subprocess.
current_directory = os.getcwd()
path_to_add = os.path.abspath(os.path.join(current_directory, '..', '..'))
if path_to_add not in sys.path:  # do not grow sys.path every time the cell is re-run
    sys.path.append(path_to_add)
from sample_data import preprocessing, data_pull

target_file_path = os.path.abspath(os.path.join(current_directory, '..', '..', 'sample_data', 'preprocessing.py'))
# sys.executable is the interpreter running this kernel; a bare 'python' could
# resolve to a different environment (or to nothing) on PATH.
result = subprocess.run([sys.executable, target_file_path], capture_output=True, text=True)

# Check if the script ran successfully
if result.returncode == 0:
    print("Script executed successfully")
else:
    print("Error in script execution")
    # The output was captured, so surface stderr to make the failure diagnosable.
    print(result.stderr)

2024-05-29 15:44:36 [INFO] Starting preprocessing
2024-05-29 15:44:38 [INFO] Writing facebook_data/processed/normalized_posts_facebook.json
Processing Facebook posts: 100%|██████████| 18368/18368 [00:07<00:00, 2606.14it/s]
2024-05-29 15:44:46 [INFO] Writing reddit_data/processed/normalized_posts_reddit.json
Processing Reddit posts: 100%|██████████| 52235/52235 [00:59<00:00, 880.68it/s]
2024-05-29 15:45:46 [INFO] Writing twitter_data/processed/normalized_posts_twitter.json
Processing Twitter posts: 100%|██████████| 12507/12507 [00:00<00:00, 20650.40it/s]
2024-05-29 15:45:54 [INFO] Finished preprocessing


Script executed successfully


Step 1b: We pull in all our data to get approx. 1500-2000 comments

In [4]:
# Now we run data pull to sample our data

platforms = ['twitter', 'reddit', 'facebook']
platform_data = []

for platform in platforms:
    # The generator's output is read back from stdout, so temporarily swap
    # sys.stdout for an in-memory buffer.
    captured_output = StringIO()
    old_stdout = sys.stdout
    sys.stdout = captured_output
    try:
        # Pull data and append to list
        data_pull.random_user_feed_generator(platform, 1500, 1, 'username')
    finally:
        # Always restore stdout, otherwise a failure here would silence every
        # later print in the notebook.
        sys.stdout = old_stdout

    data = json.loads(captured_output.getvalue().rstrip())
    # Keep only the id and text of items that actually carry text.
    platform_data.append([{'id': item['id'], 'text': item['text']} for item in data['items'] if 'text' in item])

# Same order as `platforms` above.
twitter, reddit, facebook = platform_data

Step 2a: Data has already been labelled but if you would like to relabel, you can run the following cells. Otherwise just pull in whatever platform you want data for

In [8]:
# Set the OPENAI_API_KEY environment variable if you wish to train and label the data.
# Reading it from the environment keeps the secret out of the saved notebook;
# the empty-string fallback matches the previous placeholder behaviour.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))

In [5]:
# OpenAI has character limits so we will batch data
def create_batches(data, batch_size):
    """Lazily split ``data`` into consecutive slices of at most ``batch_size`` items.

    The last slice is shorter when ``len(data)`` is not a multiple of
    ``batch_size``. ``data`` must support ``len()`` and slicing.
    """
    yield from (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )

Step 2b: Now that the API is set up and works, we can label our data, giving each item a positivity score.

In [6]:
def labelling(dataset, platform):
    """Ask the chat model to score posts for positivity and save the raw replies.

    The posts are sent in batches of 10. Each reply (a string that should hold
    a JSON array) is collected as-is and the list of replies is written to
    ``<platform>_results.json`` in the current working directory.

    Args:
        dataset: sequence of dicts, each with a ``'text'`` key.
        platform: name used as the prefix of the output file.

    A batch whose request raises is reported and skipped; it contributes no
    entry to the output file.
    """
    # The score is requested as a 0-1 value to 2 dp, so the schema must say
    # float: asking for an int contradicts the instruction above it.
    system_prompt = (
        'You are a helpful assistant that processes text and returns results in JSON format. '
        'Reorder the items you are given in terms of their positivity, with the most positive item first, '
        'and include a positivity score (0 - 1) to 2 dp. More positive items get a higher score.  '
        'Additionally, give it a sentiment label based off of the sentiment of their text: '
        'very negative, negative, neutral, positive, very positive. '
        'Give me a JSON array in the following format: '
        '[ {"item_idx": int, "score": float, "sentiment":str, "text": str} ]'
    )

    results = []
    failed_batches = 0
    for batch in create_batches(dataset, 10):  # Adjust batch size if needed
        item_content = "".join(
            f"ITEM {idx}:\n{item['text']}\n\n" for idx, item in enumerate(batch)
        )

        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": item_content},
                ],
            )
            results.append(response.choices[0].message.content.strip())
        except Exception as e:
            # Best effort: one rejected batch should not abort a long labelling run.
            failed_batches += 1
            print(f"Failed to process batch: {str(e)}")

    if failed_batches:
        print(f"{failed_batches} batch(es) failed and were left out of the results")

    # Define the path for the output file
    output_file_path = os.path.join(os.getcwd(), f'{platform}_results.json')

    with open(output_file_path, 'w') as file:
        json.dump(results, file, indent=4)

In [9]:
# This will take a while to run, so only run it if you wish to relabel the data
# labelling(twitter, 'twitter')
# labelling(reddit, 'reddit')
# labelling(facebook, 'facebook')

2024-05-29 15:55:54 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:56:11 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:56:25 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:56:42 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:56:58 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:57:12 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:57:29 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:57:43 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:57:55 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 15:58:09 [INFO] HTTP Reque

Failed to process batch: Error code: 400 - {'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_prompt', 'param': 'prompt', 'code': None}}


2024-05-29 16:39:15 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:22 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:24 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:32 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:38 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:45 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:39:55 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:40:00 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:40:13 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-29 16:40:19 [INFO] HTTP Reque

In [10]:
# Read the saved labelling responses back in, one JSON file per platform.
labelled_by_file = {}
for results_path in ('twitter_results.json', 'reddit_results.json', 'facebook_results.json'):
    with open(results_path, 'r') as file:
        labelled_by_file[results_path] = json.load(file)

labelled_twitter = labelled_by_file['twitter_results.json']
labelled_reddit = labelled_by_file['reddit_results.json']
labelled_facebook = labelled_by_file['facebook_results.json']

Step 2c: We can now split our data into train and test. We will use a roughly 80/20 split

In [11]:
def train_test_split(dataset, split, seed=None):
    """Parse the raw labelling responses and split the items into train/test.

    Args:
        dataset: list of strings, each expected to hold a JSON array of
            objects with ``text``, ``score`` and ``sentiment`` keys (optionally
            wrapped in a markdown code fence).
        split: fraction of the parsed items that goes into the training set.
        seed: optional integer; when given, the shuffle is reproducible.
            When ``None`` the global NumPy random state is used.

    Returns:
        ``(train, test)``: two lists of ``{"text", "score", "sentiment"}`` dicts.

    Responses that are empty or not valid JSON, and items missing a required
    field, are reported and skipped.
    """
    combined = []
    for index, json_string in enumerate(dataset):
        # Strip markdown code block syntax if present. Note this strips a
        # *character set* (backtick, j, s, o, n, newline, space) from both
        # ends, which is safe for payloads that start with '[' and end with ']'.
        clean_json_string = json_string.strip('`json\n ')
        # Check if string is empty after cleaning
        if not clean_json_string or clean_json_string == '[]':
            print(f"Skipping empty or malformed input at index {index}")
            continue

        try:
            items = json.loads(clean_json_string)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON at index {index}: {clean_json_string}")
            print(f"JSON error: {e}")
            continue

        if not isinstance(items, list):
            print(f"Skipping non-array JSON at index {index}")
            continue

        for item in items:
            try:
                combined.append({
                    "text": item['text'],
                    "score": item['score'],
                    "sentiment": item['sentiment']
                })
            except (KeyError, TypeError):
                # One incomplete item should not discard the whole dataset.
                print(f"Skipping item with missing fields at index {index}")

    indices = np.arange(len(combined))
    shuffler = np.random if seed is None else np.random.RandomState(seed)
    shuffler.shuffle(indices)

    # The split must be relative to the number of parsed items, not the number
    # of raw responses (each response holds a whole batch of items).
    sample_size = int(len(combined) * split)

    train = [combined[i] for i in indices[:sample_size]]
    test = [combined[i] for i in indices[sample_size:]]

    return train, test

In [12]:
twitter_train, twitter_test = train_test_split(labelled_twitter, 0.8)
reddit_train, reddit_test = train_test_split(labelled_reddit, 0.8)
facebook_train, facebook_test = train_test_split(labelled_facebook, 0.8)

Error decoding JSON at index 36: [
    {"item_idx": 9, "score": 1.00, "sentiment": "very positive", "text": "@HuckleberryGon 💙あなたにご紹介するキャラは… 【#トゥク】 サリー家の末っ子🌲 まだ幼く、全てに興味津々。 大胆でいたずら好きな一面も。 ""愛情""たっぷりの性格を持つ。 推しキャラに出会うまで 1/4(水)まで毎日チャレンジ✨ https://t.co/6fLGjR1XZn"},
    {"item_idx": 8, "score": 0.90, "sentiment": "positive", "text": "مركز مساج وتدليك في جدة و الرياض  ✅ خدمة منزلية و فندقية ✅ نعمل على مدار الساعة ( 24 ساعة  ) ✅ الدفع عند الوصول  ✅ تويتر تطبيق جيد✅ متاح الان منزلي فندقي بجدة بالرياض نخدم جميع مدن المملكة IH  DD ال"},
    {"item_idx": 4, "score": 0.80, "sentiment": "positive", "text": "RT @nobunaga_s: いつも応援してくださっている大切な皆様へ https://t.co/vl3k9jGz1A"},
    {"item_idx": 5, "score": 0.70, "sentiment": "neutral", "text": "umudumuz bay kemal #BayKemalMemuraZam"},
    {"item_idx": 2, "score": 0.60, "sentiment": "neutral", "text": "@jazzzmxn dónde"},
    {"item_idx": 7, "score": 0.55, "sentiment": "negative", "text": "RT @thickuncutpapi: Nothing like a DL fool swallowing your dick whole 

Step 3a: Next we need to split our data back into batches for training and define our prompt again

In [13]:
# Chunk every training split into groups of 150 so we don't exceed token limits
twitter_data_batches, reddit_data_batches, facebook_data_batches = (
    list(create_batches(train_split, 150))
    for train_split in (twitter_train, reddit_train, facebook_train)
)

Step 3b: We now need to import our BERT model and tokenize our data

In [25]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader

# Regular Bert: tokenizer and bare encoder come from the same checkpoint
tokenizer, model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

# BERT Large - Performs better but will be slower
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# model = BertModel.from_pretrained('bert-large-uncased')


class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: sequence of strings.
        scores: sequence of numeric targets aligned with ``texts``, or ``None``
            for unlabelled (inference) data.
        tokenizer: callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: every item is padded or truncated to this many tokens.

    Raises:
        ValueError: if ``scores`` is given but its length differs from ``texts``.
    """

    def __init__(self, texts, scores, tokenizer, max_len=512):
        if scores is not None and len(scores) != len(texts):
            raise ValueError(
                f"texts and scores must have the same length, got {len(texts)} and {len(scores)}"
            )
        self.texts = texts
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``text``, ``input_ids``, ``attention_mask`` and,
        when scores were supplied, a float ``score`` tensor."""
        text = self.texts[idx]
        score = self.scores[idx] if self.scores is not None else None

        # Calling the tokenizer directly is the supported entry point;
        # `encode_plus` is the legacy spelling of the same single-text call.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            'text': text,
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if score is not None:
            item['score'] = torch.tensor(score, dtype=torch.float)

        return item


In [26]:
# Sentiment labels listed from most negative to most positive; the integer id
# of a label is its position in that ordering.
label_dict = {
    name: rank
    for rank, name in enumerate(
        ("very negative", "negative", "neutral", "positive", "very positive")
    )
}

bert_train, bert_test = [], []

def get_label(text):
    """Return the integer id of a sentiment label; raises KeyError if unknown."""
    label_id = label_dict[text]
    return label_id

def prep_data(dataset, set='train'):
    """Wrap labelled items in a ``SentimentDataset``.

    Args:
        dataset: sequence of dicts with a ``'text'`` key (and a ``'score'``
            key when ``set`` is ``'train'``).
        set: ``'train'`` to include the scores as targets, ``'test'`` to build
            an unlabelled dataset.

    Raises:
        ValueError: if ``set`` is neither ``'train'`` nor ``'test'``.
    """
    prompts = [entry['text'] for entry in dataset]
    if set == 'train':
        scores = [entry['score'] for entry in dataset]
        return SentimentDataset(prompts, scores, tokenizer)
    if set == 'test':
        return SentimentDataset(prompts, None, tokenizer)
    # Previously an unknown value fell through to an UnboundLocalError.
    raise ValueError(f"set must be 'train' or 'test', got {set!r}")

# Build a labelled training dataset and an unlabelled test dataset per platform.
twitter_bert_train, twitter_bert_test = prep_data(twitter_train), prep_data(twitter_test, 'test')
reddit_bert_train, reddit_bert_test = prep_data(reddit_train), prep_data(reddit_test, 'test')
facebook_bert_train, facebook_bert_test = prep_data(facebook_train), prep_data(facebook_test, 'test')


In [27]:
batch_size = 32
# One shuffled training loader per platform, all sharing the same batch size.
twitter_data_loader, reddit_data_loader, facebook_data_loader = (
    DataLoader(train_set, batch_size=batch_size, shuffle=True)
    for train_set in (twitter_bert_train, reddit_bert_train, facebook_bert_train)
)

class SentimentClassifier(nn.Module):
    """BERT encoder followed by a one-unit regression head.

    ``forward`` returns ``sigmoid(linear(dropout(pooled))) * 100``, i.e. one
    value in the 0-100 range per input row.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Light dropout on the pooled representation for regularization
        self.dropout = nn.Dropout(0.1)
        # Projects the pooled representation to a single continuous value
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Score the tokenized inputs; ``attention_mask`` is passed straight to BERT."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        logits = self.linear(self.dropout(pooled))
        # Sigmoid squashes to (0, 1); the factor of 100 rescales to 0-100
        return self.sigmoid(logits) * 100

# device = torch.device("cuda:3")
# May need to specify specific GPU if memory already largely in use
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Regression model on the chosen device, optimized with Adam.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [28]:
# Fine-tune on the Twitter training split and save the weights.
loss_fn = nn.MSELoss()
for epoch in range(5):
    model.train()
    for batch in twitter_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are positivity scores in [0, 1] while the model outputs
        # sigmoid * 100, so put the targets on the model's 0-100 scale.
        scores = batch['score'].to(device) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing dimension: a bare squeeze() would also
        # collapse a final batch of size 1 and mismatch the target shape.
        loss = loss_fn(outputs.squeeze(-1), scores)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'twitter_sentiment_model.pth')

KeyboardInterrupt: 

In [None]:
# Fine-tune on the Facebook training split and save the weights.
# Start from a fresh model and optimizer: reusing `model` would continue
# training whatever the previous training cell left behind, so the saved
# Facebook weights would also have been fitted to another platform's data.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()
for epoch in range(5):
    model.train()
    for batch in facebook_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are positivity scores in [0, 1] while the model outputs
        # sigmoid * 100, so put the targets on the model's 0-100 scale.
        scores = batch['score'].to(device) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing dimension: a bare squeeze() would also
        # collapse a final batch of size 1 and mismatch the target shape.
        loss = loss_fn(outputs.squeeze(-1), scores)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'facebook_sentiment_model.pth')

In [None]:
# Fine-tune on the Reddit training split and save the weights.
# Start from a fresh model and optimizer: reusing `model` would continue
# training whatever the previous training cell left behind, so the saved
# Reddit weights would also have been fitted to another platform's data.
model = SentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()
for epoch in range(5):
    model.train()
    for batch in reddit_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are positivity scores in [0, 1] while the model outputs
        # sigmoid * 100, so put the targets on the model's 0-100 scale.
        scores = batch['score'].to(device) * 100

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing dimension: a bare squeeze() would also
        # collapse a final batch of size 1 and mismatch the target shape.
        loss = loss_fn(outputs.squeeze(-1), scores)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'reddit_sentiment_model.pth')

In [None]:
# Evaluation loaders must NOT shuffle: the predictions are later compared
# position-by-position with the scores read from twitter_test / reddit_test /
# facebook_test, so the loader order has to match the dataset order.
twitter_test_data_loader = DataLoader(twitter_bert_test, batch_size=batch_size, shuffle=False)
reddit_test_data_loader = DataLoader(reddit_bert_test, batch_size=batch_size, shuffle=False)
facebook_test_data_loader = DataLoader(facebook_bert_test, batch_size=batch_size, shuffle=False)

In [None]:
def predict_sentiment(data_loader, model):
    """Run ``model`` over every batch and return the scores as a flat list.

    Args:
        data_loader: iterable of batches, each a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors.
        model: ``nn.Module`` called as ``model(input_ids, attention_mask)``.

    Returns:
        A list of Python floats, one per input row, in loader order.
    """
    model.eval()
    # Send the inputs to wherever the model's weights live, so model and data
    # can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            # Model inference
            outputs = model(input_ids, attention_mask)
            # reshape(-1) always yields a 1-D tensor; squeeze() turned a batch
            # of size 1 into a scalar whose tolist() is a bare float, which
            # list.extend cannot consume.
            predictions.extend(outputs.reshape(-1).tolist())
    return predictions

In [None]:
# Rebuild the architecture, restore the saved Twitter weights and score the test loader.
model_twit = SentimentClassifier().to(device)
model_twit.load_state_dict(
    torch.load('twitter_sentiment_model.pth', map_location=device)
)
# eval() returns the module itself, so it can be passed on directly.
twitter_predictions = predict_sentiment(twitter_test_data_loader, model_twit.eval())

In [None]:
# Rebuild the architecture, restore the saved Facebook weights and score the test loader.
model_facebook = SentimentClassifier().to(device)
model_facebook.load_state_dict(
    torch.load('facebook_sentiment_model.pth', map_location=device)
)
# eval() returns the module itself, so it can be passed on directly.
facebook_predictions = predict_sentiment(facebook_test_data_loader, model_facebook.eval())

In [None]:
# Rebuild the architecture, restore the saved Reddit weights and score the test loader.
model_reddit = SentimentClassifier().to(device)
model_reddit.load_state_dict(
    torch.load('reddit_sentiment_model.pth', map_location=device)
)
# eval() returns the module itself, so it can be passed on directly.
reddit_predictions = predict_sentiment(reddit_test_data_loader, model_reddit.eval())

Step 4: We now can compare rescaled outputs

In [None]:
def rescale_scores(outputs):
    """Min-max normalize a collection of scores to the 0-1 range.

    Args:
        outputs: list of numbers, a tensor, or a single number.

    Returns:
        A list of floats where the smallest input maps to 0.0 and the largest
        to 1.0. An empty input gives ``[]``. When all inputs are equal
        (including a single value) there is no spread to normalize by, so
        every entry is 0.0 rather than NaN.
    """
    values = torch.as_tensor(outputs, dtype=torch.float32).reshape(-1)
    if values.numel() == 0:
        return []

    # Find the minimum and maximum values
    min_val = torch.min(values)
    max_val = torch.max(values)
    span = max_val - min_val
    if span == 0:
        # Dividing by a zero span would produce NaN for every entry.
        return [0.0] * values.numel()

    # Normalize the outputs to a 0-1 scale
    return ((values - min_val) / span).tolist()

# Min-max rescale the predictions of each platform ...
scaled_predictions_twit, scaled_predictions_redd, scaled_predictions_face = map(
    rescale_scores,
    (twitter_predictions, reddit_predictions, facebook_predictions),
)

# ... and collect the reference scores stored on the test items.
twitter_scores = [row['score'] for row in twitter_test]
reddit_scores = [row['score'] for row in reddit_test]
facebook_scores = [row['score'] for row in facebook_test]

In [None]:
# optional MSE calc
import numpy as np

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared element-wise differences of two sequences."""
    residuals = np.array(y_true) - np.array(y_pred)
    return np.mean(residuals ** 2)

# Print one MSE per platform, in the order twitter, reddit, facebook.
for reference, predicted in (
    (twitter_scores, scaled_predictions_twit),
    (reddit_scores, scaled_predictions_redd),
    (facebook_scores, scaled_predictions_face),
):
    print(mean_squared_error(reference, predicted))

Here we can run our ranking server from the above

In [None]:
from fastapi import FastAPI
from ranking_challenge.request import RankingRequest
from ranking_challenge.response import RankingResponse

# FastAPI application exposing the ranker. The description names the model
# actually used by SentimentClassifier (bert-base-uncased), not DistilBERT.
app = FastAPI(
    title="Prosocial Ranking Challenge bert ranker",
    description="Ranks input by sentiment using BERT fine-tuned on GPT-labelled data",
    version="0.1.0",
)

@app.post("/rank")
def rank(ranking_request: RankingRequest) -> RankingResponse:
    """Rank the request's items from most to least positive.

    Loads the saved model for the session's platform, scores every item's
    text, min-max rescales the scores and returns the item ids sorted by
    descending score together with the (rounded) scores.
    """
    # Collect ids and texts in request order.
    ids = []
    prompts = []
    for item in ranking_request.items:
        ids.append(item.id)
        prompts.append(item.text)

    if not prompts:
        # Nothing to score; avoid loading a model and rescaling an empty list.
        return {"ranked_ids": [], "scores": []}

    platform = ranking_request.session.platform
    # load model of platform choice in and set to eval mode
    # NOTE: the weights are re-read from disk on every request.
    model = SentimentClassifier().to(device)
    model.load_state_dict(torch.load(f'{platform}_sentiment_model.pth', map_location=device))
    model.eval()

    # We need to prep data into dataloader for our model to accept it.
    # shuffle must be False: predictions are matched back to `ids` by
    # position, so a shuffled loader would pair scores with the wrong items.
    prepped_data = SentimentDataset(prompts, None, tokenizer)
    data_loader = DataLoader(prepped_data, batch_size=batch_size, shuffle=False)

    # Then we need to predict scores so we can rank
    predictions = predict_sentiment(data_loader, model)
    rescaled = rescale_scores(predictions)
    # Min-max rescaling is undefined when all predictions are equal (e.g. a
    # single item); map a NaN to 0.0 so the response stays valid JSON.
    rounded = [0.0 if np.isnan(value) else round(value, 2) for value in rescaled]

    ranked_results = [
        {"id": item_id, "text": text, "scores": score}
        for item_id, text, score in zip(ids, prompts, rounded)
    ]
    ranked_results.sort(key=lambda x: x["scores"], reverse=True)

    result = {
        "ranked_ids": [content["id"] for content in ranked_results],
        "scores": [content["scores"] for content in ranked_results],
    }

    return result



In [None]:
# Sample ranking request posted to the /rank endpoint by run_test_rank():
# a session block for the "reddit" platform plus three items (one post and
# two comments) whose texts read as negative, positive and neutral.
BASIC_EXAMPLE = {
    "session": {
        "session_id": "719f30a1-03bb-4d41-a654-138da5c43547",
        "user_id": "193a9e01-8849-4e1f-a42a-a859fa7f2ad3",
        "user_name_hash": "6511c5688bbb87798128695a283411a26da532df06e6e931a53416e379ddda0e",
        "platform": "reddit",
        "cohort": "AB",
        "url": "https://reddit.com/r/PRCExample/1f4deg/example_to_insert",
        "current_time": "2024-01-20 18:41:20",
    },
    "items": [
        {
            "id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
            "title": "this is the post title, available only on reddit",
            "text": "this is the worst thing I have ever seen! I cannot imagine a worse post",
            "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
            "embedded_urls": [],
            "type": "post",
            "created_at": "2023-12-06 17:02:11",
            "engagements": {"upvote": 34, "downvote": 27, "comment": 20, "award": 0},
        },
        {
            "id": "s5ad13266-8abk4-5219-kre5-2811022l7e43dv",
            "post_id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
            "parent_id": "",
            "text": "this is amazing! I love everything about this, this is the best.",
            "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
            "embedded_urls": [],
            "type": "comment",
            "created_at": "2023-12-08 11:32:12",
            "engagements": {"upvote": 15, "downvote": 2, "comment": 22, "award": 2},
        },
        {
            "id": "a4c08177-8db2-4507-acc1-1298220be98d",
            "post_id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
            "parent_id": "s5ad13266-8abk4-5219-kre5-2811022l7e43dv",
            "text": "this thing is ok. I am neutral",
            "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
            "embedded_urls": [],
            "type": "comment",
            "created_at": "2023-12-08 11:35:00",
            "engagements": {"upvote": 3, "downvote": 5, "comment": 10, "award": 0},
        },
    ],
}

import pytest
from fastapi.testclient import TestClient



In [None]:
def run_test_rank():
    """Post BASIC_EXAMPLE to /rank and check the response.

    Verifies the HTTP status, that the three items come back ordered
    positive -> neutral -> negative, and that exactly the submitted ids are
    returned. Returns the decoded JSON response.

    The checks used to sit after the ``return`` statement and therefore never
    ran. The former ``new_items`` check was dropped because the ranker only
    returns ``ranked_ids`` and ``scores``, so it could never hold.
    """
    test_client = TestClient(app)
    response = test_client.post("/rank", json=BASIC_EXAMPLE)

    assert response.status_code == 200, f"Expected status code 200, but got {response.status_code}"

    result = response.json()

    assert result["ranked_ids"][0:3] == [
        "s5ad13266-8abk4-5219-kre5-2811022l7e43dv",
        "a4c08177-8db2-4507-acc1-1298220be98d",
        "de83fc78-d648-444e-b20d-853bf05e4f0e",
    ], f"Unexpected ranked_ids order: {result['ranked_ids'][0:3]}"

    submitted_ids = sorted(item["id"] for item in BASIC_EXAMPLE["items"])
    assert sorted(result["ranked_ids"]) == submitted_ids, (
        f"Returned ids do not match the submitted ids: {result['ranked_ids']}"
    )

    return result

run_test_rank()
print("Test passed successfully!")
