In [140]:
# Load in the data
from io import StringIO
import json
import sys
import os
import subprocess

import numpy as np
import pandas as pd
import requests


We will sample 2000 posts from each, and then print to demonstrate the JSON structure of each 

In [142]:
# Create directories if needed
directories = [
    'facebook_data/processed',
    'reddit_data/processed',
    'twitter_data/processed'
]
for directory in directories:
    os.makedirs(directory, exist_ok=True)
    
current_directory = os.getcwd()
path_to_add = os.path.abspath(os.path.join(current_directory, '..'))
sys.path.append(path_to_add)

from sample_data import preprocessing, data_pull

target_file_path = os.path.abspath(os.path.join(current_directory, '..', '..', 'sample_data', 'preprocessing.py'))
result = subprocess.run(['python', '../../sample_data/preprocessing.py'], capture_output=True, text=True)
platforms = ['twitter', 'reddit', 'facebook']
platform_data = []

for platform in platforms:
    
    old_stdout = sys.stdout
    result = StringIO()
    sys.stdout = result

    # Pull data and append to list
    data_pull.random_user_feed_generator(platform, 2000, 1, 'username')
    sys.stdout = old_stdout
    result_string = result.getvalue()
    result = result_string.rstrip()
    data = json.loads(result)
    platform_data.append(data)

twitter =  platform_data[0]
reddit =  platform_data[1]
facebook =  platform_data[2]

In [19]:
print(twitter)
print(reddit)
print(facebook)

{'session': {'user_id': '1b5aa9237c15190aa1453b3f9f7d1068e1ac8d711b49ed8e36650db2dc23f9b2', 'user_name_hash': '16f78a7d6317f102bbd95fc9a4f3ff2e3249287690b8bdad6b7810f82b34ace3', 'cohort': 'XX', 'platform': 'twitter', 'current_time': '2024-05-23T13:25:06.938927'}, 'survey': None, 'items': [{'id': '99b71f1e35b51a6349b0adb823468033207084d50896e2a305d3ab1e8ec45e82', 'post_id': None, 'parent_id': 'c3da7c236c5fb8b30725a7f094b63c2267d32c8e8c7304865dd1c71836528886', 'title': None, 'text': 'RT @lovelydaylight_: en tan poco tiempo te convertiste en tanto', 'author_name_hash': 'de982ed63fef4b093fdf1c8bbf2c59d6ba079293204e28d774ad924ecb7aa63b', 'type': 'post', 'embedded_urls': [], 'created_at': '2023-01-01T23:59:40', 'engagements': {'retweet': 0, 'like': 0, 'comment': 0, 'share': 0}}, {'id': '910428677d3c49f85e11d2fce3d5e2204f42e82e3ead084b3b15fde5c6c50010', 'post_id': None, 'parent_id': 'eb0687d4a1d763e63ad80053cf928e201565b1c9c019ad2789e3d426619e928e', 'title': None, 'text': 'RT @nookker_ss: “วิ

With our 600 samples across each platform, we can define a function to sample ~50 posts. We will need to define an object to store our user and the posts that they have seen to ensure that they do not see the same posts over and over again, while ensuring that posts seen by one user do not affect whether the post is seen by another. We will define a class called 'user_history' to achieve this 

In [153]:
class user_history():
    def __init__(self, feed,  history):
        '''This class takes as inputs:
        user - This is the individual for whom the feed is being generated. It is pulled from the feed
        feed - These are the ~600 posts that our user is sampling from. It is pulled from the feed
        history - This is a dictionary containing the existing history for all users, else if empty is instantiates an empty dictionary to hold history.'''
        
        self.user = feed['session']['user_id']
        self.feed = feed['items']
        self.history = history 
        self.ranked_feed = []
        
    def update_history(self, sample):
        print(sample)
        for item in sample:
            if item['id'] in self.history[self.user]:
                self.history[self.user][item['id']] += 1
            else:
                self.history[self.user][item['id']] = 1
                
    def sample_feed(self, size=50):
        first_use = True
        if self.user in self.history.keys():
            first_use = False
            
        if first_use == True:
            self.history[self.user] = {}
            sample = np.random.choice(self.feed, size=size,replace=False)
            self.update_history(sample)
        else:
            
            seen_ids = {item for item in self.history[self.user]}
            seen_posts = [post for post in self.feed if post['id'] in seen_ids]
            unseen_posts = [post for post in self.feed if post['id'] not in seen_ids]

            repeat_posts = min(np.random.randint(5), len(seen_posts)) # This will be the number of repeat posts our users sees
            # if len(repeat_posts) > len(unseen_posts):
            new_posts = size - repeat_posts # This will be the number of new posts our users sees 
            if len(unseen_posts) < new_posts:
                seen_sample = np.random.choice(list(seen_posts), size = repeat_posts, replace= False)
                unseen_sample = np.random.choice(seen_posts, size = new_posts, replace= False)
            else:
                seen_sample = np.random.choice(list(seen_posts), size = repeat_posts, replace= False)
                unseen_sample = np.random.choice(unseen_posts, size = new_posts, replace= False)
            # shuffles feed 
            sample = np.concatenate([seen_sample, unseen_sample])
            np.random.shuffle(sample)
            self.update_history(sample)

        return sample   

    def run_ranker(self, sample_count = 600):
        for i in range(sample_count):
            self.sample_feed(size=50)
            
    def create_feed(self, history):
        

    # def write_to_json(self, platform, history):
    #     file_path = f'{platform}_feed.json'
    #     with open(file_path, 'w') as file:
    #         json.dump(history, file, indent=4)

We can now test out our function by sampling posts once and then sampling again to test out repeat posts

In [139]:
history = {}
user_feed = user_history(twitter, history)

# Sample the feed
sampled_posts = user_feed.sample_feed(50)
print("History after first sampling:", history)

user_feed1 = user_history(twitter, history)
sampled_posts1 = user_feed1.sample_feed(50)
print("History after second sampling:", history)

50
History after first sampling: {'1b5aa9237c15190aa1453b3f9f7d1068e1ac8d711b49ed8e36650db2dc23f9b2': {'a8486c1d417b6913cf64d405856cfe0eb05f3f432ba6d78817ba6fd6a4840968': 1, '0624b6e02ab54febc78e2586489fd4a05ec99bb32c935bfeb6b76574b558aca8': 1, '3394b07474bdc971833a34cc7d57730a2eedf36e78802a0fe98737be8774a4d5': 1, 'c18a16b8f31979a907e2f95de95d31e5d4260d1f515610560a296258768a6841': 1, '80f9a7c1a5d6e7a473c1f018003df8c07385b92a88985ae9789f8efdd4caa29b': 1, 'e832bafd6febb3bf3da30b9f2995388898e8c4a70d974bb693a4d483249900f7': 1, 'ad03ec6ace60847fb594ee9147e9497c0bbf506975d7878878b92ce8e29de778': 1, 'aa39d4a2474ec2a907b3da39e3690a0b460667fbe78051b14e3608ce2d8cf7d6': 1, '092e37cc95c1e4862e57aa08dcceea3d7b326da3246fea66e1b0dd12f149aae6': 1, 'ea4049b5ccbe90b444354d57c765de9bed24a21af26e9008b69898ba49d77707': 1, '547e9cfca8d86ea3ddb6c6ebb06813a6d16c7f834201605bd6e22e7f94bd468b': 1, '45041fee0a3823fef28062999d5cc9f08682cd3c44260ff50121872815088f9f': 1, '03240ef466f8ec9422b78181d08164d14c0752692ae9

In [154]:
twitter_hist = {}
reddit_hist = {}
facebook_hist = {}

histories  = [twitter_hist, reddit_hist, facebook_hist]
platforms = [twitter, reddit, facebook]
platform_names = ['twitter', 'reddit', 'facebook']
for i in range(len(platforms)):
    platform = platforms[i]
    history = histories[i]
    platform_name = platform_names[i]
    user_feed = user_history(platform, history)
    user_feed.run_ranker()
    user_feed.write_to_json(platform_name, history)

[{'id': '847edee19cf07408c79b00d080d4ff6988e6528aa235c9c23563f034479e3b8b', 'post_id': None, 'parent_id': 'ddece7c7b9579b7d164f29c1008b8c021fc0287cbfb161ac662a94f381cdd2a5', 'title': None, 'text': 'RT @almlki__1: @nwrlhd1 سلمتي', 'author_name_hash': '5661a56dda77fba413870828463abfd30d16ef6c5debecf09f33167279c687de', 'type': 'post', 'embedded_urls': [], 'created_at': '2023-01-01T10:49:23', 'engagements': {'retweet': 4, 'like': 8, 'comment': 2, 'share': 0}}
 {'id': 'bac8b9babc11bb64d274881df28b135615bdc003d1176a6b066d4489bcea1bc8', 'post_id': None, 'parent_id': '263e813b60f59867904c422e54b9c6d49649917060793943de362c6726d17d37', 'title': None, 'text': 'RT @ThalapathyFilms: Biggest Pongal Winner #Varisu Not only in Tamil Telugu North also💥🥳This pongal is Ours💥😍 - #DilRaju https://t.co/YamPP…', 'author_name_hash': 'bf2e8b510e9e1fb00d596634037113931114347ee83be84ab4545e6a7ed53abe', 'type': 'post', 'embedded_urls': [], 'created_at': '2023-01-01T15:46:46', 'engagements': {'retweet': 1, 'like':

TypeError: unhashable type: 'dict'