In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import random
from datetime import datetime, timedelta
import uuid
import json

In [2]:
"""
    This function simulates the actions of a single user
    :param attack_id: unique identifier of the attack 
        :type attack_id: string
    :param number_posts: number of tweets that should be submitted by the user 
        :type number_posts: int
    :param start: Timestamp of the attack start (parsed from the format "YYYY-MM-DD HH:MM:SS")
        :type start: datetime
    :param end: Timestamp of the attack end (parsed from the format "YYYY-MM-DD HH:MM:SS")
        :type end: datetime
    :param user_id: unique identifier assigned to the artificially created user 
        :type user_id: string
    :param texts: data frame whose 'tweets' column holds the texts that can be sent by the user
        :type texts: pandas.DataFrame
    :param pre_posting: whether the accounts should tweet in the hours before the actual campaign 
        :type pre_posting: bool
    :param post_posting: whether the accounts should tweet in the hours after the actual campaign
        :type post_posting: bool
"""
def tweet_account(attack_id, number_posts, start, end, user_id, texts, pre_posting, post_posting):
    """Simulate the tweets of one artificial account for a single attack.

    :param attack_id: identifier of the attack; stored in the 'user' column of the result
        :type attack_id: string
    :param number_posts: number of tweets submitted between ``start`` and ``end``
        :type number_posts: int
    :param start: start of the attack
        :type start: datetime
    :param end: end of the attack
        :type end: datetime
    :param user_id: identifier of the account; stored as ``'user' + user_id`` in the 'userID' column
        :type user_id: string
    :param texts: data frame whose 'tweets' column holds the candidate texts; it is not modified
        :type texts: pandas.DataFrame
    :param pre_posting: if true, 1-14 extra tweets are placed in a window of 1-45 minutes before ``start``
        :type pre_posting: bool
    :param post_posting: if true, 1-42 extra tweets are placed in a window of 1-218 minutes after ``end``
        :type post_posting: bool
    :return: one row per tweet with the columns 'timestamp', 'tweets', 'tweetID', 'userID' and 'user',
        ordered by timestamp
        :rtype: pandas.DataFrame
    :raises ValueError: if tweets have to be generated but ``texts`` contains no rows
    """
    t_pre = []
    t_post = []
    pre_tweets = 0
    post_tweets = 0

    # If the account should tweet before the actual attack, the time frame and the number of tweets must be defined
    if pre_posting:
        st_pre = start - timedelta(minutes=random.randrange(1, 46))
        pre_tweets = random.randrange(1, 15)
        d_pre = start - st_pre
        t_pre = [random.random() * d_pre + st_pre for _ in range(pre_tweets)]

    # If the account should tweet after the actual attack, the time frame and the number of tweets must be defined
    if post_posting:
        post_tweets = random.randrange(1, 43)
        et_post = end + timedelta(minutes=random.randrange(1, 219))
        d_post = et_post - end
        t_post = [random.random() * d_post + end for _ in range(post_tweets)]

    # Artificially create uniformly distributed timestamps during the attack time
    duration = end - start
    t_attack = [random.random() * duration + start for _ in range(number_posts)]

    # Merge and sort the timestamps of the pre-attack, the attack and the post-attack phase.
    # Microseconds have to be deleted since textClust is not able to handle them
    timestamps = [ts.replace(microsecond=0) for ts in sorted(t_pre + t_attack + t_post)]

    # Count the number of Tweets that have to be submitted
    number_tweets = number_posts + pre_tweets + post_tweets
    # Create Tweet-IDs from the first ten digits of a random UUID.
    # NOTE: uuid4 is not driven by random.seed(), so the IDs differ between otherwise identical runs
    tweet_ids = [int(str(uuid.uuid4().int)[0:10]) for _ in range(number_tweets)]
    # Create User-IDs
    user_ids = ['user' + user_id] * number_tweets
    # Add the attackID
    attack_ids = [attack_id] * number_tweets

    # Randomly sample Tweets from the GPT-generated Tweets for this attack.
    # A plain list is used so that the caller's data frame (and its index) stays untouched
    candidates = texts['tweets'].tolist()
    if number_tweets > 0 and not candidates:
        raise ValueError("texts must contain at least one tweet to sample from")
    # randrange excludes its upper bound, so len(candidates) makes every text selectable
    sampled_texts = [candidates[random.randrange(len(candidates))] for _ in range(number_tweets)]

    # Store and return a data frame
    return pd.DataFrame({'timestamp': timestamps, 'tweets': sampled_texts, 'tweetID': tweet_ids,
                         'userID': user_ids, 'user': attack_ids})

In [3]:
"""
    This function builds the final campaigns by concatenating user actions
    :param configuration: Path to a configuration file in JSON format
    :type configuration: string
"""
def build_stereotype(configuration="config_stereotypes.json"):
    """Build an artificial campaign by concatenating the simulated tweets of all accounts.

    The configuration file is read for the keys 'stereotype', 'path_to_tweets', 'artificial_tweets',
    'start', 'seed', 'number_attacks', 'duration', 'number_posts', 'pause', 'number_accounts',
    'attack_id', 'pre_posting' and 'post_posting'.

    :param configuration: path to a configuration file in JSON format
    :type configuration: string
    :return: all generated tweets with the columns 'timestamp', 'tweets', 'tweetID', 'userID' and 'user',
        sorted by timestamp and re-indexed from 0
    :rtype: pandas.DataFrame
    :raises IndexError: if more than one account is requested but the artificial tweets contain fewer
        distinct users than accounts
    """
    # Load the config file (the context manager closes the handle again)
    with open(configuration) as config_file:
        config = json.load(config_file)
    stereotype = config['stereotype']
    number_accounts = config['number_accounts']

    # Load the artificial tweets
    all_gpt_tweets = pd.read_csv(config['path_to_tweets'] + config['artificial_tweets'],
                                 delimiter=";", encoding='utf-8')
    # The set of users does not change between attacks, so it is determined only once
    unique_users = all_gpt_tweets['user'].unique()

    # Format the start date
    date_format = '%Y-%m-%d %H:%M:%S'
    start = datetime.strptime(config['start'], date_format)

    # Set a seed to make the stereotype generation repeatable.
    # NOTE: the user and tweet IDs come from uuid4, which this seed does not control
    random.seed(config['seed'])

    # Look up how many single attacks are executed in that specific stereotype
    number_attacks = config['number_attacks'][stereotype]

    if number_attacks > 0 and number_accounts > 1 and number_accounts > len(unique_users):
        raise IndexError("number_accounts (%d) exceeds the number of distinct users (%d) in the artificial tweets"
                         % (number_accounts, len(unique_users)))

    # Collect the frames of all accounts and concatenate them once at the end
    account_frames = []

    # For each of the attacks, concatenate the actions of the users
    for attack_number in range(1, number_attacks + 1):
        attack = 'attack' + str(attack_number)
        # Look up relevant variables
        duration = config['duration'][stereotype][attack]
        number_posts = config['number_posts'][stereotype][attack]
        pause = config['pause'][stereotype][attack]

        # Randomly generate duration, the number of tweets and the pauses between attacks according to info in
        # config file (randrange excludes the upper bound of each range)
        d = random.randrange(duration[0], duration[1])
        nbp = random.randrange(number_posts[0], number_posts[1])
        p = random.randrange(pause[0], pause[1])
        # Calculate when the attack will end
        end = start + timedelta(minutes=d)

        # For each user, simulate their tweeting behaviour
        for i in range(number_accounts):
            if number_accounts > 1:
                # Every account only sends the texts of "its" user
                filtered_users = all_gpt_tweets[all_gpt_tweets["user"] == unique_users[i]]
            else:
                filtered_users = all_gpt_tweets
            # Generate a user id; the full attack number is used so that e.g. attack 10 is not labelled as 1
            user = str(attack_number) + '_' + str(uuid.uuid4().int)[0:10]
            # Simulate the tweeting behaviour of single account
            account_frames.append(
                tweet_account(attack_id=config['attack_id'], number_posts=nbp, start=start, end=end, user_id=user,
                              texts=filtered_users, pre_posting=config['pre_posting'],
                              post_posting=config['post_posting']))

        # If a pause should be made between attack, add the time the accounts should wait
        start = end + timedelta(minutes=p)

    # Without any attack or account there is nothing to concatenate; return an empty frame with the usual columns
    if not account_frames:
        return pd.DataFrame(data={'timestamp': [], 'tweets': [], 'tweetID': [], 'userID': [], 'user': []})

    # Concatenate the actions of the accounts and sort the dataset according to the timestamp
    selected_tweets = pd.concat(account_frames).sort_values(by='timestamp')
    # Reset the row index since the concatenation function repeats indices
    return selected_tweets.reset_index(drop=True)

In [4]:
"""
    This function embeds artificial campaigns in a stream by replacing the original campaign
    :param replace: Either a specific tweet pattern can be replaced or all tweets of a specific user
    :type replace:  string, ["user", "tweet"]
    :param pattern: Tweets or usernames to be replaced
    :type pattern: list of strings
    :param configuration: Path to a configuration file in JSON format
    :type configuration: string
"""
def replace_tweets(replace, pattern, configuration="config_stereotypes.json"):
    """Replace an original campaign in a recorded stream of tweets by an artificial one.

    The tweets of the day given by the first ten characters of config['start'] are loaded from
    config['path_to_day'], the matching tweets are removed, the artificial stereotype is added and
    the result is written to config['path_to_save'] + "tweets_" + config['attack_id'] + '.json'.

    :param replace: 'user' removes all tweets whose 'userID' is listed in ``pattern``; any other value
        removes all tweets whose text matches one of the patterns
    :type replace: string, ["user", "tweet"]
    :param pattern: user IDs or tweet patterns to be removed; a single string is treated as a list with
        one entry. Tweet patterns are interpreted as regular expressions
    :type pattern: string or list of strings
    :param configuration: path to a configuration file in JSON format
    :type configuration: string
    :return: the merged tweets sorted by timestamp, with the timestamps converted to strings
    :rtype: pandas.DataFrame
    """
    # Load the configuration file
    with open(configuration) as config_file:
        config = json.load(config_file)
    # Extract the day from the start date
    day = config['start'][0:10]

    # Load the real, original Tweets recorded by the dashboard
    filename = config['path_to_day'] + day + '.json'
    with open(filename) as tweet_file:
        original_tweets = pd.DataFrame(json.load(tweet_file))

    # A bare string would otherwise be split into single characters by join()/isin()
    patterns = [pattern] if isinstance(pattern, str) else list(pattern)

    # Identify the attack that should be replaced either by specific users or by specific tweets
    if replace == 'user':
        is_campaign = original_tweets['userID'].isin(patterns)
    elif patterns:
        # na=False keeps rows without text instead of failing on the negation below
        is_campaign = original_tweets['tweets'].str.contains('|'.join(patterns), na=False)
    else:
        # An empty pattern list would build the regex '' that matches (and removes) every tweet
        is_campaign = pd.Series(False, index=original_tweets.index)

    # Work on a copy so that the column assignment below does not write into a slice of the original
    relevant_tweets = original_tweets[~is_campaign].copy()
    relevant_tweets['timestamp'] = [datetime.strptime(string, '%Y-%m-%d %H:%M:%S')
                                    for string in relevant_tweets['timestamp']]

    # Build the artificial stereotype
    artificial_tweets = build_stereotype(configuration)

    # Merge the real Tweets with the artificial stereotype and sort the timestamps
    tweets = pd.concat([relevant_tweets, artificial_tweets], ignore_index=True)
    tweets = tweets.sort_values(by='timestamp')
    tweets['timestamp'] = [str(t) for t in tweets['timestamp']]

    # Store the complete file
    tweets.to_json(config['path_to_save'] + "tweets_" + config['attack_id'] + '.json')

    return tweets

In [5]:
# Remove the original tweets whose text matches the given pattern and embed the artificial campaign
# built from the default configuration file ("config_stereotypes.json"); the returned data frame is
# the last expression of the cell and is therefore shown as its output.
replace_tweets(replace="tweet", pattern=["@POTUS: Keep fighting for a bold #BuildBackBetterAct for climate"])

Unnamed: 0,timestamp,tweets,tweetID,userID,user
0,2021-10-08 00:00:01,"Zix - Digital Agency &amp, Multipurpose WordPr...",1446264081061662724,Kevin David,1193439773748290048.0
1,2021-10-08 00:00:01,"Another oil spill, we should save marine life..",1446264078486310923,요 소나,1407945419682810112.0
2,2021-10-08 00:00:01,A new Pentagon plan calls for incorporating th...,1446264079526531076,KTXS News,239861560.0
3,2021-10-08 00:00:01,"A carbon-neutral economy isn't cheap, but it i...",1446264079534886927,Reimagine Agriculture,1192640803375589888.0
4,2021-10-08 00:00:01,@TheTRCP @JohnBoozman @SenCortezMasto Must be ...,1446264079782383616,Terry Byte,1429516827303020032.0
...,...,...,...,...,...
68237,2021-10-08 23:59:46,‘Greenies’ at @BCAcomau are the latest calling...,1446626402506182658,Jane Garcia,28065723.0
68238,2021-10-08 23:59:48,May I suggest you trace my tweets and think ab...,1446626411192725513,Peter Principas,377729297.0
68239,2021-10-08 23:59:49,@laurenboebert Google “climate refugees” if yo...,1446626418885120001,Ernest,1325974610429400064.0
68240,2021-10-08 23:59:50,@Liz80944633 @Nerdy_Addict Usually things are ...,1446626422865399811,izzy,89006036.0
