# Generate Tweet

In [10]:
import numpy as np
import pandas as pd
import string
import random
import time
import csv

In [11]:
# Sampling window for generated tweet timestamps (both bounds at midnight).
start, end = pd.to_datetime('2021-01-01'), pd.to_datetime('2021-01-31')

def random_dates(start, end):
    """Draw one random timestamp, at whole-second resolution, from [start, end).

    Parameters
    ----------
    start, end : pandas.Timestamp (anything exposing ``.value`` in nanoseconds)
        Bounds of the sampling window. ``end`` itself is never returned
        because ``np.random.randint`` excludes its upper bound.

    Returns
    -------
    pandas.Timestamp
        Timestamp built from a uniformly drawn epoch second.
    """
    nanos_per_second = 10 ** 9
    # .value is in nanoseconds; floor both bounds down to whole epoch seconds
    lower_s, upper_s = (bound.value // nanos_per_second for bound in (start, end))
    epoch_second = np.random.randint(lower_s, upper_s)
    return pd.to_datetime(epoch_second, unit='s')

In [12]:
# character pool for tweet text: ASCII letters plus ASCII punctuation
# (no digits and no whitespace)
allowed_chars = ''.join([string.ascii_letters, string.punctuation])

def random_string_generator(allowed_chars):
    """Return a random string of 1 to 140 characters drawn from ``allowed_chars``.

    Parameters
    ----------
    allowed_chars : sequence of str
        Pool to pick characters from; each position is chosen independently
        with ``random.choice``, so characters may repeat.

    Returns
    -------
    str
        The generated text; its length is picked first with
        ``random.randint(1, 140)`` (both ends inclusive).
    """
    length = random.randint(1, 140)
    picked = []
    for _ in range(length):
        picked.append(random.choice(allowed_chars))
    return ''.join(picked)

In [15]:
def tweet_generator():
    """Build one synthetic tweet as ``[tweet_id, user_id, tweet_ts, tweet_text]``.

    - ``tweet_id``: random integer in 10001..20000 (inclusive); drawn
      independently on every call, so ids can repeat across tweets.
    - ``user_id``: random integer in 100001..101000 (inclusive), i.e. at most
      1000 distinct users.
    - ``tweet_ts``: ``random_dates(start, end)`` using the module-level window.
    - ``tweet_text``: ``random_string_generator(allowed_chars)``.
    """
    # the list is evaluated left to right, so the random draws happen in field order
    record = [
        random.randint(10001, 20000),
        random.randint(100001, 101000),
        random_dates(start, end),
        random_string_generator(allowed_chars),
    ]
    return record
        

In [16]:
def generate_tweet(n, file_name):
    """Write ``n`` randomly generated tweets to ``file_name`` as tab-separated text.

    The file is overwritten. It starts with the header row
    ``index, tweet_id, user_id, tweet_ts, tweet_text``; each following row is a
    1-based running index followed by the fields from ``tweet_generator()``.

    Parameters
    ----------
    n : int
        Number of tweet rows to write (``0`` writes only the header).
    file_name : str
        Path of the output file.

    Prints the elapsed time (header excluded, file close included) when done.
    """
    # newline='' hands line-ending control to the csv writer, as the csv
    # module documentation requires for files passed to csv.writer
    with open(file_name, 'w', newline='') as tweet_file:
        writer = csv.writer(tweet_file, delimiter='\t', lineterminator='\n')
        writer.writerow(["index", "tweet_id", "user_id", "tweet_ts", "tweet_text"])

        # perf_counter is a monotonic clock intended for measuring intervals
        start_time = time.perf_counter()

        for index in range(1, n + 1):
            writer.writerow([index] + tweet_generator())

    # no explicit close(): leaving the with-block has already closed the file
    print('Time used to generate', n, 'tweets:', time.perf_counter() - start_time, 'seconds.')
        

In [17]:
# write 10000 synthetic tweets to tweet.csv
generate_tweet(n=10000, file_name='tweet.csv')

Time used to generate 10000 tweets: 4.321894884109497 seconds.


# Generate Follows

In [18]:
# load the tab-separated tweets written by generate_tweet
data_df = pd.read_csv("tweet.csv", delimiter="\t")

# distinct user ids that appear in the generated tweets
list_user_id = data_df.loc[:, "user_id"].unique()

# number of distinct users
print(len(list_user_id))

1000


In [19]:
# From the output above, there are 1000 distinct tweet users

# Followers generation algorithm (as implemented in generate_follower below):

# For the first 100 users : each gets 10 follow rows
# For users 101 to 200 : each gets 30 follow rows
# For users 201 to 300 : each gets 50 follow rows
# For the rest of the users : each gets 1 follow row

In [20]:
def follower_generator(user_index):
    """Generate one follow relationship ``[user_id, follows_id]``.

    The follower is ``list_user_id[user_index]``; the followee is chosen
    uniformly at random among all *other* positions of ``list_user_id``, so a
    user is never reported as following themselves (``list_user_id`` holds
    distinct ids because it comes from ``.unique()``).

    Parameters
    ----------
    user_index : int
        Position of the follower inside ``list_user_id``.

    Returns
    -------
    list
        ``[user_id, follows_id]`` with ``user_id != follows_id``.

    Raises
    ------
    IndexError
        If ``user_index`` is outside ``list_user_id``.
    ValueError
        If fewer than two users exist, since no valid followee is available.
    """
    user_count = len(list_user_id)
    user_id = list_user_id[user_index]
    if user_count < 2:
        raise ValueError('need at least two users to generate a follow relationship')

    # normalise so that a negative (from-the-end) index compares correctly below
    follower_pos = user_index % user_count
    # draw from the other user_count - 1 slots, then step over the follower's own slot
    follows_pos = random.randrange(user_count - 1)
    if follows_pos >= follower_pos:
        follows_pos += 1

    return [user_id, list_user_id[follows_pos]]

In [21]:
# (first_user_index, end_user_index_exclusive, follow_rows_per_user)
_FOLLOW_TIERS = (
    (0, 100, 10),
    (100, 200, 30),
    (200, 300, 50),
)
# users beyond the last tier get this many follow rows
_DEFAULT_FOLLOWS = 1


def _follows_for(user_index):
    """Return how many follow rows to generate for the user at ``user_index``."""
    for first, end_exclusive, count in _FOLLOW_TIERS:
        if first <= user_index < end_exclusive:
            return count
    return _DEFAULT_FOLLOWS


def generate_follower(n, file_name):
    """Write follow relationships for the first ``n`` users to ``file_name``.

    The file is overwritten with tab-separated rows: a header
    ``index, user_id, follows_id`` followed by one row per generated follow,
    where ``index`` is a 1-based running row number. Users are processed in
    ``list_user_id`` order and receive 10, 30, 50 or 1 rows depending on their
    position (see ``_FOLLOW_TIERS``).

    Parameters
    ----------
    n : int
        Number of users (positions ``0 .. n-1`` of ``list_user_id``) to
        generate follows for; ``follower_generator`` indexes ``list_user_id``,
        so ``n`` must not exceed its length.
    file_name : str
        Path of the output file.

    Prints the number of rows written and the elapsed time when done.

    NOTE: every row comes from an independent ``follower_generator`` call, so
    the same (user_id, follows_id) pair can appear more than once.
    """
    # newline='' hands line-ending control to the csv writer, as the csv
    # module documentation requires for files passed to csv.writer
    with open(file_name, 'w', newline='') as follower_file:
        writer = csv.writer(follower_file, delimiter='\t', lineterminator='\n')
        writer.writerow(["index", "user_id", "follows_id"])

        # perf_counter is a monotonic clock intended for measuring intervals
        start_time = time.perf_counter()

        index = 0
        for i in range(n):
            for _ in range(_follows_for(i)):
                index += 1
                writer.writerow([index] + follower_generator(i))

    # no explicit close(): leaving the with-block has already closed the file
    print('Time used to generate', index, 'followers:', time.perf_counter() - start_time, 'seconds.')

In [22]:
# one pass over every distinct user found in the tweets
generate_follower(n=len(list_user_id), file_name='follower.csv')

Time used to generate 9700 followers: 0.06743717193603516 seconds.


In [23]:
# check the expected number of follow rows for the 1000 users above:
# 100 users x 10 + 100 users x 30 + 100 users x 50 + remaining 700 users x 1
100 * 10 + 100 * 30 + 100 * 50 + 700 * 1 == 9700

True

In [9]:
# draw ten sample timestamps and show them newest first
time_l = []
for i in range(10):
    time_l += [random_dates(start, end)]

time_l = sorted(time_l, reverse=True)
time_l

[Timestamp('2021-01-30 22:03:52'),
 Timestamp('2021-01-20 18:03:35'),
 Timestamp('2021-01-14 21:23:29'),
 Timestamp('2021-01-14 10:25:22'),
 Timestamp('2021-01-12 01:39:21'),
 Timestamp('2021-01-09 17:53:24'),
 Timestamp('2021-01-06 15:52:36'),
 Timestamp('2021-01-06 11:52:28'),
 Timestamp('2021-01-01 08:32:16'),
 Timestamp('2021-01-01 08:29:38')]

[1, 2, 3, 1, 2]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]