In [None]:
from datetime import datetime
from pathlib import Path

from faker import Faker
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Source table for the genre vocabulary; later cells read its "Interests" column.
genres_csv_path = "./genres/genres.csv"
social_media_users_df = pd.read_csv(genres_csv_path)

In [None]:
# Collect the distinct genre labels found in the "Interests" column.
# Each cell is treated as a comma-separated list whose entries may carry
# single quotes (presumably e.g. "'Music', 'Art'" - confirm against the CSV).
genres = set()

# Rows without interests are NaN (a float) in pandas and would crash on
# `.replace`, so they are dropped up front. Iterating the column directly also
# avoids the per-row Series construction that `iterrows()` performs.
interests_column = social_media_users_df["Interests"].dropna()

for raw_interests in tqdm(interests_column, total=len(interests_column)):
    for genre in raw_interests.replace("'", "").split(","):
        genre = genre.strip()
        # Stray or trailing commas produce empty tokens; "" is not a genre.
        if genre:
            genres.add(genre)

In [None]:
# Shared Faker instance used by every generator cell below.
fake = Faker()

# Seed Faker's shared random generator so that Restart & Run All draws the
# same pseudo-random sequence on every run.
# NOTE(review): values picked from unordered collections (e.g. a set of
# strings) can still differ between interpreter sessions because string
# hashing is randomised per process unless PYTHONHASHSEED is fixed.
FAKER_SEED = 42
Faker.seed(FAKER_SEED)

In [None]:
def create_fake_person():
    """Build one synthetic person record as a flat list.

    Relies on the module-level ``fake`` (Faker instance) and ``genres``
    (collection of genre labels).

    Returns:
        list: ``[id, first_name, last_name, birth_date, age, location, genre,
        bot_score]`` where ``birth_date`` is a ``YYYY-MM-DD`` string, ``age``
        is the number of completed years as of today, ``location`` is a
        space-joined "place region country" string, ``genre`` is one element
        of ``genres`` and ``bot_score`` is a positive float with two decimals.
    """
    birth_date = fake.date_of_birth()

    # A plain year difference overstates the age by one until this year's
    # birthday has actually happened, so subtract one in that case.
    today = datetime.now().date()
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    age = today.year - birth_date.year - int(birthday_pending)

    # left_digits=0 keeps the integer part at zero, i.e. a score below 1.
    bot_score = fake.pyfloat(left_digits=0, right_digits=2, positive=True)

    # Faker's geo provider documents this tuple as
    # (latitude, longitude, place name, country code, timezone).
    # The last path segment of the timezone ("America/Los_Angeles" ->
    # "Los Angeles") is used as a coarse region label; it is not a state name.
    _, _, city, country, timezone = fake.local_latlng(country_code="US")
    region = timezone.split("/")[-1].replace("_", " ")
    location = " ".join([city, region, country])

    return [
        fake.uuid4(),
        fake.first_name(),
        fake.last_name(),
        birth_date.strftime("%Y-%m-%d"),
        age,
        location,
        fake.random_element(genres),
        bot_score,
    ]

In [None]:
# Number of synthetic person records to generate.
PEOPLE_COUNT = 1000

people = []
for _ in range(PEOPLE_COUNT):
    people.append(create_fake_person())

In [None]:
def create_fake_poster():
    """Build one synthetic poster (content account) record as a flat list.

    Relies on the module-level ``fake`` (Faker instance) and ``genres``
    (collection of genre labels).

    Returns:
        list: ``[id, username, age, genres]`` where ``age`` is the number of
        completed years as of today and ``genres`` is a ``;``-joined string of
        distinct genre labels (at least one, at most five).
    """
    birth_date = fake.date_of_birth()

    # A plain year difference overstates the age by one until this year's
    # birthday has actually happened, so subtract one in that case.
    today = datetime.now().date()
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    age = today.year - birth_date.year - int(birthday_pending)

    poster_id = fake.uuid4()
    username = fake.user_name()

    # Draws are made with replacement, so the same genre can come up twice;
    # dict.fromkeys removes repeats while keeping the draw order, which avoids
    # values such as "Music;Music". The result may therefore hold fewer labels
    # than were drawn, but never zero.
    draw_count = fake.random_int(min=1, max=5)
    picked_genres = dict.fromkeys(fake.random_element(genres) for _ in range(draw_count))

    return [poster_id, username, age, ";".join(picked_genres)]

In [None]:
# Number of synthetic poster records to generate.
POSTER_COUNT = 200

posters = []
for _ in range(POSTER_COUNT):
    posters.append(create_fake_poster())

In [None]:
# Person -> person edges as [follower_id, followed_id] pairs.
followers = []

for person in people:
    follower_id = person[0]
    draw_count = fake.random_int(min=2, max=10)

    # Targets are drawn with replacement from the full list, so the same
    # person can come up more than once. dict.fromkeys removes repeats while
    # keeping the draw order, so each pair is emitted at most once.
    followed_ids = dict.fromkeys(fake.random_element(people)[0] for _ in range(draw_count))

    # The draw can also land on the person themselves; a self-follow is not a
    # meaningful edge. After both filters a person may end up with fewer edges
    # than were drawn (possibly none).
    followed_ids.pop(follower_id, None)

    followers.extend([follower_id, followed_id] for followed_id in followed_ids)

In [None]:
# Person -> poster edges as [person_id, poster_id] pairs.
subscribers = []

for person in people:
    person_id = person[0]
    draw_count = fake.random_int(min=2, max=10)

    # Posters are drawn with replacement, so the same poster can come up more
    # than once for one person. dict.fromkeys removes repeats while keeping the
    # draw order, so each pair is emitted at most once; a person may therefore
    # get fewer subscriptions than were drawn, but at least one.
    poster_ids = dict.fromkeys(fake.random_element(posters)[0] for _ in range(draw_count))

    subscribers.extend([person_id, poster_id] for poster_id in poster_ids)

In [None]:
# Column headers for the person rows, in the same order as the values in each
# `people` entry.
person_columns = [
    "personId:ID(Person-ID)",
    "first_name",
    "last_name",
    "birth_date:date",
    "age:int",
    "location",
    "genre",
    "bot_score:float",
]

# Every row gets the constant "person" value in a trailing ":LABEL" column.
people_df = pd.DataFrame(people, columns=person_columns).assign(**{":LABEL": "person"})
people_df.head()

In [None]:
# Column headers for the poster rows, in the same order as the values in each
# `posters` entry.
poster_columns = ["posterId:ID(Poster-ID)", "username", "age:int", "genres:string[]"]

posters_df = pd.DataFrame(data=posters, columns=poster_columns)
# Append a trailing ":LABEL" column holding the constant "poster".
posters_df = posters_df.assign(**{":LABEL": "poster"})
posters_df.head()

In [None]:
# One row per follow edge: the start and end columns both hold person ids.
follow_columns = ["personId:START_ID(Person-ID)", "personId:END_ID(Person-ID)"]

followers_df = pd.DataFrame(followers, columns=follow_columns)
# Append a trailing ":TYPE" column holding the constant "follows".
followers_df.insert(len(followers_df.columns), ":TYPE", "follows")
followers_df.head()

In [None]:
# One row per subscription edge: a person id in the start column and a poster
# id in the end column.
subscription_columns = ["personId:START_ID(Person-ID)", "posterId:END_ID(Poster-ID)"]

subscribers_df = pd.DataFrame(subscribers, columns=subscription_columns).assign(
    **{":TYPE": "subscribes_to"}
)
subscribers_df.head()

In [None]:
# Write the four tables as CSV files, without the pandas index column.
output_dir = Path("./fake-social-media-data")

# `to_csv` does not create missing directories, so make sure the target
# exists; this keeps a fresh checkout from failing at the very last cell.
output_dir.mkdir(parents=True, exist_ok=True)

csv_exports = {
    "people.csv": people_df,
    "posters.csv": posters_df,
    "followers.csv": followers_df,
    "subscribers.csv": subscribers_df,
}

for file_name, frame in csv_exports.items():
    frame.to_csv(output_dir / file_name, index=False)