# User generation

In [None]:
import requests as r
import json
from time import sleep
from IPython.display import clear_output
import hashlib
from pathlib import Path


In [None]:
# Relative locations of the raw users folder, the cleaned-data folder and the
# database file.
path = dict(
    users='..\\data\\users',
    clean='..\\data\\clean',
    database='..\\database\\news.db',
)

# Create both data folders (and any missing parents); existing folders are
# left untouched.
for folder in (path['clean'], path['users']):
    Path(folder).mkdir(parents=True, exist_ok=True)



## Delayed requests

In [None]:
# Download `n_users` random users from randomuser.me in batches of at most
# `users_per_req`, pausing between requests, then save them all to users.json.
users = []
n_users = 150000
users_per_req = 2000

# Ceiling division: a total that is not a multiple of the batch size still
# gets its final, smaller batch instead of being silently truncated.
n_requests = -(-n_users // users_per_req)

for index in range(n_requests):
    # Never ask for more users than are still missing.
    batch_size = min(users_per_req, n_users - len(users))
    url = f'https://randomuser.me/api/?results={batch_size}&exc=phone,nat,id,registered'

    try:
        # Without a timeout a single stalled connection would hang the run.
        users_json = r.get(url, timeout=60).json()
    except (r.RequestException, ValueError) as err:
        # Network failure or a non-JSON reply: stop here so the users
        # fetched so far are still written to disk below.
        print(f'Request failed: {err}')
        break

    if 'error' in users_json:
        print(users_json['error'])
        break
    users.extend(users_json['results'])
    clear_output(wait=True)
    print(f'Generated users: {len(users)}')

    # Pause between requests; pointless after the last one.
    if index < n_requests - 1:
        sleep(30)

# Whatever was collected (possibly a partial list after an error) is saved.
with open(f"{path['users']}\\users.json", 'w') as fout:
    json.dump(users, fout, indent=1)

## Data cleaning

In [None]:
# Read the raw users back from users.json and show how many were loaded.
with open(f"{path['users']}\\users.json", 'r') as raw_file:
    users = json.loads(raw_file.read())

print(len(users))

In [None]:
def _clean_user(user):
    """Flatten one raw user record into a dict of ASCII-only strings.

    Fields are looked up by name, so the result does not depend on the key
    order of the raw record.

    Args:
        user: One raw user dict with the keys 'gender', 'name', 'location',
            'email', 'dob', 'cell' and 'picture'.

    Returns:
        A dict with the keys 'gender', 'name', 'location', 'country',
        'email', 'password', 'dob', 'cell' and 'picture', in that order.
        Non-ASCII characters are replaced by XML character references.

    Raises:
        KeyError: If an expected field is missing from ``user``.
    """
    location = user['location']
    street = location['street']
    flat = {
        'gender': user['gender'],
        'name': ' '.join([user['name']['first'], user['name']['last']]),
        'location': f"{street['name']}, {street['number']} - {location['city']}, {location['state']}, {location['postcode']}",
        'country': location['country'],
        # Swap the 'example' placeholder in the address for 'gmail'.
        'email': user['email'].replace('example', 'gmail'),
        # The password is the SHA-256 hex digest of the part of the original
        # e-mail before the '@'; the raw 'login' data is not used.
        'password': hashlib.sha256(user['email'].split('@')[0].encode('UTF-8')).hexdigest(),
        # Keep only the date part of the timestamp (everything before 'T').
        'dob': user['dob']['date'].split('T')[0],
        'cell': user['cell'],
        'picture': user['picture']['large'],
    }
    # Every value is a string at this point; make each one pure ASCII.
    return {
        key: value.encode('ascii', 'xmlcharrefreplace').decode('ascii')
        for key, value in flat.items()
    }


# Clean every user and drop those whose country is 'Iran'.
# NOTE(review): the reason for excluding 'Iran' is not visible here - confirm.
users_filtered = [
    cleaned
    for cleaned in map(_clean_user, users)
    if cleaned['country'] != 'Iran'
]

with open(f"{path['clean']}\\users_filtered.json", 'w') as fout:
    json.dump(users_filtered, fout, indent=1)