# User generation

In [2]:
import uuid

import pandas as pd
import requests as r
import json
from time import sleep
from IPython.display import clear_output
import hashlib
from pathlib import Path
import sqlite3
import random

# Relative locations of the raw users, the cleaned data and the SQLite
# database that the rest of the notebook reads from and writes to.
path = dict(
    users='..\\data\\users',
    clean='..\\data\\clean',
    database='..\\database\\news.db',
)

# Make sure the output folders exist; folders already present are left as-is.
for folder in (path['clean'], path['users'], f"{path['clean']}\\pics"):
    Path(folder).mkdir(parents=True, exist_ok=True)


## Delayed requests to the randomuser.me API

In [None]:
# Download `n_users` generated users from randomuser.me in batches of
# `users_per_req`, pausing between batches, then dump everything that was
# collected to users.json.
users = []
n_users = 150000
users_per_req = 2000
n_requests = n_users // users_per_req

for index in range(n_requests):
    try:
        # The timeout keeps a stalled connection from hanging the run forever.
        response = r.get(
            f'https://randomuser.me/api/?results={users_per_req}&exc=phone,nat,id,registered',
            timeout=60,
        )
        users_json = response.json()
    except (r.exceptions.RequestException, ValueError) as exc:
        # Stop here but still fall through to the dump below, so the batches
        # already downloaded are not thrown away by a late failure.
        print(f'Request failed: {exc}')
        break

    if 'error' in users_json:
        # The response carries an 'error' entry instead of 'results'.
        print(users_json['error'])
        break
    users.extend(users_json['results'])
    clear_output(wait=True)
    print(f'Generated users: {len(users)}')

    # Throttle between batches; there is nothing to wait for after the last one.
    if index < n_requests - 1:
        sleep(30)

with open(f"{path['users']}\\users.json", 'w') as fout:
    json.dump(users, fout, indent=1)

In [67]:
# Download the men's and women's portraits numbered 0-99 into the `pics`
# folder as men_<n>.jpg / women_<n>.jpg.
for index in range(0, 100):
    for gender in ('men', 'women'):
        picture = r.get(f'https://randomuser.me/api/portraits/{gender}/{index}.jpg', timeout=60)
        # Abort on an HTTP error rather than saving an error page as a .jpg.
        picture.raise_for_status()
        with open(f"{path['clean']}\\pics\\{gender}_{index}.jpg", 'wb') as f:
            f.write(picture.content)

## Data cleaning

In [3]:
# Reload the raw user records from users.json.
users_file = f"{path['users']}\\users.json"
with open(users_file) as raw_users:
    users = json.loads(raw_users.read())

In [36]:
import pycountry
import unicodedata

# Top-level keys of a raw record, taken from the first user. `sub_fields`
# below is aligned position-by-position with these keys.
fields = users[0].keys()

# For each key in `fields`: None when the value is used as-is, otherwise the
# names of the nested entries that belong to it.
sub_fields = [
    None,
    ['first', 'last'],
    ['street', 'city', 'state', 'country', 'postcode'],
    None,
    ['password'],
    ['date'],
    None,
    ['large'],
]

# Flatten every raw record into a dict of plain ASCII strings.
users_filtered = []
for user in users:
    user_filtered = {}
    for key, sub_keys in zip(fields, sub_fields):
        if key == 'dob':
            # Keep only the part of the timestamp before the 'T'.
            user_filtered['dateOfBirth'] = user[key]['date'].split('T')[0]
        elif key == 'login':
            # The stored password is the SHA-256 hex digest of the e-mail's
            # local part (the text before '@').
            local_part = user['email'].split('@')[0]
            user_filtered['password'] = hashlib.sha256(local_part.encode('UTF-8')).hexdigest()
        elif key == 'name':
            full_name = ' '.join(user[key][sub_key] for sub_key in sub_keys)
            # NFD splits accented letters into base letter + combining mark;
            # the ASCII round-trip then drops every non-ASCII character.
            user_filtered['fullName'] = (
                unicodedata.normalize('NFD', full_name).encode('ascii', 'ignore').decode('ASCII')
            )
        elif key == 'location':
            location = user[key]
            # 'Iran' is excluded explicitly (presumably because pycountry does
            # not know it under that exact name - TODO confirm). Any other name
            # that pycountry cannot resolve yields None and is treated the same
            # way instead of crashing on `.alpha_2`.
            country = None
            if location['country'] != 'Iran':
                country = pycountry.countries.get(name=location['country'])
            if country is not None:
                user_filtered['country'] = country.alpha_2
                user_filtered['location'] = f"{location['street']['name']}, {location['street']['number']} - {location['city']}, {location['postcode']} {user_filtered['country']}"
        elif key == 'gender':
            user_filtered[key] = user[key].capitalize()
        elif key == 'cell' or sub_keys is None:
            user_filtered[key] = user[key]
        elif len(sub_keys) == 1:
            user_filtered[key] = user[key][sub_keys[0]]

    user_filtered['_id'] = user['login']['uuid']

    # Users without a resolved country are left out of the cleaned data set.
    if 'country' not in user_filtered:
        continue

    # Replace any remaining non-ASCII character with an XML numeric character
    # reference so that every stored value is pure ASCII.
    user_filtered = {
        key: value.encode('ascii', 'xmlcharrefreplace').decode('ascii')
        for key, value in user_filtered.items()
    }

    users_filtered.append(user_filtered)

with open(f"{path['clean']}\\users_filtered.json", 'w') as fout:
    json.dump(users_filtered, fout, indent=1)

In [37]:
# Load the cleaned users and (re)create the `user` table in the SQLite database.
with open(f"{path['clean']}\\users_filtered.json", 'r') as fin:
    user_filtered = json.load(fin)

user_df = pd.DataFrame(user_filtered)

# `with conn:` only wraps a transaction (commit on success, rollback on
# error); it does not close the connection, so close it explicitly.
conn = sqlite3.connect(path['database'])
try:
    with conn:
        user_df.to_sql('user', conn, if_exists='replace')
finally:
    conn.close()

In [38]:
def dict_factory(cursor, row):
    """Build a dict for one result row, keyed by column name.

    Intended for use as a sqlite3 ``row_factory``.

    Args:
        cursor: object whose ``description`` holds one sequence per column,
            with the column name as its first item.
        row: the row's values, in the same order as ``cursor.description``.

    Returns:
        A dict mapping each column name to the value at the same position in
        ``row``. When a name occurs more than once, the last occurrence wins.
    """
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }

# Read the users back as dicts, one row per distinct fullName, split by gender.
# The connection is closed explicitly: using a sqlite3 connection as a context
# manager only manages the transaction and leaves the connection open.
conn = sqlite3.connect(path['database'])
try:
    conn.row_factory = dict_factory
    cur = conn.cursor()
    unique_users_male = cur.execute("select * from user where gender = 'Male' group by fullName ").fetchall()
    unique_users_female = cur.execute("select * from user where gender ='Female' group by fullName ").fetchall()
finally:
    conn.close()

In [40]:
import base64

# Fixed seed, so the shuffles below give the same split on every run.
random.seed(0)
for pool in (unique_users_male, unique_users_female):
    random.shuffle(pool)

# 70 men + 80 women become reporters; readers come from later slices of the
# same shuffled lists, so the two groups do not overlap.
reporters = unique_users_male[:70] + unique_users_female[:80]
readers = unique_users_male[100:63100] + unique_users_female[100:82600 + 4500]

# Relabel the first 4500 readers with the gender 'Other'.
for reader in readers[:4500]:
    reader['gender'] = 'Other'

for group in (reporters, readers):
    random.shuffle(group)

# Next portrait number to hand out, counted separately per gender.
next_pic = {'Male': 0, 'Female': 0}

for reporter in reporters:
    gender = reporter['gender']
    if gender == 'Male':
        prefix = 'men'
    else:
        prefix = 'women'
    pic_path = f"{path['clean']}\\pics\\reporters\\{prefix}_{next_pic[gender]}.jpg"

    # The portrait is stored inline in the record as base64 text.
    with open(pic_path, 'rb') as pic_file:
        reporter['picture'] = base64.b64encode(pic_file.read()).decode('utf-8')
    next_pic[gender] += 1

    reporter['reporterId'] = str(uuid.uuid4())

# Readers keep only the account columns; reporters keep those plus their
# profile columns.
reader_columns = ['_id', 'email', 'password', 'fullName', 'gender', 'country']
reporter_columns = reader_columns + ['location', 'dateOfBirth', 'cell', 'picture', 'reporterId']

reporters_df = pd.DataFrame(reporters).loc[:, reporter_columns]
readers_df = pd.DataFrame(readers).loc[:, reader_columns]

for frame, file_name in ((reporters_df, 'reporters_clean.json'), (readers_df, 'readers_clean.json')):
    with open(f"{path['clean']}\\{file_name}", 'w') as out_file:
        json.dump(frame.to_dict('records'), out_file, indent=1)

In [3]:
import requests as r
import json

# Register every cleaned reporter on the SocialNews server through its admin
# endpoint, after logging in as admin.
HOSTNAME = '172.16.5.20'
BASE_URL = f'http://{HOSTNAME}:8080/SocialNews'

with open(f"{path['clean']}\\reporters_clean.json") as fin:
    reporters = json.load(fin)

# NOTE(review): the admin credentials are hard-coded and travel in the query
# string over plain HTTP - acceptable only for a local test deployment.
# One session is reused for all requests so the login carries over to the
# addReporter calls (presumably through a session cookie - TODO confirm).
# The `with` block closes the session even if a request raises.
with r.Session() as session:
    login = session.post(
        f'{BASE_URL}/login',
        params={
            'email': 'f.cristofani@socialnews.com',
            'password': 'admin',
            'accessType': 'admin',
        },
        timeout=30,
    )
    # Without a successful login none of the following requests can work.
    login.raise_for_status()

    for reporter in reporters:
        # Send the e-mail local part as the password in place of the stored hash.
        reporter['password'] = reporter['email'].split('@')[0]
        res = session.post(f'{BASE_URL}/admin/addReporter', json.dumps(reporter), timeout=30)
        if not res.ok:
            # Report the failure and carry on with the remaining reporters.
            print(f"Could not add reporter {reporter['email']}: HTTP {res.status_code}")