In [None]:
from clef.utils.data_loading import load_datasets
from clef.utils.data_loading import write_trec_format_output
from clef.retrieval.retrieve import retrieve_evidence

# Load the train and dev splits with preprocessing and author info enabled.
# Of the two, only `dev` is used by the cells visible in this notebook
# (its timelines provide the accounts to scrape).
train, dev = load_datasets(preprocess=True, add_author_info=True)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import re
import json

# Default Selenium implicit wait, passed to driver.implicitly_wait (which takes
# seconds). scrape_author_info temporarily lowers the wait for the optional
# bio lookup and then restores it to this value.
impl_wait_time = 10

def scrape_author_info(twitter_url: str, driver: webdriver.Chrome):
    """Open a Twitter profile page and return its account name and bio.

    Args:
        twitter_url: URL of the profile page to load.
        driver: Selenium Chrome driver used to load and query the page.

    Returns:
        Tuple ``(account_name, bio)``. ``bio`` is ``''`` when the page has no
        user-description element; newlines in the bio are replaced by spaces.

    Raises:
        NoSuchElementException: if the account-name element cannot be found
            within the driver's current implicit wait.
    """
    driver.get(twitter_url)

    # Account name: text of the first nested span below a div whose class
    # attribute contains "r-1wbh5a2".
    account_name = driver.find_element(By.XPATH, '//div[contains(@class,"r-1wbh5a2")]//span//span').text

    # Some users don't have a bio, so look for it with a very short implicit
    # wait instead of blocking for the full default on every bio-less profile.
    driver.implicitly_wait(0.1)
    try:
        bio = driver.find_element(By.XPATH, '//div[contains(@data-testid,"UserDescription")]').text
    except NoSuchElementException:
        bio = ''
    finally:
        # Restore the default wait on every exit path. The driver is shared
        # across calls and cells, so leaving the short wait in place after an
        # unexpected error would silently affect all later lookups.
        driver.implicitly_wait(impl_wait_time)

    # strip newlines
    bio = re.sub(r"\n", " ", bio)

    return (account_name, bio)


def translate_info(text: str, driver: webdriver.Chrome):
    """Translate ``text`` through the Google Translate web page.

    Loads https://translate.google.com in ``driver``, types ``text`` into the
    input box and reads the text of the result span. The consent dialog is not
    handled here; the Selenium setup cell dismisses it once before this is used.

    Args:
        text: Text to type into the translation input box.
        driver: Selenium Chrome driver used to drive the page.

    Returns:
        The text of the result element with newlines replaced by spaces.

    Raises:
        NoSuchElementException: if the input box or the result span is not found.
    """
    driver.get("https://translate.google.com")

    # Type the source text into the input box.
    source_field = driver.find_element(By.CLASS_NAME, "er8xn")
    source_field.send_keys(text)

    # Read the translation and flatten it onto a single line.
    result_span = driver.find_element(By.XPATH, '//span[@jsname="jqKxS"]')
    return result_span.text.replace("\n", " ")

def get_author_info_dict(account_list, driver, fp, delay=0.0):
    """Scrape name and bio for each account and write the result to a JSON file.

    Args:
        account_list: Iterable of profile URLs. Surrounding whitespace is
            stripped and duplicates collapse into a single entry.
        driver: Selenium driver passed through to ``scrape_author_info``.
        fp: Path of the JSON file to write.
        delay: Optional pause in seconds between consecutive profile requests.
            Defaults to 0.0 (no pause). Intended as a way to space out requests
            when the site starts rejecting them after many rapid lookups.

    Returns:
        Dict mapping each stripped URL to a dict with the keys ``name``,
        ``bio``, ``translated_name``, ``translated_bio`` and ``error``. The
        translated fields are left empty here. ``error`` is set to
        ``'user account unreachable;'`` when the page could not be scraped or
        the name is empty after removing ``@handle`` tokens.
    """
    import time  # local import: only needed for the optional delay

    unreachable = 'user account unreachable;'

    # One empty record per (stripped) account URL, in first-seen order.
    author_info_by_account = {
        account.strip(): {'name': '', 'bio': '', 'translated_name': '', 'translated_bio': '', 'error': ''}
        for account in account_list
    }

    # get twitter info
    for position, (account_url, entry) in enumerate(author_info_by_account.items()):
        if delay > 0 and position > 0:
            time.sleep(delay)

        try:
            name, bio = scrape_author_info(account_url, driver)
        except NoSuchElementException:
            print(f'[ERROR] couldn\'t retrieve info for account {account_url}')
            entry['error'] = unreachable
            continue

        # Drop "@handle" tokens; an empty remainder means no usable name was found.
        name = re.sub(r"@[\w]*", "", name)
        if name == '':
            print(f'[ERROR] couldn\'t retrieve info for account {account_url}')
            entry['error'] = unreachable
        entry['name'] = name
        entry['bio'] = bio

    # write dict to file
    with open(fp, 'w') as file:
        json.dump(author_info_by_account, file, indent=4, sort_keys=False)
        print(f'wrote dict with {len(author_info_by_account)} entries to {fp}')

    return author_info_by_account

def translate_dict(author_info_by_account, driver, fp):
    """Fill in translated name/bio for every account and write the result to JSON.

    The input dict is updated in place and also returned.

    Args:
        author_info_by_account: Dict mapping account URL to a record with at
            least ``name`` and ``bio`` (as produced by ``get_author_info_dict``).
        driver: Selenium driver passed through to ``translate_info``.
        fp: Path of the JSON file to write.

    Returns:
        The same ``author_info_by_account`` object, with ``translated_name`` /
        ``translated_bio`` set for every non-empty ``name`` / ``bio``. When a
        translation raises ``NoSuchElementException``, ``'could not translate;'``
        is appended to the record's ``error`` field and the remaining fields of
        that record are skipped.
    """
    for account_url, entry in author_info_by_account.items():
        try:
            # Translate each field on its own: accounts without a bio must
            # still get their name translated (and vice versa).
            for field in ('name', 'bio'):
                original = entry.get(field, '')
                if original:
                    entry[f'translated_{field}'] = translate_info(original, driver)
        except NoSuchElementException:
            print(f'[ERROR] couldn\'t translate info for account {account_url}')
            # Tolerate records that were loaded without an 'error' field.
            entry['error'] = entry.get('error', '') + 'could not translate;'

    # write final dict to file
    with open(fp, 'w') as file:
        json.dump(author_info_by_account, file, indent=4, sort_keys=False)
        print(f'wrote dict with {len(author_info_by_account)} entries to {fp}')

    return author_info_by_account

In [None]:
# Set up Selenium: start Chrome and make every element lookup wait up to
# impl_wait_time before giving up.
driver = webdriver.Chrome()
driver.implicitly_wait(impl_wait_time) # default implicit wait for all find_element calls
driver.get("https://translate.google.com")

# Dismiss the consent dialog by clicking its "Reject all" button.
# find_element raises NoSuchElementException if no such button shows up within
# the implicit wait, in which case this cell stops here.
button = driver.find_element(By.XPATH, '//button[@aria-label="Reject all"]')
button.click()

# Open the Twitter login page.
# NOTE(review): presumably the user logs in manually in this browser window
# before running the scraping cells — confirm.
driver.get("https://twitter.com/login")

In [None]:
# Collect the unique account of every timeline entry in the dev split.
# Each timeline entry unpacks into three values, of which only the first (the
# account) is needed. A comprehension is used so that no loop variables leak
# into the notebook namespace (a plain loop naming the second value `id` would
# rebind the builtin). Sorting makes the scraping order the same on every run.
accounts = sorted({account for item in dev for account, _, _ in item['timeline']})

In [None]:
# Scrape name and bio for every collected account; the result is returned as a
# dict and also written to author-data.json.
fp_out = 'author-data.json'
info = get_author_info_dict(accounts, driver, fp_out)

In [None]:
# Twitter cut the scraper off after roughly 100 requests, so the accounts that
# were not retrieved are scraped again in a separate pass.
# (Sleeping between requests might avoid the cut-off.)
missing = [
    'https://twitter.com/TrablusBe',
    'https://twitter.com/Hakomitna',
    'https://twitter.com/KasbahTn',
    'https://twitter.com/hazemaq',
    'https://twitter.com/SerajSat',
    'https://twitter.com/Moshir_Almasry',
    'https://twitter.com/mosa_abumarzook',
    'https://twitter.com/MofaQatar_AR',
    'https://twitter.com/ofirgendelman',
    'https://twitter.com/pmofa',
    'https://twitter.com/ibrahimmilhim',
    'https://twitter.com/HananBalkhy',
    'https://twitter.com/UNNewsArabic',
    'https://twitter.com/OmanEmbassydoha',
    'https://twitter.com/FMofOman',
    'https://twitter.com/Oman_GC',
]

# The second pass goes to its own file; it is merged with the first pass later.
fp_out = 'missing.json'
missinginfo = get_author_info_dict(missing, driver, fp_out)

In [None]:
import json

def merge_dicts(d1, d2):
    """
    Recursively combine two dictionaries into a new dictionary.

    Keys found in only one input are carried over as they are. For a key found
    in both, the two values are merged by this same rule when both are
    dictionaries; in every other case the value from d2 replaces the one
    from d1. Neither input dictionary is modified.
    """
    combined = dict(d1)  # shallow copy, so d1 itself stays untouched

    for name, incoming in d2.items():
        mergeable = name in d1 and isinstance(d1[name], dict) and isinstance(incoming, dict)
        # Nested dicts on both sides are merged; anything else is overridden by d2.
        combined[name] = merge_dicts(d1[name], incoming) if mergeable else incoming

    return combined

def load_json_files(file_paths):
    """Parse every JSON file in ``file_paths`` and return the parsed contents as a list, in input order."""
    def _read_one(path):
        # Each file is opened, parsed and closed before the next one is touched.
        with open(path, 'r') as handle:
            return json.load(handle)

    return [_read_one(path) for path in file_paths]



# Merge the first scraping pass with the re-scraped accounts. missing.json is
# passed as d2, so its entries take precedence wherever both files have a value
# for the same account field.
json_files = ['author-data.json', 'missing.json']
dicts = load_json_files(json_files)

final_data = merge_dicts(dicts[0], dicts[1])

# Write the merged result to combined.json.
fp = 'combined.json'
with open(fp, 'w') as file:
    json.dump(final_data, file, indent=4, sort_keys=False)
    print(f'wrote dict with {len(final_data)} entries to {fp}')


In [None]:
# Reload the merged author data from disk. This rebinds `info`, which until now
# held the result of the first scraping pass only.
fp = 'combined.json'
with open(fp, 'r') as file:
    info = json.load(file)

In [None]:
# Translate names and bios and write the result to author-data-translated.json.
# translate_dict updates `info` in place and returns it, so
# `info_with_translation` refers to the same dict as `info`.
fp_out = 'author-data-translated.json'
info_with_translation = translate_dict(info, driver, fp_out)

In [None]:
# Close the browser and end the WebDriver session; `driver` cannot be used for
# further page loads after this cell.
driver.quit()