In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import re
import json
import time
from tqdm.auto import tqdm

# Default implicit wait (in seconds) passed to driver.implicitly_wait() for the Selenium sessions in this notebook.
impl_wait_time = 30

def scrape_author_info(twitter_url: str, driver: webdriver.Chrome):
    """Load a Twitter profile page and return the account name and bio text.

    Args:
        twitter_url: URL of the profile page to open.
        driver: Selenium Chrome driver used to load and query the page.

    Returns:
        A tuple ``(account_name, bio)``. ``bio`` is ``''`` when the page has
        no ``UserDescription`` element.

    Raises:
        NoSuchElementException: if the account-name element is not found
            within the driver's current implicit wait.
    """
    # Open Twitter URL
    driver.get(twitter_url)

    # Fetch user's account name (looked up under the driver's current implicit wait)
    account_name = driver.find_element(By.XPATH, '//div[contains(@class,"r-1wbh5a2")]//span//span').text

    # Some users don't have a bio: probe with a very short implicit wait so a
    # missing element does not block for the full default wait.
    bio = ''
    driver.implicitly_wait(0.1)
    try:
        bio = driver.find_element(By.XPATH, '//div[contains(@data-testid,"UserDescription")]').text
    except NoSuchElementException:
        pass
    finally:
        # Always restore the default wait, even if the lookup fails with an
        # unexpected driver error; otherwise every later lookup on this
        # session would silently run with the 0.1 s wait.
        driver.implicitly_wait(impl_wait_time)

    return (account_name, bio)


def translate_text(text: str, driver: webdriver.Chrome):
    """Translate ``text`` through the translation page currently open in ``driver``.

    The driver is expected to already show the Google Translate page (as
    prepared by ``get_session_translate``). The text is typed into the source
    box, the translated text is read back, and the source box is cleared so
    the next call starts from an empty input.

    Args:
        text: Text to translate.
        driver: Selenium Chrome driver positioned on the translate page.

    Returns:
        The translated text, or ``''`` when ``text`` is empty.

    Raises:
        NoSuchElementException: if the input box, the translation output or
            the clear button cannot be found within the implicit wait.
    """
    # Nothing to translate: typing an empty string would only make the lookups
    # below wait for a translation that was never requested.
    if not text:
        return ''

    # Enter the text into the translate text box
    input_box = driver.find_element(By.CLASS_NAME, "er8xn")
    input_box.send_keys(text)

    # Retrieve the translated text
    translated = driver.find_element(By.XPATH, '//span[@jsname="jqKxS"]').text

    # Clear the textbox
    btn_clear = driver.find_element(By.XPATH, '//button[@aria-label="Clear source text"]')
    btn_clear.click()

    return translated


def twitter_data_dict(account_list, driver, fp, delay=10):
    """Scrape name and bio for every account URL and save the result as JSON.

    Args:
        account_list: Iterable of account URLs; surrounding whitespace is
            stripped and the stripped URL becomes the key of the result.
        driver: Selenium driver handed to ``scrape_author_info``.
        fp: Path of the JSON file the result is written to.
        delay: Seconds to sleep before each profile request. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        Dict mapping each account URL to a dict with the keys ``name``,
        ``bio``, ``translated_name``, ``translated_bio`` and ``error``.
        The ``translated_*`` fields are left empty here.
    """
    # setup empty dict
    author_info_by_account = {
        account.strip(): {'name': '', 'bio': '', 'translated_name': '', 'translated_bio': '', 'error': ''}
        for account in account_list
    }

    # get twitter info
    for account_url, entry in tqdm(author_info_by_account.items()):
        try:
            time.sleep(delay)
            name, bio = scrape_author_info(account_url, driver)
        except NoSuchElementException:
            print(f'[ERROR] could not retrieve info for account {account_url}')
            entry['error'] += 'user account unreachable;'
            continue

        # Drop any @handle from the scraped name text and flatten multi-line bios.
        name = re.sub(r"@[\w]*", "", name)
        bio = re.sub(r"\n", " ", bio)

        if name == '':
            print(f'[ERROR] couldn\'t retrieve info for account {account_url}')
            # Append (not overwrite) so both failure paths record errors the same way.
            entry['error'] += 'user account unreachable;'

        entry['name'] = name
        entry['bio'] = bio

    # write dict to file
    with open(fp, 'w') as file:
        json.dump(author_info_by_account, file, indent=4, sort_keys=False)
        print(f'wrote dict with {len(author_info_by_account)} entries to {fp}')

    return author_info_by_account


def get_session_translate():
    """Start a Chrome session that is ready to use on Google Translate.

    Opens the translate page, dismisses the consent dialog via its
    "Reject all" button when that button is present, and reloads the page.

    Returns:
        The Selenium Chrome driver, positioned on https://translate.google.com.
        The caller is responsible for calling ``quit()`` on it.
    """
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(impl_wait_time) # set default waiting strategy

        driver.get("https://translate.google.com")

        # Dismiss the consent form by rejecting optional cookies. If no such
        # button shows up within the implicit wait (e.g. no consent dialog was
        # displayed), carry on instead of failing the whole session setup.
        try:
            driver.find_element(By.XPATH, '//button[@aria-label="Reject all"]').click()
        except NoSuchElementException:
            print('[INFO] no consent dialog found, continuing without dismissing it')

        driver.get("https://translate.google.com")
    except Exception:
        # Do not leave an orphaned browser process behind when setup fails.
        driver.quit()
        raise
    return driver


def _translate_entry(entry, driver):
    """Fill ``translated_name`` / ``translated_bio`` of one account entry in place."""
    for source_key, target_key in (('name', 'translated_name'), ('bio', 'translated_bio')):
        text = entry[source_key]
        # Translate each field on its own: an account without a bio still gets
        # its name translated (and vice versa). Blank fields are skipped.
        if text and text.strip():
            entry[target_key] = translate_text(text, driver)


def translate_dict(author_info_by_account, fp):
    """Translate the name and bio of every account and save the result as JSON.

    Each entry of ``author_info_by_account`` is updated in place. When a
    translation fails with ``NoSuchElementException`` the browser session is
    restarted and the entry is retried once.

    Args:
        author_info_by_account: Dict mapping account URLs to entries with at
            least the keys ``name``, ``bio``, ``translated_name`` and
            ``translated_bio``.
        fp: Path of the JSON file the updated dict is written to.

    Returns:
        The same dict object, with the translation fields filled in.

    Raises:
        NoSuchElementException: if the retry with a fresh session fails too.
            The browser is closed before the exception propagates and no file
            is written in that case.
    """
    # get_session_translate() already leaves the driver on the translate page.
    driver = get_session_translate()
    try:
        for account_url in tqdm(author_info_by_account.keys()):
            entry = author_info_by_account[account_url]
            try:
                time.sleep(2)
                _translate_entry(entry, driver)
            except NoSuchElementException:
                print(f'[ERROR] couldn\'t translate info for account {account_url}, retrying with new session...')

                # restart driver; reset the name first so the cleanup below
                # never touches an already closed session if the restart fails
                driver.quit()
                driver = None
                driver = get_session_translate()

                time.sleep(10)

                # retry with new driver
                _translate_entry(entry, driver)
    finally:
        if driver is not None:
            driver.quit()

    # write final dict to file
    with open(fp, 'w') as file:
        json.dump(author_info_by_account, file, indent=4, sort_keys=False)
        print(f'wrote dict with {len(author_info_by_account)} entries to {fp}')

    return author_info_by_account

## obtain twitter data

In [12]:
import os

from clef.utils.data_loading import AuredDataset

# Options forwarded to AuredDataset as keyword arguments.
config = dict(
    blind_run=True,
    split="test",
    add_author_name=False,
    add_author_bio=False,
    retriever_k=5,
    out_dir="./data-out/runs-test/oai",
    preprocess=False,
    retriever_label="OPENAI",
    verifier_label="OPENAI",
    normalize_scores=False,
    scale=False,
    ignore_nei=False,
    fingerprint="nonorm-noscale-noignore_nei",
)
root_path = '../'

# The task data is read from <root_path>/clef2024-checkthat-lab/task5/data.
data_subdirs = ('clef2024-checkthat-lab', 'task5', 'data')
data_path = os.path.join(root_path, *data_subdirs)
json_data_filepath = os.path.join(data_path, 'English_test.json')

ds = AuredDataset(json_data_filepath, **config)

In [17]:
# Collect the account URL of every timeline post, adding the scheme where it is missing.
accounts = []
for item in ds:
    for post in item['timeline']:
        url = post.url if post.url.startswith('https://') else f'https://{post.url}'
        accounts.append(url)

# De-duplicate while keeping first-seen order. list(set(...)) yields a different
# order on every kernel start (string hashing is randomised), which makes the
# scraping order - and therefore which requests get cut off - irreproducible.
accounts = list(dict.fromkeys(accounts))
len(accounts)

129

In [18]:
# Show the de-duplicated account URLs collected in the previous cell.
accounts

['https://twitter.com/GCOQatar',
 'https://twitter.com/mofauae',
 'https://twitter.com/ALThani_M',
 'https://twitter.com/MBA_AlThani_',
 'https://twitter.com/alqaradawy',
 'https://twitter.com/grandserail',
 'https://twitter.com/MohamedBinZayed',
 'https://twitter.com/Israelipm_ar',
 'https://twitter.com/ChefGov_ma',
 'https://twitter.com/OJerandi',
 'https://twitter.com/KuwaitArmyGHQ',
 'https://twitter.com/QatarUniversity',
 'https://twitter.com/Jeddahhealth',
 'https://twitter.com/GCCSG',
 'https://twitter.com/NajlaElmangoush',
 'https://twitter.com/mohpegypt',
 'https://twitter.com/MofEgypt',
 'https://twitter.com/AdwaAlArifi',
 'https://twitter.com/WHOEMRO',
 'https://twitter.com/MinistryInfoLB',
 'https://twitter.com/kbsalsaud',
 'https://twitter.com/Lolwah_Alkhater',
 'https://twitter.com/DrABQ',
 'https://twitter.com/WHOEgypt',
 'https://twitter.com/HananBalkhy',
 'https://twitter.com/fifacom_ar',
 'https://twitter.com/WHOTunisia',
 'https://twitter.com/Russia_AR',
 'https://tw

In [19]:
# setup Selenium: start the Chrome session that is used for the Twitter scraping below
driver = webdriver.Chrome()
driver.implicitly_wait(impl_wait_time) # set default waiting strategy

# Open the login page; the login itself is done by hand in the browser window
# before the scraping cell is run.
driver.get("https://twitter.com/login")
# login to account manually

In [20]:
# Scrape name and bio for every collected account and save the result as JSON.
fp_out = 'data/test-author-data.json'
info = twitter_data_dict(accounts, driver, fp_out)

  0%|          | 0/129 [00:00<?, ?it/s]

[ERROR] couldn't retrieve info for account https://twitter.com/SerajSat
[ERROR] couldn't retrieve info for account https://twitter.com/hazemaq
[ERROR] couldn't retrieve info for account https://twitter.com/KasbahTn
[ERROR] couldn't retrieve info for account https://twitter.com/Moshir_Almasry
[ERROR] couldn't retrieve info for account https://twitter.com/mosa_abumarzook
wrote dict with 129 entries to data/test-author-data.json


In [21]:
# Close the browser session that was used for scraping
driver.quit()

## translate data

In [22]:
# Reload the scraped author data from disk (same path the scraping step wrote to).
fp = 'data/test-author-data.json'
with open(fp, 'r') as file:
    info = json.load(file)
# Echo the loaded dict as the cell output for inspection.
info

{'https://twitter.com/GCOQatar': {'name': 'مكتب الاتصال الحكومي',
  'bio': "Qatar's Government Communications Office (GCO)",
  'translated_name': '',
  'translated_bio': '',
  'error': ''},
 'https://twitter.com/mofauae': {'name': 'MoFA وزارة الخارجية',
  'bio': 'الحساب الرسمي لوزارة الخارجية - الإمارات العربية المتحدة The official account of the Ministry of Foreign Affairs -UAE',
  'translated_name': '',
  'translated_bio': '',
  'error': ''},
 'https://twitter.com/ALThani_M': {'name': 'مــريــم آل ثــانــي',
  'bio': '(تَوَفَّنِي مُسْلِمًا وَأَلْحِقْنِي بِالصَّالِحِينَ) اللهم سخر لي من يدعو لي بعد مماتي',
  'translated_name': '',
  'translated_bio': '',
  'error': ''},
 'https://twitter.com/MBA_AlThani_': {'name': 'محمد بن عبدالرحمن',
  'bio': 'رئيس مجلس الوزراء وزير الخارجية - Prime Minister & Minister of Foreign Affairs',
  'translated_name': '',
  'translated_bio': '',
  'error': ''},
 'https://twitter.com/alqaradawy': {'name': 'يوسف القرضاوي',
  'bio': 'الحساب الرسمي للشيخ يوسف ا

In [23]:
# Translate the scraped names and bios and write the result to a separate JSON file.
fp_out = 'data/test-author-data-translated.json'
info_with_translation = translate_dict(info, fp_out)

  0%|          | 0/129 [00:00<?, ?it/s]

wrote dict with 129 entries to data/test-author-data-translated.json


## old code

In [None]:
# Twitter cuts me off at about 100 requests, so the accounts listed below are
# requested again in a second pass.
# This could maybe be avoided by sleeping longer between requests.
# NOTE: this cell needs a live, logged-in `driver`; the session created above is
# closed by driver.quit(), so a new one has to be started before running it.
missing = ['https://twitter.com/TrablusBe',
'https://twitter.com/Hakomitna',
'https://twitter.com/KasbahTn',
'https://twitter.com/hazemaq',
'https://twitter.com/SerajSat',
'https://twitter.com/Moshir_Almasry',
'https://twitter.com/mosa_abumarzook',
'https://twitter.com/MofaQatar_AR',
'https://twitter.com/ofirgendelman',
'https://twitter.com/pmofa',
'https://twitter.com/ibrahimmilhim',
'https://twitter.com/HananBalkhy',
'https://twitter.com/UNNewsArabic',
'https://twitter.com/OmanEmbassydoha',
'https://twitter.com/FMofOman',
'https://twitter.com/Oman_GC',]

fp_out = 'data/missing.json'

# Scrape only the listed accounts and store them in their own file.
missinginfo = twitter_data_dict(missing, driver, fp_out)

In [None]:

import json

def merge_dicts(d1, d2):
    """
    Recursively combine two dictionaries into a new one.

    Keys found in only one input are taken over unchanged. When a key exists
    in both inputs and both values are dictionaries, those values are merged
    by the same rule; in every other conflict the value from d2 wins.
    Neither input is modified.
    """
    combined = dict(d1)  # shallow copy so d1 itself stays untouched

    for key, incoming in d2.items():
        mergeable = key in d1 and isinstance(d1[key], dict) and isinstance(incoming, dict)
        # Nested dicts on both sides are merged; anything else is overridden by d2.
        combined[key] = merge_dicts(d1[key], incoming) if mergeable else incoming

    return combined

def load_json_files(file_paths):
    """Parse each JSON file in file_paths and return the results as a list, in input order."""
    def _read_json(path):
        # Open, parse and close one file.
        with open(path, 'r') as handle:
            return json.load(handle)

    return [_read_json(path) for path in file_paths]

# Combine the main scrape with the re-scraped accounts; on conflicting keys the
# values from the second file ('data/missing.json') take precedence.
json_files = ['data/author-data.json', 'data/missing.json']
dicts = load_json_files(json_files)

final_data = merge_dicts(dicts[0], dicts[1])

# Write the merged dict to disk.
fp = 'combined.json'
with open(fp, 'w') as file:
    json.dump(final_data, file, indent=4, sort_keys=False)
    print(f'wrote dict with {len(final_data)} entries to {fp}')
