# Downloads all posters from Indico

This notebook uses Selenium to fetch all the info from Indico.

Make sure you have downloaded the Firefox geckodriver and it is in your `PATH`.

Also make sure you have exported your SSO password as env variable `SSO_PWD` and your SSO username as `SSO_USER` before starting this notebook.

To download geckodriver:

- Go to: https://github.com/mozilla/geckodriver/releases
- Download: geckodriver-v0.26.0-macos.tar.gz
- Place the geckodriver in some folder and make sure that folder is in your PATH.

In [1]:
import time
import datetime
import random
import os
import requests

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import json
import csv

Some settable variables:

In [2]:
# Debug flag: when True, the scraping loop stops after the first few rows
_debug = False

# Names of the output JSON and CSV files
_out_posters_json_file = 'posters.json'
_out_posters_csv_file = 'posters.csv'

# Where all posters will be downloaded
_poster_dir = 'posters/'

# Maximum time to wait for a page element to appear (passed to WebDriverWait)
_timeout = 15

In [3]:
# Container for every scraped poster record: the list stored under
# the 'posters' key receives one dict per poster.
all_posters = {'posters': []}

## Open Session
Open a new webdriver session, go to Indico and sign in.

In [4]:
# Open the Indico contribution-management page in a new Firefox session
driver = webdriver.Firefox()
driver.get("https://indico.fnal.gov/event/19348/manage/contributions/") # url for our conference
# assert "INDICO" in driver.title

# Log in via SSO: select the SSO provider, then continue to the login form
sso_btn = driver.find_element_by_class_name('external-provider-shib-sso')
sso_btn.click()

continue_btn = driver.find_element_by_class_name('ping-button')
continue_btn.click()

# User name, read from the SSO_USER environment variable.
# NOTE(review): os.getenv returns None when the variable is not set, so
# make sure it is exported before starting the notebook.
inputUser = driver.find_element_by_id('username')
inputUser.clear()
inputUser.send_keys(os.getenv('SSO_USER'))

# Password, read from the SSO_PWD environment variable; RETURN submits the form
inputPassword = driver.find_element_by_id('password')
inputPassword.clear()
inputPassword.send_keys(os.getenv('SSO_PWD'))
inputPassword.send_keys(Keys.RETURN)

# Wait for completion (fixed pause, the page is not checked)
time.sleep(3)

The next two cells were me playing around; they do not need to be executed.

In [5]:
%%bash -c : # This prevents cell from being executed

# Scratch code: take the first table of the page and list its rows
poster_table = driver.find_element_by_css_selector('table')
poster_table_lines = poster_table.find_elements_by_css_selector('tr')
# postertable[20].get_attribute('innerHTML')
# postertable[3].find_elements_by_tag_name('td')[3].get_attribute('innerHTML')
print('Number of posters', len(poster_table_lines))

# Content of the td cell with index 3 of the row with index 1
poster_table_lines[1].find_elements_by_tag_name('td')[3].get_attribute('innerHTML')

# td cell with index 6 of one hard-coded row: look for the 'person-row' element
cell = poster_table_lines[568].find_elements_by_tag_name('td')[6]
print(cell.get_attribute('innerHTML'))
cell.find_element_by_class_name('person-row').get_attribute('innerHTML')

# td cell with index 11 of the same row: click on the attachment icon
cell = poster_table_lines[568].find_elements_by_tag_name('td')[11]
print(cell.get_attribute('innerHTML'))
material = cell.find_element_by_class_name('icon-attachment')
material.click()

In [6]:
%%bash -c : # This prevents cell from being executed
# Scratch code: list the rows of the element with class 'tree'
material_table = driver.find_element_by_class_name('tree')
material_table_lines = material_table.find_elements_by_css_selector('tr')
# postertable[20].get_attribute('innerHTML')
print('Number of materials', len(material_table_lines))
for row in material_table_lines:
    # The file name is the zero index td cell:
    cell = row.find_elements_by_tag_name('td')[0]
    print(cell.text)
    if '.pdf' not in cell.text:
        print('Not a PDF file!')
    # The link is the href attribute of the first element of the cell that has one
    material_link = cell.find_element_by_css_selector("[href]").get_attribute('href')
    print(material_link)
# Click on the first dialog close button
btn = driver.find_elements_by_class_name('ui-dialog-titlebar-close')
btn[0].click()

In [7]:
def find_poster(materials):
    '''
    Tries to find the poster pdf among all the
    uploaded documents

    arguments:
    - materials: dict that maps each file name to its link

    returns:
    - the file name and the link of the selected PDF file, or
      'NotFound', 'NotFound' if there is no PDF file at all
    '''

    # Select PDF files only. The file name is lower-cased first so that
    # names like 'Poster.PDF' are selected as well (find_video also
    # ignores the case of the file name).
    pdfs = {k: v for k, v in materials.items() if '.pdf' in k.lower()}

    if not pdfs:
        return 'NotFound', 'NotFound'

    # If there are more than 1 PDFs, select the first one
    # that contains the word 'poster'
    if len(pdfs) > 1:
        for k, v in pdfs.items():
            if 'poster' in k.lower():
                return k, v

    # Otherwise return the first one
    return next(iter(pdfs.items()))

def find_video(materials):
    '''
    Tries to find a video file among all the
    uploaded documents, looking at the file names only

    arguments:
    - materials: dict that maps each file name to its link

    returns:
    - the file name and the link of the first file whose upper-cased name
      contains one of the known video extensions, or
      'NotFound', 'NotFound' if there is none
    '''

    video_markers = ('.MOV', '.MPEG4', '.MP4', '.AVI', '.WMV', '.MPEGPS', '.FLV')

    for name, link in materials.items():
        upper_name = name.upper()
        if any(marker in upper_name for marker in video_markers):
            return name, link

    return 'NotFound', 'NotFound'


def get_materials_link():
    '''
    With a material pop-up window opened,
    this function reads all the materials uploaded
    and selects the poster and the video among them
    (see find_poster and find_video).

    returns:
    - link to poster file ('NotFound' if there is none)
    - link to video file ('NotFound' if there is none)
    - all links
    '''

    # Wait for completion. On timeout only a message is printed: the
    # lookup below is still attempted.
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'icon-folder-plus'))
        WebDriverWait(driver, _timeout).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")

    material_table = driver.find_element_by_class_name('tree')
    material_table_lines = material_table.find_elements_by_css_selector('tr')

    all_materials = {}
    all_links = []

    for row in material_table_lines:
        cells = row.find_elements_by_tag_name('td')
        if not cells:
            # A row without td cells cannot hold a file: skip it
            # instead of failing with an IndexError
            continue

        # The file name is the zero index td cell:
        cell = cells[0]
        try:
            material_link = cell.find_element_by_css_selector("[href]").get_attribute('href')
            all_materials[cell.text] = material_link
            all_links.append(material_link)
        except Exception:
            # Best effort: report the problem and go on with the next row.
            # Exception (not a bare except) keeps Ctrl-C working.
            print('WARNING: Problem finding href element for', cell.text)

    # Only the links are returned, the file names are not needed
    poster_material_link = find_poster(all_materials)[1]
    video_material_link = find_video(all_materials)[1]

    return poster_material_link, video_material_link, all_links

Get the table with all the contributions, and also get all the rows of this table.

In [8]:
# First table element of the current page, and all of its tr rows
# (the scraping loop below iterates over these rows)
poster_table = driver.find_element_by_css_selector('table')
poster_table_lines = poster_table.find_elements_by_css_selector('tr')

Loop over all the contribution table lines, and for each of them get:
- (element 1): the id
- (element 3): the link to the poster page
- (element 6): the presenters' names
- (element 11): the link to the materials

In [9]:
def get_poster_info(poster_page):
    '''
    Opens the poster page in a new browser tab and reads the
    poster details from it. The tab is always closed, and the
    driver is switched back to the first window, before returning,
    also when something goes wrong while reading the page.

    arguments:
    - poster_page: url of the poster page

    returns:
    - list of primary authors
    - list of co-authors
    - mini-abstract
    - collaboration ('' if not found)
    - abstract url
    - poster session number (-1 if not found)

    If the page does not load within _timeout, empty values
    and a poster session of -1 are returned.
    '''

    primary_authors = []
    co_authors = []

    # The url is handed over as a script argument instead of being pasted
    # into the JavaScript source, so a quote in the url cannot break the script
    driver.execute_script("window.open(arguments[0]);", poster_page)
    driver.switch_to.window(driver.window_handles[1])

    try:
        try:
            element_present = EC.presence_of_element_located((By.ID, 'field-8814'))
            WebDriverWait(driver, _timeout).until(element_present)
        except TimeoutException:
            print("Timed out waiting for page to load")
            return primary_authors, co_authors, '', '', '', -1

        speaker_lists = driver.find_elements_by_class_name('speaker-list')

        # Primary Authors (second 'speaker-list' element, when present)
        if len(speaker_lists) > 1:
            for element in speaker_lists[1].find_elements_by_tag_name('a'):
                primary_authors.append(element.find_element_by_tag_name('span').get_attribute('innerHTML'))

        # Co-Authors (third 'speaker-list' element, when present)
        if len(speaker_lists) > 2:
            for element in speaker_lists[2].find_elements_by_tag_name('a'):
                co_authors.append(element.find_element_by_tag_name('span').get_attribute('innerHTML'))

        # Mini-abstract
        mini_abstract = driver.find_element_by_id('field-8814').find_element_by_tag_name('p').get_attribute('innerHTML')

        # Collaboration (newlines and the indentation of the html are removed)
        other_fields = driver.find_elements_by_class_name('other-fields')
        try:
            collaboration = other_fields[0].find_element_by_tag_name('td').get_attribute('innerHTML')
            collaboration = collaboration.replace('\n', '')
            collaboration = collaboration.replace('                            ', '')
            collaboration = collaboration.replace('                    ', '')
        except Exception:
            collaboration = ''

        # Abstract url
        abstract_url = driver.find_element_by_class_name('icon-file-pdf').get_attribute('href')

        # Contrib session: the number that follows 'poster session '
        try:
            poster_session = driver.find_element_by_class_name('contrib-session').find_element_by_tag_name('a').get_attribute('innerHTML')
            poster_session = int(poster_session.lower().replace('poster session ', ''))
        except Exception:
            poster_session = -1

        return primary_authors, co_authors, mini_abstract, collaboration, abstract_url, poster_session
    finally:
        # Runs on every exit path (normal return, timeout, exception):
        # otherwise a failure would leave the tab open and the driver
        # pointing at the wrong window for all the following posters
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    
# print(get_poster_info('https://indico.fnal.gov/event/19348/contributions/186406/'))
# print(get_poster_info('https://indico.fnal.gov/event/19348/contributions/186356/'))



In [10]:
# Row of the table to start from. After an interruption, set this to the
# row to resume from: the records collected so far are only reset when
# starting from row 0.
current_line = 0

if current_line == 0:
    all_posters['posters'] = []

start = datetime.datetime.now()

driver.switch_to.window(driver.window_handles[0])

for i in range(current_line, len(poster_table_lines)):

    current_line = i

    row = poster_table_lines[i]

    # Defaults for this row. Every value is reset here so that nothing
    # leaks from the previous row (poster_session included).
    poster_id = -1
    poster_page = None
    title = 'None'
    presenters = 'None'
    poster_link = 'None'
    video_link = 'None'
    all_links = []
    primary_authors = []
    co_authors = []
    mini_abstract = ''
    collaboration = ''
    abstract_url = ''
    poster_session = -1

    for j, cell in enumerate(row.find_elements_by_tag_name('td')):
        #
        # Poster ID
        #
        if j == 1:
            try:
                poster_id = cell.find_element_by_class_name('vertical-aligner').get_attribute('innerHTML')
                print('Fetching Poster ID:', poster_id)
            except Exception:
                poster_id = -2
                print('WARNING: Cannot find id name for')
        #
        # Poster Page Link and Title
        #
        if j == 3:
            try:
                poster_page = cell.find_element_by_css_selector("[href]").get_attribute('href')
                title = cell.find_element_by_css_selector("[href]").get_attribute('innerHTML')
            except Exception as err:
                print('WARNING: Cannot find poster page link:', err)
        #
        # Presenters
        #
        if j == 6:
            try:
                presenters = cell.find_element_by_class_name('person-row').get_attribute('innerHTML')
            except Exception:
                presenters = 'None'
                print('WARNING: Cannot find author name for')
        #
        # Materials
        #
        if j == 11:
            material = cell.find_element_by_class_name('icon-attachment')
            # The icon text is 'None' when nothing was uploaded: in that
            # case there is no pop-up to open
            if material.text != 'None':
                material.click()
                poster_link, video_link, all_links = get_materials_link()
                close_btn = driver.find_elements_by_class_name('ui-dialog-titlebar-close')
                close_btn[0].click()

    #
    # Authors, Miniabstract, collaboration, abstract url, poster_session
    #
    if poster_page is not None:
        primary_authors, co_authors, mini_abstract, collaboration, abstract_url, poster_session = get_poster_info(poster_page)

    # Rows without presenters are not stored. Strings are compared
    # with != : 'is not' checks identity and only works by accident.
    if presenters != 'None':
        all_posters['posters'].append({'id': poster_id,
                                       'title': title,
                                       'presenters': presenters,
                                       'primary_authors': ','.join(primary_authors),
                                       'co_authors': ','.join(co_authors),
                                       'collaboration': collaboration,
                                       'mini_abstract': mini_abstract,
                                       'poster_link': poster_link,
                                       'video_link': video_link,
                                       'all_links': ','.join(all_links),
                                       'abstract': 'null',
                                       'track': 'null',
                                       'category': 'null',
                                       'abstract_url': abstract_url,
                                       'poster_session': poster_session
                                      })

    # Wait for completion
    time.sleep(0.5)

    # In debug mode only the first rows are processed
    if (i > 10 and _debug):
        break

elapsed = (datetime.datetime.now() - start).total_seconds()
print('Time to complete:', elapsed, 's or', elapsed/60., ' minutes.')

Fetching Poster ID: 4
Fetching Poster ID: 5
Fetching Poster ID: 6
Fetching Poster ID: 8
Fetching Poster ID: 9
Fetching Poster ID: 11
Fetching Poster ID: 12
Fetching Poster ID: 13
Fetching Poster ID: 14
Fetching Poster ID: 15
Fetching Poster ID: 16
Fetching Poster ID: 17
Fetching Poster ID: 18
Fetching Poster ID: 19
Fetching Poster ID: 20
Fetching Poster ID: 21
Fetching Poster ID: 22
Fetching Poster ID: 24
Fetching Poster ID: 26
Fetching Poster ID: 27
Fetching Poster ID: 28
Fetching Poster ID: 29
Fetching Poster ID: 35
Fetching Poster ID: 36
Fetching Poster ID: 37
Fetching Poster ID: 38
Fetching Poster ID: 40
Fetching Poster ID: 41
Fetching Poster ID: 42
Fetching Poster ID: 43
Fetching Poster ID: 44
Fetching Poster ID: 45
Fetching Poster ID: 46
Fetching Poster ID: 47
Fetching Poster ID: 48
Fetching Poster ID: 49
Fetching Poster ID: 50
Fetching Poster ID: 51
Fetching Poster ID: 52
Fetching Poster ID: 53
Fetching Poster ID: 54
Fetching Poster ID: 55
Fetching Poster ID: 56
Fetching Poster 

Fetching Poster ID: 364
Fetching Poster ID: 365
Fetching Poster ID: 366
Fetching Poster ID: 367
Fetching Poster ID: 368
Fetching Poster ID: 369
Fetching Poster ID: 370
Fetching Poster ID: 371
Fetching Poster ID: 372
Fetching Poster ID: 373
Fetching Poster ID: 374
Fetching Poster ID: 375
Fetching Poster ID: 376
Fetching Poster ID: 377
Fetching Poster ID: 378
Fetching Poster ID: 379
Fetching Poster ID: 380
Fetching Poster ID: 381
Fetching Poster ID: 382
Fetching Poster ID: 383
Fetching Poster ID: 384
Fetching Poster ID: 385
Fetching Poster ID: 386
Fetching Poster ID: 387
Fetching Poster ID: 388
Fetching Poster ID: 389
Fetching Poster ID: 390
Fetching Poster ID: 391
Fetching Poster ID: 392
Fetching Poster ID: 393
Fetching Poster ID: 395
Fetching Poster ID: 396
Fetching Poster ID: 397
Fetching Poster ID: 398
Fetching Poster ID: 400
Fetching Poster ID: 401
Fetching Poster ID: 402
Fetching Poster ID: 403
Fetching Poster ID: 404
Fetching Poster ID: 405
Fetching Poster ID: 406
Fetching Poster 

## Write to JSON file

In [11]:
# Dump the whole all_posters dict to the JSON output file, indented by 4 spaces
with open(_out_posters_json_file, 'w') as outfile:
    json.dump(all_posters, outfile, indent=4)

## Write to CSV file

In [12]:
# Write one CSV row per poster, preceded by a header row with the keys
# of the first poster record.
# - The with-block closes the file also when writing fails.
# - newline='' is what the csv module asks for when opening the file:
#   without it, an extra blank line can appear after each row on Windows.
with open(_out_posters_csv_file, 'w', newline='') as data_file:

    # create the csv writer object
    csv_writer = csv.writer(data_file)

    for count, element in enumerate(all_posters['posters']):

        if count == 0:
            # Writing headers of CSV file
            csv_writer.writerow(element.keys())

        # Writing data of CSV file
        csv_writer.writerow(element.values())

## Download all posters

In [272]:
# Copy the cookies of the browser session into a requests session,
# which is then used to download the files.
# A plain loop is used: the calls are made for their side effect only,
# there is no list to build.
request_cookies_browser = driver.get_cookies()
session = requests.Session()
for cookie in request_cookies_browser:
    session.cookies.set(cookie['name'], cookie['value'])

# response = s.get('https://indico.fnal.gov/event/19348/contributions/186677/attachments/129251/156732/nu2020_preSNnu_huiling.pdf') #I get a 200 status_code

# with open(f'{_poster_dir}/TEST_POSTER.pdf', 'wb') as f:
#     f.write(response.content)

In [273]:
# Create the output directory if it does not exist yet
# (os.makedirs needs no shell and works on every platform)
os.makedirs(_poster_dir, exist_ok=True)

# n_total counts all the poster records, n_bad those without a poster link
n_bad = 0
n_total = 0

for p in all_posters['posters']:
    poster_id = p['id']
    link = p['poster_link']

    n_total += 1

    # Negative ids mark the rows for which no id could be read
    if int(poster_id) < 0:
        print('No poster id!')
        continue

    # 'None': no material uploaded, 'NotFound': no PDF among the materials
    if link in ('None', 'NotFound'):
        print('No link for poster with id', poster_id)
        n_bad += 1
        continue

    print('Downloading poster with id', poster_id)

    response = session.get(link)
    if not response.ok:
        # Do not save an error page as if it were the poster
        print('WARNING: Download failed for poster with id', poster_id,
              '- HTTP status', response.status_code)
        continue

    # os.path.join avoids the double slash when _poster_dir ends with '/'
    with open(os.path.join(_poster_dir, f'poster_id_{poster_id}.pdf'), 'wb') as f:
        f.write(response.content)

No link for poster with id 4
Downloading poster with id 5
Downloading poster with id 6
No link for poster with id 8
Downloading poster with id 9
Downloading poster with id 11
Downloading poster with id 12
Downloading poster with id 13
Downloading poster with id 14
Downloading poster with id 15
Downloading poster with id 16
Downloading poster with id 17
Downloading poster with id 18
Downloading poster with id 19
Downloading poster with id 20
No link for poster with id 21
Downloading poster with id 22
Downloading poster with id 24
Downloading poster with id 26
Downloading poster with id 27
Downloading poster with id 28
Downloading poster with id 29
Downloading poster with id 35
Downloading poster with id 36
Downloading poster with id 37
No link for poster with id 38
Downloading poster with id 40
Downloading poster with id 41
Downloading poster with id 42
Downloading poster with id 43
Downloading poster with id 44
Downloading poster with id 45
No link for poster with id 46
No link for pos

Downloading poster with id 299
Downloading poster with id 300
Downloading poster with id 301
Downloading poster with id 302
Downloading poster with id 303
Downloading poster with id 304
Downloading poster with id 305
Downloading poster with id 306
Downloading poster with id 307
No link for poster with id 308
Downloading poster with id 309
Downloading poster with id 310
Downloading poster with id 311
Downloading poster with id 312
Downloading poster with id 313
Downloading poster with id 314
Downloading poster with id 315
Downloading poster with id 316
Downloading poster with id 317
Downloading poster with id 318
Downloading poster with id 319
Downloading poster with id 320
Downloading poster with id 321
Downloading poster with id 322
No link for poster with id 323
Downloading poster with id 324
Downloading poster with id 325
No link for poster with id 326
No link for poster with id 327
Downloading poster with id 328
Downloading poster with id 329
Downloading poster with id 330
Download

Downloading poster with id 569
Downloading poster with id 570
Downloading poster with id 571
No link for poster with id 572
Downloading poster with id 573
Downloading poster with id 574
Downloading poster with id 575
Downloading poster with id 576
Downloading poster with id 577
Downloading poster with id 578
Downloading poster with id 579
Downloading poster with id 580
Downloading poster with id 581
Downloading poster with id 582
Downloading poster with id 583
Downloading poster with id 584
No link for poster with id 585
Downloading poster with id 586
Downloading poster with id 587
Downloading poster with id 588
No link for poster with id 589
No link for poster with id 590
No link for poster with id 591
No link for poster with id 592
No link for poster with id 593
No link for poster with id 594
No link for poster with id 595
Downloading poster with id 597
Downloading poster with id 600
Downloading poster with id 609
Downloading poster with id 610
Downloading poster with id 611
Download

In [291]:
# Percentage of posters without a link (0 when there are no posters at all,
# to avoid a division by zero)
bad_percent = n_bad / n_total * 100. if n_total else 0.
print(n_bad, 'posters don\'t have links out of', n_total, ' (', bad_percent, ' %)')

91 posters don't have links out of 582  ( 15.63573883161512  %)
