# Downloads all posters from Indico

This script uses Selenium to fetch all the info from Indico.

Make sure you have downloaded the Firefox geckodriver and it is in your `PATH`.

Also make sure you have exported your SSO password as env variable `SSO_PWD` and your SSO username as `SSO_USER` before starting this notebook.

To download geckodriver:

- Go to: https://github.com/mozilla/geckodriver/releases
- Download: geckodriver-v0.26.0-macos.tar.gz
- Place the geckodriver in some folder and make sure that folder is in your PATH.

In [1]:
import time
import random
import os
import requests

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import json

Some settable variables:

In [2]:
# Debug flag: when True, the helpers print extra details and the
# contribution loop stops early (once the row index exceeds 10).
_debug = False

# Name of the output JSON file that receives the collected poster records
_out_posters_json_file = 'posters.json'

# Directory where all posters will be downloaded
_poster_dir = 'posters/'

In [3]:
# Container for every poster record that gets collected; records are
# appended to the list stored under the 'posters' key.
all_posters = {'posters': []}

## Open Session
Open a new webdriver session, go to Indico and sign in.

In [4]:
# Fail early, with a clear message, when the SSO credentials have not been
# exported; otherwise the failure only shows up later as an obscure error
# while typing into the login form.
_missing_env = [name for name in ('SSO_USER', 'SSO_PWD') if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(_missing_env)
        + '. Export them before starting this notebook.'
    )

# Open indico
driver = webdriver.Firefox()
driver.get("https://indico.fnal.gov/event/19348/manage/contributions/") # url for our conference

# Log-in via SSO: pick the SSO provider, then confirm on the next page
sso_btn = driver.find_element_by_class_name('external-provider-shib-sso')
sso_btn.click()

continue_btn = driver.find_element_by_class_name('ping-button')
continue_btn.click()

# Fill in the user name taken from the SSO_USER environment variable
inputUser = driver.find_element_by_id('username')
inputUser.clear()
inputUser.send_keys(os.getenv('SSO_USER'))

# Fill in the password taken from the SSO_PWD environment variable.
# Never hard-code credentials in this cell, not even in a comment.
inputPassword = driver.find_element_by_id('password')
inputPassword.clear()
inputPassword.send_keys(os.getenv('SSO_PWD'))
# Pressing RETURN in the password field submits the form
inputPassword.send_keys(Keys.RETURN)

The next two cells were me playing around; we don't need to execute them.

In [5]:
%%bash -c : # This prevents cell from being executed

# Scratch exploration (this cell is disabled by the magic line above):
# take the first <table> on the page and collect all of its <tr> rows.
poster_table = driver.find_element_by_css_selector('table')
poster_table_lines = poster_table.find_elements_by_css_selector('tr')
# Earlier experiments, kept for reference:
# poster_table_lines[20].get_attribute('innerHTML')
# poster_table_lines[3].find_elements_by_tag_name('td')[3].get_attribute('innerHTML')
# NOTE: this counts every <tr>, which may include non-poster rows such as
# a header row -- TODO confirm.
print('Number of posters', len(poster_table_lines))

# Look at the inner HTML of the <td> at index 3 in the row at index 1.
poster_table_lines[1].find_elements_by_tag_name('td')[3].get_attribute('innerHTML')

# <td> index 6 of an arbitrary row (568): print the raw HTML, then read the
# 'person-row' element, which the notebook uses as the author name.
cell = poster_table_lines[568].find_elements_by_tag_name('td')[6]
print(cell.get_attribute('innerHTML'))
cell.find_element_by_class_name('person-row').get_attribute('innerHTML')

# <td> index 11 of the same row: print the raw HTML, then click the
# attachment icon (presumably this opens the materials pop-up -- confirm).
cell = poster_table_lines[568].find_elements_by_tag_name('td')[11]
print(cell.get_attribute('innerHTML'))
material = cell.find_element_by_class_name('icon-attachment')
material.click()

In [6]:
%%bash -c : # This prevents cell from being executed
# Scratch exploration (this cell is disabled by the magic line above):
# read the element with class 'tree' and collect all of its <tr> rows.
material_table = driver.find_element_by_class_name('tree')
material_table_lines = material_table.find_elements_by_css_selector('tr')
# Earlier experiment, kept for reference:
# poster_table_lines[20].get_attribute('innerHTML')
print('Number of materials', len(material_table_lines))
for row in material_table_lines:
    # The file name is the zero index td cell:
    cell = row.find_elements_by_tag_name('td')[0]
    print(cell.text)
    # Only report non-PDF entries; the link is still printed for every row.
    if '.pdf' not in cell.text:
        print('Not a PDF file!')
    # Take the href of the first descendant of the cell that has an href attribute.
    material_link = cell.find_element_by_css_selector("[href]").get_attribute('href')
    print(material_link)
# Click the first element with the dialog-titlebar close class.
btn = driver.find_elements_by_class_name('ui-dialog-titlebar-close')
btn[0].click()

In [7]:
def get_materials_link():
    """
    Pick the first PDF listed in the currently opened materials pop-up.

    Reads the rows of the element with class 'tree' through the
    module-level ``driver`` and stops at the first row whose first cell
    text contains '.pdf' (compared case-insensitively).
    TODO: do we want the first? Maybe need to define another criterion.

    Returns:
        A ``(file_name, material_link)`` tuple: the text of the matching
        cell and the href found inside it. Both values are the string
        'NotFound' when no row matches.
    """
    material_table = driver.find_element_by_class_name('tree')
    material_rows = material_table.find_elements_by_css_selector('tr')

    material_link = 'NotFound'
    file_name = 'NotFound'

    print('Number of materials', len(material_rows))
    for row in material_rows:
        cells = row.find_elements_by_tag_name('td')
        if not cells:
            # A row without any <td> carries no file name; skip it instead
            # of failing with an IndexError.
            continue

        # The file name is the zero index td cell:
        cell = cells[0]
        cell_text = cell.text
        if _debug: print(cell_text)

        # Case-insensitive match so that names such as 'POSTER.PDF' count.
        if '.pdf' in cell_text.lower():
            file_name = cell_text
            material_link = cell.find_element_by_css_selector("[href]").get_attribute('href')
            if _debug: print(material_link)
            break
    return file_name, material_link

Get the table with all the contributions, and also get all the rows of this table.

In [11]:
# Take the first <table> element on the current page ...
poster_table = driver.find_element_by_css_selector('table')
# ... and collect every <tr> row inside it.
poster_table_lines = poster_table.find_elements_by_css_selector('tr')

Loop over all the contribution table lines, and for each of them get:
- (element 6): the author name
- (element 11): the link to the poster

In [12]:
# Positions (td index) of the columns of interest in the contribution table.
_author_col = 6
_material_col = 11

# Walk every table row and record the author together with the first PDF
# attached to the contribution. Rows for which no author is found are not
# recorded.
for i, row in enumerate(poster_table_lines):
    # The string 'None' is the placeholder used in the printed log and in
    # the stored records.
    author = 'None'
    link = 'None'
    file_name = 'None'

    cells = row.find_elements_by_tag_name('td')

    if len(cells) > _author_col:
        try:
            author = cells[_author_col].find_element_by_class_name('person-row').get_attribute('innerHTML')
        except Exception:
            # Best effort: a row without a 'person-row' element is skipped
            # below. Unlike a bare except, this still lets Ctrl-C through.
            author = 'None'
            print(f'WARNING: Cannot find author name for row {i}')
        print('Author:', author)

    if len(cells) > _material_col:
        material = cells[_material_col].find_element_by_class_name('icon-attachment')
        if material.text != 'None':
            # Click the attachment icon, read the materials listing, then
            # close the dialog before moving on to the next row.
            material.click()
            file_name, link = get_materials_link()
            close_btn = driver.find_elements_by_class_name('ui-dialog-titlebar-close')
            close_btn[0].click()

        print('Link:', link)

    # Compare by value: identity checks against a string literal only work
    # by accident of interning.
    if author != 'None':
        all_posters['posters'].append({'author': author,
                                       'file_name': file_name,
                                       'file_link': link
                                      })

    # In debug mode only the first rows are processed.
    if i > 10 and _debug:
        break

Author: Avinay Bhat
Link: None
Author: ITISHREE SETHI
Link: None
Author: Marcos Dracos
Link: None
Author: Nick Solomey
Link: None
Author: João Paulo Pinheiro
Link: None
Author: Huiling Li
Link: None
Author: James Kneller
Link: None
Author: Peibo An
Link: None
Author: Chris Rogers
Link: None
Author: Tom Lord
Link: None
Author: Paul Jurj
Link: None
Author: Craig Brown
Link: None
Author: Vincent Cecchini
Link: None
Author: Tamer Tolba
Link: None
Author: Jan Behrens
Link: None
Author: Lukas Hauertmann
Link: None
Author: Xiang Liu
Link: None
Author: Martin Schuster
Link: None
Author: Osamu Yasuda
Link: None
Author: Jianming Bian
Link: None
Author: Yu-Feng Li
Link: None
Author: Sankagiri Umasankar
Link: None
Author: Wojciech Flieger
Link: None
Author: Justin Evans
Link: None
Author: Jaydeep Datta
Link: None
Author: Florian Priester
Link: None
Author: Mark Anderson
Link: None
Author: Daljeet Kaur
Link: None
Author: José SOTO
Link: None
Author: Ana Gallego Ros
Link: None
Author: Linyan WAN
Lin

Author: Yanina Biondi
Link: None
Author: Daniel Siegmann
Link: None
Author: Xingyu Zhao
Link: None
Author: Andrey Romanov
Link: None
Author: Maxim Gromov
Link: None
Author: Ushak Rahaman
Link: None
Author: Alessio Giarnetti
Link: None
Author: J. Pedro Ochoa
Link: None
Author: Manoj Kumar Singh
Link: None
Author: Elizabeth Friedman
Link: None
Author: Michael Larson
Link: None
Author: Shreyashi Chakdar
Link: None
Author: Ran Itay
Link: None
Author: Davio Cianci
Link: None
Author: Michael Willers
Link: None
Author: Mauricio Bustamante
Link: None
Author: Joseph Smolsky
Link: None
Author: Irina Lomskaya
Link: None
Author: Deepak Tiwari
Link: None
Author: Brenda Aurea Cervantes Vergara
Link: None
Author: yaping cheng
Link: None
Author: Hiroshi Ogawa
Link: None
Author: Lu Ren
Link: None
Author: Damini Singh
Link: None
Author: Laura Zambelli
Link: None
Author: Guido Fantini
Link: None
Author: Matteo Cerruti
Link: None
Author: Aldo Ianni
Link: None
Author: Etienne Chardonnet
Link: None
Author: 

Author: Casandra Morris
Link: None
Author: Ka Vang Tsang
Link: None
Author: Gabriela Vitti Stenico
Link: None
Author: Doga Veske
Link: None
Author: Russell Neilson
Link: None
Author: Benjamin Foust
Link: None
Author: Jarrett Moon
Link: None
Author: Talia Weiss
Link: None
Author: Eliza Gazda
Link: None
Author: Karla Tellez-Giron-Flores
Link: None
Author: Dario Rodrigues
Link: None
Author: Victor Buridon
Link: None
Author: Ivan Pedro Sidelnik
Link: None
Author: Bryan Ramson
Link: None
Author: Vivek Singh
Link: None
Author: Tim Daniels
Link: None
Author: XIANYI ZHANG
Link: None
Author: Tyler Rehak
Link: None
Author: Marjon Moulai
Link: None
Author: Behzad Hosseini
Link: None
Author: Olivia Dalager
Link: None
Author: Ankur Nath
Link: None
Author: Lorna Nolan
Link: None
Author: David Henaff
Link: None
Author: Deepika Jena
Link: None
Author: BENITO VARGAS PEREZ
Link: None
Author: Markus Horn
Link: None
Author: Ryan Dorrill
Link: None
Author: Pranav Dave
Link: None
Author: Pieter Mumm
Link: N

In [20]:
# Persist the collected poster records as JSON indented by four spaces.
with open(_out_posters_json_file, 'w') as json_fh:
    json_fh.write(json.dumps(all_posters, indent=4))

## Download all posters

In [18]:
os.system(f'mkdir -p {_poster_dir}')

counter = 0

for p in all_posters['posters']:
    link = p['file_link']
    os.system(f'curl -o {_poster_dir}/poster_{counter} {link}')