In [None]:
!pip install beautifulsoup4
!pip install requests
!pip install scrapy

### Scrapy Selectors

In [1]:
import scrapy
import requests
import pandas as pd
import numpy as np

from scrapy import Selector

In [5]:
url = 'https://eventos.itam.mx/es/evento/seminario-de-perspectivas-economicas-2024'
html = requests.get(url).content

sel = Selector(text = html)
fecha = sel.xpath('//div[@id="fecha-evento"]').extract()
fecha[0]

'<div id="fecha-evento">\t    \r\n\t\t\t\t12 de enero de 2024<br>De 8.00 a 15.30 h                </div>'

In [11]:
import re

def clean_string(input_string):
    # Remove everything inside '<>'
    cleaned_string = re.sub(r'<[^>]+>', ' ', input_string)
    
    # Remove '\' and the letter right next to it
    cleaned_string = re.sub(r'\\.', '', cleaned_string)
    
    # Remove any double spaces
    cleaned_string = re.sub(r'\s+', ' ', cleaned_string)
    
    return cleaned_string.strip()

fecha_clean = clean_string(fecha[0])
fecha_clean

'12 de enero de 2024 De 8.00 a 15.30 h'

In [9]:
evento = sel.xpath('//div[@id="cuerpo-evento"]//p[@dir="ltr" and not(@class="rtecenter")]').extract()

In [12]:
evento_clean = []
for ev in evento:
    event = clean_string(ev)
    evento_clean.append(event)
evento_clean

['Te invitamos a nuestro tradicional Seminario de Perspectivas Económicas 2024.',
 '8.00 h – Bienvenida',
 'Humberto López, Asociación de Ex Alumnos del ITAM, Presidente',
 '8.15 h – Inauguración. Palabras del Rector del Instituto Tecnológico',
 'Autónomo de México',
 'Arturo Fernández, ITAM, Rector',
 '8.30 h – Conferencia magistral - Perspectivas de las Américas ( Zoom )',
 'Ilan Goldfajn, Banco Interamericano de Desarrollo, Presidente',
 '9.00 h – Mesa de pronósticos',
 'Carlos Capistrán, Bank of America, Economista en Jefe para México y Canadá',
 'Ernesto Revilla, Citigroup, Economista en Jefe para América Latina',
 'Alejandrina Salcedo, Banco de México, Directora General de Investigación Económica',
 'Moderador: Miguel Messmacher, ITAM, Director general de la División Académica de Ciencias Sociales',
 '',
 '10:00 h – Mesa de escenarios políticos de las elecciones en México',
 'Luis Carlos Ugalde, Integralia Consultores, Director General',
 'Federico Reyes-Heroles, Transparencia Me

In [59]:
import pandas as pd

people = [element for element in evento_clean if ',' in element]
people_df = pd.DataFrame(people, columns=['name'])
people_df['speaker'] = people_df['name'].apply(lambda x: x.split(',')[0].strip())
people_df['moderator'] = people_df['speaker'].apply(lambda x: 1 if 'Moderador:' in x or 'Moderadora:' in x else 0)
people_df['speaker'] = people_df['speaker'].str.replace('Moderador\\:|Moderadora\\:', '', regex=True).str.strip()
people_df['occupation'] = people_df['name'].str.extract(r',(.*)')
people_df['year'] = 2024
people_df = people_df.drop(['name'], axis = 1)
people_df

Unnamed: 0,speaker,moderator,occupation,year
0,Humberto López,0,"Asociación de Ex Alumnos del ITAM, Presidente",2024
1,Arturo Fernández,0,"ITAM, Rector",2024
2,Ilan Goldfajn,0,"Banco Interamericano de Desarrollo, Presidente",2024
3,Carlos Capistrán,0,"Bank of America, Economista en Jefe para Méxi...",2024
4,Ernesto Revilla,0,"Citigroup, Economista en Jefe para América La...",2024
5,Alejandrina Salcedo,0,"Banco de México, Directora General de Investi...",2024
6,Miguel Messmacher,1,"ITAM, Director general de la División Académi...",2024
7,Luis Carlos Ugalde,0,"Integralia Consultores, Director General",2024
8,Federico Reyes-Heroles,0,"Transparencia Mexicana, Presidente del Consej...",2024
9,Agustín Basave,0,"Universidad de Monterrey, Director del Instit...",2024


In [65]:
def scrape_seminario(year):
    url = f'https://eventos.itam.mx/es/evento/seminario-de-perspectivas-economicas-{year}'
    html = requests.get(url).content
    sel = Selector(text = html)
    evento = sel.xpath('//div[@id="cuerpo-evento"]//p').extract()
    
    evento_clean = []
    for ev in evento:
        event = clean_string(ev)
        evento_clean.append(event)
    
    people = [element for element in evento_clean if ',' in element]
    people_df = pd.DataFrame(people, columns=['name'])
    people_df = people_df[~people_df['name'].str.contains('Cuota de recuperación\\:|reembolso|Perspectives|invitamos|horario|Bienvenida|acompañarnos')]
    people_df['speaker'] = people_df['name'].apply(lambda x: x.split(',')[0].strip())
    people_df['moderator'] = people_df['speaker'].apply(lambda x: 1 if 'Moderador:' in x or 'Moderadora:' in x
                                                        or 'Modera:' in x else 0)
    people_df['speaker'] = people_df['speaker'].str.replace('Moderador\\:|Moderadora\\:|Modera\\:',
                                                            '', regex=True).str.strip()
    people_df['occupation'] = people_df['name'].str.extract(r',(.*)')
    people_df['year'] = year
    people_df = people_df.drop(['name'], axis = 1)
    
    return(people_df)


In [66]:
df_final = pd.DataFrame()
for y in range(2020, 2025):
    df = scrape_seminario(y)
    df_final = pd.concat([df_final, df]).reset_index(drop=True)
    
df_final

Unnamed: 0,speaker,moderator,occupation,year
0,Arturo Fernández P.,0,"ITAM , Rector",2020
1,Alejandro Díaz de León Carrillo,0,"Banco de México, Gobernador",2020
2,Alejandro Werner Wainfeld,0,"FMI, Dir. Depto. del Hemisferio Occidental",2020
3,Iván Moguel,0,"Chévez Ruiz Zamarripa, Socio",2020
4,Lorenza Martinez,1,"Accenture México, Managing Director",2020
...,...,...,...,...
86,Rafael Fernández de Castro,0,Centro de Estudios México-Estados Unidos UCSD...,2024
87,Arturo Sarukhán,0,"Sarukhán y asociados, Presidente",2024
88,Jorge Suárez-Vélez,0,"Allen &amp; Company, Director",2024
89,Ana María Salazar,1,"Grupo Salazar Slack SC, Directora",2024


In [68]:
df_final.to_excel('../data/seminario_attendees.xlsx')

### Bots:

## Recommendations to run this notebook

You might need to install some of the libraries being used, you can do so by running:

```python

!pip install [package_name] --upgrade

```

#### If you have a MAC arm-based series and this code isn't working:

All the tests for this code were run in a MAC M1, installing a webdriver is a littly buggy on M1 so with my current virtual environment this code should work, (with minor issues, i.e., you need to restart the Kernel after the loop is over to run the code again). If you don't manage to make this code work with your won setting you can then install the dependencies I used in a virtual environment on your own computer using:

```python
!pip install -r requirements.txt

```

In [1]:
from selenium import webdriver
import selenium
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome()

#s=Service(ChromeDriverManager().install()) #MAC user might need this
#driver = webdriver.Chrome(service=s)

In [2]:
driver.get('http://twitter.com/login')

In [3]:
import time
# Set up your own password and username:
handle = ''
password = ''

driver.get('http://twitter.com/login')
#driver.maximize_window()
time.sleep(5)
#login
driver.find_element(By.NAME,'text').send_keys(handle)
time.sleep(3)
driver.find_element(By.XPATH, 
                    '//*[@id="layers"]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div/div/div/div[6]/div').click()
time.sleep(3)
driver.find_element(By.XPATH,'//*[@id="layers"]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[1]/div/div/div[3]/div/label/div/div[2]/div[1]/input').send_keys(password)
time.sleep(7)
driver.find_element(By.XPATH, '//*[@id="layers"]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[2]/div/div[1]/div/div/div/div').click()
time.sleep(3)

In [4]:
driver.get(f'https://twitter.com/HLarreguy')
time.sleep(1)
profile = driver.find_elements(By.XPATH, '//span[@data-testid="UserJoinDate"]')
for p in profile:
    Date = p.text
Date

'Joined February 2021'

## Scraping the Trending Topics of Today

In [5]:
driver.get(f'https://twitter.com/explore/tabs/trending')

time.sleep(5)

trend = driver.find_elements(By.XPATH,
                             '//div[@data-testid="cellInnerDiv" and not(@style="transform: translateY(0px); position: absolute; width: 100%;")]/div/div/div/div')

Name = []
posts = []
for t in trend:
    Name1 = t.find_element(By.XPATH, './/div[@style="text-overflow: unset; color: rgb(15, 20, 25);"]').text
    Name.append(Name1)
    
    posts1 = t.find_element(By.XPATH, './div[3]').text
    posts.append(posts1)

df = pd.DataFrame(zip(Name, posts), columns=['trend','n_posts'])
df.to_excel('../data/trends_18_01_2024.xlsx')

NameError: name 'pd' is not defined

In [17]:
df

Unnamed: 0,trend,n_posts
0,#ConClaudiaGanamos,"4,291 posts"
1,Paramore,19.8K posts
2,#FelizJueves,10.9K posts
3,HOY SE ESTRENA BOBO,11.9K posts
4,Unionistas,33.2K posts
5,Milei,768K posts
6,Andrés Guardado,"5,907 posts"
7,Davos,816K posts
8,Balde,14.1K posts
9,MAÑANA BOBO EN SPOTIFY,42.7K posts


In [44]:
# Scrolling down: LOOK AT THE GOOGLE CHROME TAB TO SEE MY TWEETS :)
driver.get(f'https://twitter.com/ZagoZaguinho1/with_replies')
scroll_distance = 2500
scroll_pos=[1]
i = 0
t = 0
while True:
    time.sleep(2) # ESTA LINEA PERMITE QUE EL CÓDIGO SE DETENGA DOS SEGUNDOS, PARA NO TENER PROBLEMAS DE INTERNET, ETC.
    current_scroll_pos = driver.execute_script("return window.scrollY;") # REGRESA LA POSICIÓN EN LA QUE ESTAMOS EN LA PAG.
    scroll_pos.append(current_scroll_pos) # LA GUARDA EN UNA LISTA
    
    driver.execute_script(f'window.scrollBy(0, {scroll_distance});') # SCROLLEAMOS
    time.sleep(5)
    t+=1 # AVANZAMOS EL CONTADOR == t = t + 1
    print('Scroll ', i)
    print(scroll_pos[t-1], scroll_pos[t])
    if (scroll_pos[t-1] == scroll_pos[t]): # ESTA CONDICIÓN NOS PERMITIRÁ SALIRNOS DEL LOOP
        print("you've reached the end") # CUANDO LA POSICIÓN DE LA PAGINA SEA IGUAL ENTRE PERIODOS, AHI SALE DEL WHILE
        break 
    
    i+=1
    

Scroll  0
1 0
Scroll  1
0 2500
Scroll  2
2500 5000
Scroll  3
5000 7500
Scroll  4
7500 10000
Scroll  5
10000 12500
Scroll  6
12500 15000
Scroll  7
15000 17500
Scroll  8
17500 20000
Scroll  9
20000 22500
Scroll  10
22500 23589.599609375
Scroll  11
23589.599609375 26089.599609375
Scroll  12
26089.599609375 28589.599609375
Scroll  13
28589.599609375 31008.80078125
Scroll  14
31008.80078125 31176
Scroll  15
31176 31176
you've reached the end


In [49]:
UserTags=[]
driver.get(f'https://twitter.com/ZagoZaguinho1/with_replies')
scroll_distance = 2500
TimeStamps=[]
Tweets=[]
scroll_pos=[1]
old_tweet='new'
rate='good'
    
articles = driver.find_elements(By.XPATH,"//article[@data-testid='tweet']") # INICIALIZAMOS EL HTML, CON LA PRIMERA PAGINA SIN SCROLLEAR
t=0 
while True:
    time.sleep(2)
    current_scroll_pos = driver.execute_script("return window.scrollY;") 
    scroll_pos.append(current_scroll_pos)
    types = "0"
        
    for article in articles: # SCRAPEAMOS LA PAGINA EN LA QUE VAMOS DEL SCROLLEO
        
        try: # TRY-EXCEPT NOS SIRVE PARA QUE EL CÓDIGO NO SE DETENGA CON ERRORES (NO ENCUENTRA ESE ELEMENTO EN LA PAGINA, REGRESA NA)
            UserTag = article.find_element(By.XPATH,".//div[@data-testid='User-Name']").text
            UserTags.append(UserTag)
        except:
            UserTags.append(np.nan)
        try:
            TimeStamp = article.find_element(By.XPATH,".//time").get_attribute('datetime')
            TimeStamps.append(TimeStamp)
        except:
            TimeStamps.append(np.nan)

        try:
            Tweet = article.find_element(By.XPATH,".//div[@data-testid='tweetText']").text
            Tweets.append(Tweet)
        except:
            Tweets.append(np.nan)


    driver.execute_script(f'window.scrollBy(0, {scroll_distance});') # SCROLL A LA SIGUIENTE PAGINA
    time.sleep(5)
    articles = driver.find_elements(By.XPATH,"//article[@data-testid='tweet']") # SCRAPEA LOS ELEMENTOS DE ESA PAGINA
    Tweets2 = list(set(Tweets))
    t+=1

    print(scroll_pos[t-1], scroll_pos[t])
    if (scroll_pos[t-1] == scroll_pos[t]):
        print("you've reached the end")
        break

df = pd.DataFrame(zip(UserTags,TimeStamps,Tweets),
                  columns=['username','TimeStamp','text'])

1 0
0 2784
2784 5284
5284 7784
7784 10284
10284 12048
12048 14548
14548 17048
17048 19548
19548 22048
22048 24548
24548 27048
27048 29548
29548 30593.599609375
30593.599609375 30593.599609375
you've reached the end


In [50]:
df

Unnamed: 0,username,TimeStamp,text
0,Zago\n@ZagoZaguinho1\n·\n17h,2024-01-18T06:55:59.000Z,"También tu última comparación es engañosa, com..."
1,Xóchitl Gálvez Ruiz\n@XochitlGalvez\n·\nJan 17,2024-01-17T18:04:42.000Z,Por su boca muere el pez.\n\n#MéxicoVaConX
2,Zago\n@ZagoZaguinho1\n·\nJan 17,2024-01-17T21:26:17.000Z,Hahahahaha arte!!!
3,Arturo Ángel\n@arturoangel20\n·\nJan 17,2024-01-17T17:09:45.000Z,Este gráfico está tan mal en tantos aspectos m...
4,Zago\n@ZagoZaguinho1\n·\nJan 17,2024-01-17T18:51:14.000Z,A que no puedes nombrar ni uno
...,...,...,...
142,"Zago\n@ZagoZaguinho1\n·\nDec 22, 2023",2023-12-22T20:08:57.000Z,Aunque algo ha de haber ayudado! Parece ser qu...
143,"Emilio Gutierrez\n@emiliogf\n·\nDec 22, 2023",2023-12-22T15:50:01.000Z,Feliz de que ya está disponible en línea este ...
144,"Zago\n@ZagoZaguinho1\n·\nDec 22, 2023",2023-12-22T19:29:42.000Z,Muchas felicidades!
145,"NBA\n@NBA\n·\nDec 21, 2023",2023-12-22T04:39:16.000Z,"""I don’t really know where that came from.” \n..."


In [52]:
UserTags=[]
driver.get(f'https://twitter.com/ZagoZaguinho1/with_replies')
scroll_distance = 2500
TimeStamps=[]
from datetime import datetime
reference_date = datetime(2024, 1, 1) 
Tweets=[]
scroll_pos=[1]
old_tweet='new'
rate='good'
    
articles = driver.find_elements(By.XPATH,"//article[@data-testid='tweet']")
t=0 
while True:
    time.sleep(2)
    current_scroll_pos = driver.execute_script("return window.scrollY;")
    scroll_pos.append(current_scroll_pos)
    types = "0"
        
    for article in articles:
        
        try:
            UserTag = article.find_element(By.XPATH,".//div[@data-testid='User-Name']").text
            UserTags.append(UserTag)
        except:
            UserTags.append(np.nan)
        try:
            TimeStamp = article.find_element(By.XPATH,".//time").get_attribute('datetime')
            TimeStamps.append(TimeStamp)
        except:
            TimeStamps.append(np.nan)

        try:
            Tweet = article.find_element(By.XPATH,".//div[@data-testid='tweetText']").text
            Tweets.append(Tweet)
        except:
            Tweets.append(np.nan)
        
        if ((datetime.fromisoformat(TimeStamp[:-1]) < reference_date)): # ESTA CONDICION NOS PERMITE DETERNOS EN CIERTA FECHA
                old_tweet = 'old' # LA VARIABLE ESTABA INICIALIZADA COMO 'new' POR LO QUE CUANDO CUMPLE LA CONDICION SE CAMBIA
                print("you've reached a tweet older than the reference tweet")


    driver.execute_script(f'window.scrollBy(0, {scroll_distance});')
    time.sleep(5)
    articles = driver.find_elements(By.XPATH,"//article[@data-testid='tweet']")
    Tweets2 = list(set(Tweets))
    t+=1

    print(scroll_pos[t-1], scroll_pos[t])
    if (scroll_pos[t-1] == scroll_pos[t]) or (old_tweet == 'old'): # ACÁ ES DONDE AGREGAMOS LA CONDICION DE ANTES
        print("you've reached the end")
        break # SE SALE AHORA SI LLEGAS AL FINAL O SI ENCUENTRAS UN TWEET DESPUES DE LA FECHA DE REFERENCIA

df = pd.DataFrame(zip(UserTags,TimeStamps,Tweets),
                  columns=['username','TimeStamp','text'])

1 0
0 2500
2500 5000
5000 7500
7500 10000
10000 11995.2001953125
11995.2001953125 14495.2001953125
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
you've reached a tweet older than the refrence tweet
14495.2001953125 16995.19921875
you've reached the end


In [53]:
df

Unnamed: 0,username,TimeStamp,text
0,El Sabueso\n@ElSabuesoAP\n·\nJan 17,2024-01-18T00:18:20.000Z,En los cinco primeros años del gobierno de Lóp...
1,Zago\n@ZagoZaguinho1\n·\n17h,2024-01-18T06:55:59.000Z,"También tu última comparación es engañosa, com..."
2,Xóchitl Gálvez Ruiz\n@XochitlGalvez\n·\nJan 17,2024-01-17T18:04:42.000Z,Por su boca muere el pez.\n\n#MéxicoVaConX
3,Zago\n@ZagoZaguinho1\n·\nJan 17,2024-01-17T21:26:17.000Z,Hahahahaha arte!!!
4,Arturo Ángel\n@arturoangel20\n·\nJan 17,2024-01-17T17:09:45.000Z,Este gráfico está tan mal en tantos aspectos m...
...,...,...,...
79,"Zago\n@ZagoZaguinho1\n·\nDec 31, 2023",2023-12-31T19:06:56.000Z,"Innegable, ahora si..."
80,"GONZALO OLIVEROS\n@goliveros\n·\nDec 30, 2023",2023-12-31T03:57:08.000Z,Solo una cosa \n\nLa tasa de interés en México...
81,"GONZALO OLIVEROS\n@goliveros\n·\nDec 31, 2023",2023-12-31T13:50:03.000Z,"Muy parecida a la mexicana, con una situación ..."
82,"Zago\n@ZagoZaguinho1\n·\nDec 31, 2023",2023-12-31T19:02:00.000Z,"Bien Gonzalo, llegaste inadvertidamente al punto"
