In [1]:
!pip install beautifulsoup4
!pip install requests
!pip install scrapy



### Scrapy Selectors

In [2]:
import scrapy
import requests
import pandas as pd
import numpy as np

from scrapy import Selector

In [3]:
url = 'https://eventos.itam.mx/es/evento/seminario-de-perspectivas-economicas-2024'
html = requests.get(url).content

sel = Selector(text = html)
fecha = sel.xpath('//div[@id="fecha-evento"]').extract() 

In [4]:
import re

def clean_string(input_string):
    # Remove everything inside '<>'
    cleaned_string = re.sub(r'<[^>]+>', ' ', input_string)
    
    # Remove '\' and the letter right next to it
    cleaned_string = re.sub(r'\\.', '', cleaned_string)
    
    # Remove any double spaces
    cleaned_string = re.sub(r'\s+', ' ', cleaned_string)
    
    return cleaned_string.strip()

fecha_clean = clean_string(fecha[0])
fecha_clean

'12 de enero de 2024 De 8.00 a 15.30 h'

In [5]:
evento = sel.xpath('//div[@id="cuerpo-evento"]//p[@dir="ltr" and not(@class="rtecenter")]').extract()

In [6]:
evento_clean = []
for ev in evento:
    event = clean_string(ev)
    evento_clean.append(event)
evento_clean

['Te invitamos a nuestro tradicional Seminario de Perspectivas Económicas 2024.',
 '8.00 h – Bienvenida',
 'Humberto López, Asociación de Ex Alumnos del ITAM, Presidente',
 '8.15 h – Inauguración. Palabras del Rector del Instituto Tecnológico',
 'Autónomo de México',
 'Arturo Fernández, ITAM, Rector',
 '8.30 h – Conferencia magistral - Perspectivas de las Américas ( Zoom )',
 'Ilan Goldfajn, Banco Interamericano de Desarrollo, Presidente',
 '9.00 h – Mesa de pronósticos',
 'Carlos Capistrán, Bank of America, Economista en Jefe para México y Canadá',
 'Ernesto Revilla, Citigroup, Economista en Jefe para América Latina',
 'Alejandrina Salcedo, Banco de México, Directora General de Investigación Económica',
 'Moderador: Miguel Messmacher, ITAM, Director general de la División Académica de Ciencias Sociales',
 '',
 '10:00 h – Mesa de escenarios políticos de las elecciones en México',
 'Luis Carlos Ugalde, Integralia Consultores, Director General',
 'Federico Reyes-Heroles, Transparencia Me

In [7]:
import pandas as pd

people = [element for element in evento_clean if ',' in element]
people_df = pd.DataFrame(people, columns=['name'])
people_df['speaker'] = people_df['name'].apply(lambda x: x.split(',')[0].strip())
people_df['moderator'] = people_df['speaker'].apply(lambda x: 1 if 'Moderador:' in x or 'Moderadora:' in x else 0)
people_df['speaker'] = people_df['speaker'].str.replace('Moderador\\:|Moderadora\\:', '', regex=True).str.strip()
people_df['occupation'] = people_df['name'].str.extract(r',(.*)')
people_df['year'] = 2024
people_df = people_df.drop(['name'], axis = 1)
people_df

Unnamed: 0,speaker,moderator,occupation,year
0,Humberto López,0,"Asociación de Ex Alumnos del ITAM, Presidente",2024
1,Arturo Fernández,0,"ITAM, Rector",2024
2,Ilan Goldfajn,0,"Banco Interamericano de Desarrollo, Presidente",2024
3,Carlos Capistrán,0,"Bank of America, Economista en Jefe para Méxi...",2024
4,Ernesto Revilla,0,"Citigroup, Economista en Jefe para América La...",2024
5,Alejandrina Salcedo,0,"Banco de México, Directora General de Investi...",2024
6,Miguel Messmacher,1,"ITAM, Director general de la División Académi...",2024
7,Luis Carlos Ugalde,0,"Integralia Consultores, Director General",2024
8,Federico Reyes-Heroles,0,"Transparencia Mexicana, Presidente del Consej...",2024
9,Agustín Basave,0,"Universidad de Monterrey, Director del Instit...",2024


In [8]:
def scrape_seminario(year):
    url = f'https://eventos.itam.mx/es/evento/seminario-de-perspectivas-economicas-{year}'
    html = requests.get(url).content
    sel = Selector(text = html)
    evento = sel.xpath('//div[@id="cuerpo-evento"]//p').extract()
    
    evento_clean = []
    for ev in evento:
        event = clean_string(ev)
        evento_clean.append(event)
    
    people = [element for element in evento_clean if ',' in element]
    people_df = pd.DataFrame(people, columns=['name'])
    people_df = people_df[~people_df['name'].str.contains('Cuota de recuperación\\:|reembolso|Perspectives|invitamos|horario|Bienvenida|acompañarnos')]
    people_df['speaker'] = people_df['name'].apply(lambda x: x.split(',')[0].strip())
    people_df['moderator'] = people_df['speaker'].apply(lambda x: 1 if 'Moderador:' in x or 'Moderadora:' in x
                                                        or 'Modera:' in x else 0)
    people_df['speaker'] = people_df['speaker'].str.replace('Moderador\\:|Moderadora\\:|Modera\\:',
                                                            '', regex=True).str.strip()
    people_df['occupation'] = people_df['name'].str.extract(r',(.*)')
    people_df['year'] = year
    people_df = people_df.drop(['name'], axis = 1)
    
    return(people_df)


In [9]:
df_final = pd.DataFrame()
for y in range(2020, 2025):
    df = scrape_seminario(y)
    df_final = pd.concat([df_final, df]).reset_index(drop=True)
    
df_final

Unnamed: 0,speaker,moderator,occupation,year
0,Arturo Fernández P.,0,"ITAM , Rector",2020
1,Alejandro Díaz de León Carrillo,0,"Banco de México, Gobernador",2020
2,Alejandro Werner Wainfeld,0,"FMI, Dir. Depto. del Hemisferio Occidental",2020
3,Iván Moguel,0,"Chévez Ruiz Zamarripa, Socio",2020
4,Lorenza Martinez,1,"Accenture México, Managing Director",2020
...,...,...,...,...
86,Rafael Fernández de Castro,0,Centro de Estudios México-Estados Unidos UCSD...,2024
87,Arturo Sarukhán,0,"Sarukhán y asociados, Presidente",2024
88,Jorge Suárez-Vélez,0,"Allen &amp; Company, Director",2024
89,Ana María Salazar,1,"Grupo Salazar Slack SC, Directora",2024


### Bots:

## Recommendations to run this notebook

# Complete Setup Instructions for Python Project

## 1. Create a Virtual Environment
To isolate your project dependencies:
- Open your terminal.
- Navigate to your project directory:
  ```bash
  cd /Users/joseperez/Documents/GitHub/python-statlearning-lab/week_2/code

  python3 -m venv venv

  source venv/bin/activate #on mac
  venv\Scripts\activate #on windows

  pip install --upgrade pip setuptools wheel

  pip install -r requirements.txt

  brew install hdf5
  pip install h5py

  pip install spacy
  python -m spacy download en_core_web_sm
  python -m spacy download es_core_news_lg
  python -m spacy download es_core_news_sm
  pip3 install murmurhash

Deactivate the Virtual Environment
    ```bash
    deactivate

In [13]:
import os

In [14]:
print("Current working directory:", os.getcwd())

Current working directory: /Users/joseperez/Documents/GitHub/python-statlearning-lab/week_3/code


In [None]:
from selenium import webdriver
import selenium
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome()

#s=Service(ChromeDriverManager().install()) #MAC user might need this
#driver = webdriver.Chrome(service=s)

In [26]:
driver.get('http://twitter.com/login')

In [17]:
import time


In [18]:
# Open the Twitter profile
driver = webdriver.Chrome()
driver.get('https://twitter.com/HLarreguy')
time.sleep(1)

# Find elements with the specified XPath
profile = driver.find_elements(By.XPATH, '//span[@data-testid="UserJoinDate"]')

# Initialize Date variable
Date = None

# Extract the text if elements are found
if profile:
    for p in profile:
        Date = p.text  # This will take the text of the last matched element
else:
    print("No elements found with the specified XPath.")

# Print the result or a message if Date remains undefined
if Date:
    print("User join date:", Date)
else:
    print("Join date not found.")

# Close the driver
driver.quit()

No elements found with the specified XPath.
Join date not found.


## Scraping the Trending Topics of Today

In [20]:
driver = webdriver.Chrome()
driver.get(f'https://twitter.com/explore/tabs/trending')

time.sleep(5)

trend = driver.find_elements(By.XPATH,
                             '//div[@data-testid="cellInnerDiv" and not(@style="transform: translateY(0px); position: absolute; width: 100%;")]/div/div/div/div')

Name = []
posts = []
for t in trend:
    Name1 = t.find_element(By.XPATH, './/div[@style="text-overflow: unset; color: rgb(15, 20, 25);"]').text
    Name.append(Name1)
    
    posts1 = t.find_element(By.XPATH, './div[3]').text
    posts.append(posts1)

df = pd.DataFrame(zip(Name, posts), columns=['trend','n_posts'])
df.to_excel('../data/trends_18_01_2024.xlsx')

ModuleNotFoundError: No module named 'openpyxl'

In [17]:
df

Unnamed: 0,trend,n_posts
0,#ConClaudiaGanamos,"4,291 posts"
1,Paramore,19.8K posts
2,#FelizJueves,10.9K posts
3,HOY SE ESTRENA BOBO,11.9K posts
4,Unionistas,33.2K posts
5,Milei,768K posts
6,Andrés Guardado,"5,907 posts"
7,Davos,816K posts
8,Balde,14.1K posts
9,MAÑANA BOBO EN SPOTIFY,42.7K posts


In [28]:
# Initialize the WebDriver
driver = webdriver.Chrome()

# Navigate to a public Wikipedia page with a long list
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
driver.get(url)

# Define scrolling parameters
scroll_distance = 2500
scroll_pos = [1]  # List to track scroll positions
i = 0  # Counter for scroll actions
t = 0  # Index tracker for scroll positions

# Infinite scroll loop
while True:
    try:
        # Allow time for the page to load
        time.sleep(2)

        # Get the current scroll position
        current_scroll_pos = driver.execute_script("return window.scrollY;")
        scroll_pos.append(current_scroll_pos)

        # Scroll down by the specified distance
        driver.execute_script(f'window.scrollBy(0, {scroll_distance});')
        time.sleep(2)  # Allow additional time for dynamic content to load

        t += 1
        print(f'Scroll {i}')
        print(f'Previous: {scroll_pos[t-1]}, Current: {scroll_pos[t]}')

        # Check if the end of the page is reached
        if scroll_pos[t-1] == scroll_pos[t]:
            print("You've reached the end of the page.")
            break

        i += 1
    except Exception as e:
        print(f"An error occurred during scrolling: {e}")
        break

# Close the WebDriver
driver.quit()

Scroll 0
Previous: 1, Current: 0
Scroll 1
Previous: 0, Current: 2500
Scroll 2
Previous: 2500, Current: 5000
Scroll 3
Previous: 5000, Current: 7295
Scroll 4
Previous: 7295, Current: 7295
You've reached the end of the page.


In [43]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np

# Initialize the WebDriver
driver = webdriver.Chrome()

# Navigate to a public Wikipedia page (List of largest companies by revenue)
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
driver.get(url)

# Define scrolling parameters
scroll_distance = 2500  # Distance to scroll each time
scroll_pos = [1]  # Track scroll positions to detect when we reach the end
CompanyNames = []  # List to store company names
Revenues = []  # List to store revenues
Profits = []  # List to store profits

# Locate the initial content (rows of the Wikipedia table)
articles = driver.find_elements(By.XPATH, "//table[contains(@class, 'wikitable')]//tr")

# Initialize variables for scrolling logic
t = 0  # Counter for scroll attempts

# Infinite scroll loop to simulate data scraping
while True:
    time.sleep(2)  # Allow the page to load

    # Get the current scroll position
    current_scroll_pos = driver.execute_script("return window.scrollY;")
    scroll_pos.append(current_scroll_pos)

    # Process visible elements (in this example, table rows)
    for article in articles:
        try:
            # Scrape the company name
            CompanyName = article.find_element(By.XPATH, ".//td[1]").text
            CompanyNames.append(CompanyName)
        except:
            CompanyNames.append(np.nan)  # Handle missing data

        try:
            # Scrape the revenue (update XPath as per table structure)
            Revenue = article.find_element(By.XPATH, ".//td[2]").text
            Revenues.append(Revenue)
        except:
            Revenues.append(np.nan)

        try:
            # Scrape the profit (update XPath as per table structure)
            Profit = article.find_element(By.XPATH, ".//td[3]").text
            Profits.append(Profit)
        except:
            Profits.append(np.nan)

    # Scroll down by the specified distance
    driver.execute_script(f'window.scrollBy(0, {scroll_distance});')
    time.sleep(5)  # Allow additional time for dynamic content to load

    # Re-fetch visible elements after scrolling
    articles = driver.find_elements(By.XPATH, "//table[contains(@class, 'wikitable')]//tr")

    t += 1  # Increment scroll counter
    print(scroll_pos[t-1], scroll_pos[t])  # Print scroll progress

    # Check if the end of the page is reached
    if scroll_pos[t-1] == scroll_pos[t]:
        print("you've reached the end")
        break

# Create a DataFrame with the scraped data
df = pd.DataFrame(zip(CompanyNames, Revenues, Profits),
                  columns=['Company Name', 'Revenue', 'Profit'])

# Save the DataFrame to a CSV file for analysis
df.to_csv('../data/wikipedia_scrape_results_cleaned.csv', index=False)

# Close the WebDriver
driver.quit()

# Print confirmation message
print("Scraping completed and saved to 'wikipedia_scrape_results_cleaned.csv'.")

1 0
0 2500
2500 5000
5000 7086.5
7086.5 7086.5
you've reached the end
Scraping completed and saved to 'wikipedia_scrape_results_cleaned.csv'.


In [34]:
print(df['Company Name'][50])
print(df['Revenue'][50])
print(df['Profit'][50])


Citigroup
Financials
$156,820


In [35]:
# Rename the 'Revenue' column to 'Industry'
df.rename(columns={'Revenue': 'Industry'}, inplace=True)

In [36]:
# Convert 'Profit' column to numeric (remove any non-numeric characters like '$' and ',')
df['Profit'] = df['Profit'].replace('[\\$,]', '', regex=True).astype(float)

In [37]:
print(df['Profit'][50])

156820.0


In [38]:
# Group by 'Industry' and calculate statistics
industry_summary = df.groupby('Industry').agg({
    'Company Name': 'count',       # Number of companies per industry
    'Profit': ['sum', 'mean']      # Total and average profit per industry
}).reset_index()

In [40]:

# Rename columns for clarity
industry_summary.columns = ['Industry', 'Company Count', 'Total Profit', 'Average Profit']

# Sort by 'Total Profit' in descending order
industry_summary = industry_summary.sort_values(by='Total Profit', ascending=False)

# Display the summarized DataFrame
print(industry_summary)

                          Industry  Company Count  Total Profit  \
15                     Oil and gas             40    13476855.0   
13                      Healthcare             35     9360695.0   
12                      Financials             40     8598375.0   
5                       Automotive             35     7735540.0   
16                          Retail             15     5354955.0   
14          Information technology             15     4679005.0   
6                      Commodities             15     4310545.0   
8                     Construction             15     3299205.0   
17  Retail\ninformation technology              5     2873925.0   
9                      Electricity              5     2729740.0   
10                     Electronics             10     1980665.0   
7                     Conglomerate             10     1712500.0   
18                           Steel              5      786080.0   
11                          Energy              5      735500.