# Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import random
import matplotlib.pyplot as plt

import pandas as pd

## scraping titulares

In [14]:
news_url = "https://www.lavozdegalicia.es/coruna/"
response = requests.get(news_url)
soup = BeautifulSoup(response.text, 'html.parser')

news_keys = ['title','news_href']
    
list_news = [
    {
        news_keys[0]: news.find(class_="a-min-content").text.replace('\n', " ").split('\t')[0].strip(),
        news_keys[1]: news_url+news.find("a").get("href")
    }
    for news in soup.find_all(class_="article-min")]
    
list_news


[{'title': 'La basura llena las calles de A Coruña y la alcaldesa exige a Prezero que la recoja: «Non me vai tremer a man»  Inés Rey reconoce que hay contenedores de la ciudad donde los desechos no fueron retirados, pero no responde a la amenaza del STL: «Non vou entrar en provocacións»',
  'news_href': 'https://www.lavozdegalicia.es/coruna//noticia/coruna/coruna/2024/02/08/basura-llena-calles-coruna-alcaldesa-exige-prezero-recoja-span-langglnon-me-vai-tremer-manspan/00031707385583568720206.htm'},
 {'title': 'El líder del STL: «A la alcaldesa le tiene que llegar la mierda al tercer piso»  Mes y medio después del acuerdo de Navidad, Miguel Sánchez abre otro conflicto en el servicio de recogida de basura en un nuevo intento de asegurarse los contratos temporales',
  'news_href': 'https://www.lavozdegalicia.es/coruna//noticia/coruna/2024/02/08/lider-stl-alcaldesa-llegar-mierda-tercer-piso/0003_202402H8C4996.htm'},
 {'title': 'Las fuertes rachas de viento impiden el aterrizaje en Alvedro y

In [None]:
url_movies = "https://www.sensacine.com/cines/cine/E0770/"
response = requests.get(url_movies)
soup = BeautifulSoup(response.text, 'html.parser')

movie_keys = ['title','synopsis', 'image', 'link_ref']

list_movies = [
    {
        movie_keys[0]: movie.find(class_="meta-title-link").text,
        movie_keys[1]: movie.find(class_="synopsis").text.replace("\n", ""),
        movie_keys[2]: movie.find("img").get("src") if movie.find("img").get("src").startswith("https") else movie.find("img").get("data-src"),
        movie_keys[3]: "https://www.sensacine.com" + movie.find(class_="meta-title-link").get("href")
    }
    for movie in soup.find_all(class_="movie-card-theater")]

list_movies

## LEB ORO 

In [None]:
url_feb = "https://baloncestoenvivo.feb.es/resultados/ligaleboro/1/2023"
response = requests.get(url_feb)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find(id="_ctl0_MainContentPlaceHolderMaster_jornadaDataGrid")

match_list = []

leb_match_keys = ['teams', 'result', 'date', 'time']

for fila in table.find_all('tr')[1:]:
    temp_dict = {}
    
    for key, linha in zip(leb_match_keys, fila.find_all(['td', 'th'])):
        temp_dict[key] = linha.get_text(strip=True)
    
    match_list.append(temp_dict)

match_list

In [None]:
import matplotlib.pyplot as plt

ladder = soup.find(id="_ctl0_MainContentPlaceHolderMaster_clasificacionDataGrid")

ladder_list = []

leb_ladder_keys = ['POSITION', 'TEAMS', 'MATCHES_PLAYED', 'MATCHES_W', 'MATCHES_L', 'POINTS']

for row in ladder.find_all('tr')[1:]:
    temp_dict = {}

    cells = row.find_all(['td', 'th'])
    for key, cell in zip(leb_ladder_keys, cells):
        if key == 'POINTS':
            temp_dict[key] = cells[7].get_text(strip=True) 
        else:
            temp_dict[key] = cell.get_text(strip=True)

    ladder_list.append(temp_dict)

df = pd.DataFrame(ladder_list)

fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')

col_widths = [0.1, 0.4, 0.2, 0.2, 0.2, 0.2]

table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center', colWidths=col_widths)

plt.tight_layout()
plt.show()


# Pruebas

In [None]:
url_feb = "https://baloncestoenvivo.feb.es/resultados/ligaleboro/1/2023"
response = requests.get(url_feb)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find(id="_ctl0_MainContentPlaceHolderMaster_jornadaDataGrid")

match_list = []
leb_match_keys = ['teams', 'result', 'date', 'time']

# Iterar filas ignorando la primera
for fila in table.find_all('tr')[1:]:
    temp_dict = {}
    
    for key, row in zip(leb_match_keys, fila.find_all(['td', 'th'])):
        temp_dict[key] = row.get_text(strip=True).lower().title()
    
    match_list.append(temp_dict)


df = pd.DataFrame(match_list)
fig, ax = plt.subplots(figsize=(1, 1))

ax.axis('off')
colors = ['lightgray', 'white']
cell_colors = [[colors[i % 2] for _ in range(len(df.columns))] for i in range(len(df))]
col_widths = [0.5, 0.1, 0.1, 0.1]

ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center',cellColours=cell_colors, colWidths=col_widths)


fig.set_size_inches(12, 2)
fig.show()

    


In [None]:
url_feb = "https://baloncestoenvivo.feb.es/resultados/ligaleboro/1/2023"
response = requests.get(url_feb)
soup = BeautifulSoup(response.text, 'html.parser')
ladder = soup.find(id="_ctl0_MainContentPlaceHolderMaster_clasificacionDataGrid")

ladder_list = []

leb_ladder_keys = ['POSITION', 'TEAMS', 'MATCHES_PLAYED', 'MATCHES_W', 'MATCHES_L', 'POINTS']

for row in ladder.find_all('tr')[1:]:
    temp_dict = {}

    cells = row.find_all(['td', 'th'])
    for key, cell in zip(leb_ladder_keys, cells):
        if key == 'POINTS':
            temp_dict[key] = cells[7].get_text(strip=True) 
        else:
            temp_dict[key] = cell.get_text(strip=True)

    ladder_list.append(temp_dict)

df = pd.DataFrame(ladder_list)

fig, ax = plt.subplots(figsize=(11, 3))
ax.axis('off')

col_widths = [0.1, 0.4, 0.2, 0.2, 0.2, 0.2]
colors = ['lightgray', 'white']
cell_colors = [[colors[i % 2] for _ in range(len(df.columns))] for i in range(len(df))]
ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center', colWidths=col_widths, cellColours=cell_colors)

fig.set_size_inches(12, 6)

fig.show()