# STEAMDB - WEB SCRAPING

In [21]:
import pandas as pd
from bs4 import BeautifulSoup

# Location of the SteamDB sale page that was saved to my machine
file_path = r'C:\Users\Notbook I3\Downloads\Steam Summer Sale 2023 · BR · SteamDB.html'


def _stripped_text(tag):
    """Return the whitespace-trimmed text of ``tag``, or '' when ``tag`` is missing."""
    if not tag:
        return ''
    return tag.text.strip()


# Load the saved HTML page and parse it
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Collected records: one dict per table row that has every required field
data = []

for row in soup.find_all('tr'):
    # Locate the name, discount, rating and "ends in" elements of this row
    name = row.find('a', class_='b')
    discount = row.find('td', class_='price-discount-major')
    rating = row.find('td', {'data-sort': lambda value: value is not None and '.' in value})
    ends_in = row.find('td', class_='timeago')

    # The price is the first cell whose trimmed text begins with 'R$'
    cell_texts = (td.text.strip() for td in row.find_all('td'))
    price = next((text for text in cell_texts if text.startswith('R$')), None)

    # "Started" is the next 'timeago' cell after "ends in"; "release" is the
    # next cell after "started" that carries a data-sort attribute
    started = ends_in.find_next_sibling('td', class_='timeago') if ends_in else None
    release = started.find_next_sibling('td', {'data-sort': True}) if started else None

    # Skip rows that lack any required field (started/release are optional)
    if not (name and discount and price and rating and ends_in):
        continue

    data.append({
        'Name': _stripped_text(name),
        '%': _stripped_text(discount),
        'Price': price,
        'Rating': _stripped_text(rating),
        'Ends in': _stripped_text(ends_in),
        'Started': _stripped_text(started),
        'Release': _stripped_text(release)
    })

# Build the DataFrame from the collected records and display it
df = pd.DataFrame(data)
df

Unnamed: 0,Name,%,Price,Rating,Ends in,Started,Release
0,Inscryption,-64%,"R$ 28,97",95.54%,in 6 days,8 days ago,Oct 2021
1,Metal: Hellsinger,-50%,"R$ 49,99",94.19%,in 6 days,8 days ago,Sep 2022
2,NEEDY STREAMER OVERLOAD,-50%,"R$ 15,49",93.15%,in 6 days,18 days ago,Jan 2022
3,OCTOPATH TRAVELER II,-53%,"R$ 202,91",93.04%,in 6 days,8 days ago,Feb 2023
4,ENDER LILIES: Quietus of the Knights,-50%,"R$ 24,74",92.53%,in 6 days,18 days ago,Jun 2021
...,...,...,...,...,...,...,...
178,Hentai Asmodeus,-75%,"R$ 1,12",80.28%,in 6 days,8 days ago,Oct 2019
179,Through the Darkest of Times,-80%,"R$ 5,99",80.25%,in 6 days,8 days ago,Jan 2020
180,RIDE 3,-90%,"R$ 7,54",80.20%,in 6 days,8 days ago,Nov 2018
181,Grow: Song of the Evertree,-50%,"R$ 51,95",80.06%,in 6 days,15 days ago,Nov 2021


In [22]:
# Write the scraped DataFrame to a CSV file, leaving out the index column
df.to_csv(path_or_buf='dados_steamDB_Sales.csv', index=False)

# Subir CSV para o BigQuery

In [None]:


from google.cloud import bigquery
import pandas as pd
from bs4 import BeautifulSoup

# BigQuery project, dataset and table settings (placeholder values)
project_id = 'SEU_PROJECT_ID'
dataset_id = 'SEU_DATASET_ID'
table_id = 'NOME_DA_TABELA'

# NOTE(review): the export cell writes 'dados_steamDB_Sales.csv', while this
# cell reads 'dados_steam.csv' from another folder — confirm this is the
# intended file.
file_path = r'C:\Users\Notbook I3\Desktop\Teste-beAnalytic\dados_steam.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Create the BigQuery client for the target project
client = bigquery.Client(project=project_id)

# Build the destination table reference. Client.dataset() is deprecated, so
# the reference is constructed from DatasetReference directly; the result is
# still a TableReference.
table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)

# Load-job settings: CSV source format with schema auto-detection
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.autodetect = True

# Upload the DataFrame and block until the load job finishes
# (job.result() raises if the job failed)
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()

print('Arquivo CSV carregado para o BigQuery com sucesso.')
