En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

Este archivo fue creado en Google Colab; si desea ejecutarlo en otro entorno, puede ser necesario instalar las siguientes dependencias:

In [None]:
%pip install gdown pandas google.cloud.storage google-cloud-bigquery

In [104]:
import gdown
import pandas as pd
import zipfile
from q1_time import q1_time

from pandas.io import gbq
from google.cloud import storage, bigquery

Obtengo el archivo desde Google Drive y lo descomprimo.

In [16]:
import os

# Name of the JSON file expected inside the downloaded archive.
file_path = "farmers-protest-tweets-2021-2-4.json"
url = 'https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis'

# Working directory that receives both the archive and its extracted contents.
extracted_dir = 'working/'
zip_path = extracted_dir + 'tweets.zip'

# The download fails if the target directory is missing, so create it first.
os.makedirs(extracted_dir, exist_ok=True)

downloaded = gdown.download(url, zip_path, quiet=False)
# NOTE(review): some gdown versions return None instead of raising when the
# download cannot be completed; fail loudly rather than at extraction time.
if downloaded is None:
    raise RuntimeError(f"Could not download {url}")

# Extract every member of the archive into the working directory.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)


Downloading...
From (uriginal): https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis
From (redirected): https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis&confirm=t&uuid=f935d566-8965-4f0c-93d2-36c28401e056
To: c:\Users\Caruso\source\Latam-DE-challenge\src\working\tweets.zip
100%|██████████| 60.4M/60.4M [00:04<00:00, 14.8MB/s]


Lo subo a Google Cloud Storage para poder trabajar con él más fácilmente.

In [92]:
import os

from google.cloud import storage

# Connection settings for Google Cloud Storage.
project_id = "dechallenge"
bucket_name = "dechallenge-tweets"
# Service-account JSON key file. Built with os.path.join so the path works on
# any OS and no longer relies on "\c" / "\d" being invalid escape sequences.
keyfile_path = os.path.join("..", "creds", "dechallenge-51f78ddf0bb6.json")

storage_client = storage.Client.from_service_account_json(keyfile_path, project=project_id)
bucket = storage_client.bucket(bucket_name)

# The object keeps the same name as the local JSON file.
destination_blob_name = file_path

# Create a blob (an object) handle in the bucket.
blob = bucket.blob(destination_blob_name)

# Upload the extracted file to the GCS bucket.
blob.upload_from_filename(os.path.join(extracted_dir, file_path))

Utilizo Google BigQuery para hacer la consulta.

In [57]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# BigQuery client authenticated with the service-account key file.
client = bigquery.Client.from_service_account_json(keyfile_path, project=project_id)

# Dataset that will hold the tweets table.
dataset_id = "tweets"
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
dataset = bigquery.Dataset(dataset_ref)

# get_dataset() raises NotFound for a missing dataset (it never returns a
# falsy value), so existence has to be checked with try/except.
try:
    client.get_dataset(dataset_ref, retry=bigquery.DEFAULT_RETRY)
except NotFound:
    client.create_dataset(dataset)
    print(f"Dataset {project_id}.{dataset_id} creado con éxito.")
else:
    print(f"El dataset {project_id}.{dataset_id} ya existe.")

# Load the newline-delimited JSON with an auto-detected schema, replacing any
# previous table contents and tolerating up to 10 bad records.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    write_disposition="WRITE_TRUNCATE",
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    max_bad_records=10,
)

# Source object and destination table, derived from the settings above so
# they cannot drift from the bucket/dataset actually used.
uri = f"gs://{bucket_name}/{file_path}"
table_id = f"{project_id}.{dataset_id}.farmers_protest_tweets"

load_job = client.load_table_from_uri(
    uri,
    table_id,
    location="US",  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print(f"Loaded {destination_table.num_rows} rows.")

El dataset dechallenge.tweets ya existe.
Loaded 117405 rows.


In [90]:
# For every tweet row the first CTE computes the total tweets of that day and
# the tweets of that user on that day; the second CTE ranks rows within each
# day by the per-user count. The final SELECT keeps the rank-1 row per day and
# returns the 10 days with the most tweets.
query = """
WITH TweetCounts AS (
SELECT user.username, CAST(date as Date) as Date, 
    Count(1) OVER (PARTITION BY CAST(date as Date)) as DailyTweets,
    Count(1) OVER (PARTITION BY CAST(date as Date), user.username) as DailyTweetsByUser
FROM `dechallenge.tweets.farmers_protest_tweets`
), TweetCountsRank AS (
SELECT Date, Username, DailyTweets, ROW_NUMBER() OVER (PARTITION BY Date ORDER BY DailyTweetsByUser DESC) as UserDailyRank
FROM TweetCounts
)
SELECT Date, Username FROM TweetCountsRank
where UserDailyRank = 1
order by DailyTweets desc
limit 10
"""

# Submit the query and wait for its rows.
query_job = client.query(query)
fetched_rows = query_job.result()

# Turn each returned row into a plain tuple of its column values.
result_tuple = [tuple(record.values()) for record in fetched_rows]

# Show the resulting list of (date, username) tuples.
print(result_tuple)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]


Ejemplo local utilizando pandas.

In [105]:
# Run q1_time (imported from the q1_time module) on the locally extracted
# JSON file and display whatever it returns.
local_json_path = f"{extracted_dir}{file_path}"
resultado = q1_time(local_json_path)
print(resultado)

AttributeError: 'str' object has no attribute 'values'

Voy a probar a leer los datos usando BigQuery para ver si mejora el rendimiento.