En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

Este archivo fue creado en Google Colab; si desea ejecutarlo en otro entorno, puede ser necesario instalar las siguientes dependencias:

In [None]:
%pip install gdown pandas google.cloud.storage google-cloud-bigquery

In [21]:
import gdown
import pandas as pd
import zipfile

from pandas.io import gbq
from pandas import json_normalize
from google.cloud import storage, bigquery

Obtengo el archivo desde Google Drive y lo descomprimo.

In [16]:
import os

# Name of the JSON file expected inside the archive; later cells reuse it.
file_path = "farmers-protest-tweets-2021-2-4.json"

# Local working directory for both the downloaded archive and its contents.
extracted_dir = 'working/'
# Make sure the target directory exists before anything is written into it.
os.makedirs(extracted_dir, exist_ok=True)
zip_path = os.path.join(extracted_dir, 'tweets.zip')

# Google Drive direct-download link for the shared archive.
url = 'https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis'
gdown.download(url, zip_path, quiet=False)

# Unpack the whole archive into the working directory.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)


Downloading...
From (uriginal): https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis
From (redirected): https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis&confirm=t&uuid=f935d566-8965-4f0c-93d2-36c28401e056
To: c:\Users\Caruso\source\Latam-DE-challenge\src\working\tweets.zip
100%|██████████| 60.4M/60.4M [00:04<00:00, 14.8MB/s]


Lo subo a Google Cloud Storage para poder trabajar con él más fácilmente.

In [19]:
import os

from google.cloud import storage

# Connection settings for Google Cloud Storage.
project_id = "dechallenge"
bucket_name = "dechallenge-tweets"
# Service-account JSON key file. The path is assembled with os.path.join
# instead of a backslash literal: "\c" and "\d" are invalid escape sequences
# in a normal string, and a hard-coded backslash only resolves on Windows.
keyfile_path = os.path.join("..", "creds", "dechallenge-51f78ddf0bb6.json")

storage_client = storage.Client.from_service_account_json(keyfile_path, project=project_id)
bucket = storage_client.bucket(bucket_name)

# The object keeps the same name in the bucket as the local file.
destination_blob_name = file_path

# Create a blob (an object handle) in the bucket.
blob = bucket.blob(destination_blob_name)

# Upload the extracted JSON file to the GCS bucket.
blob.upload_from_filename(os.path.join(extracted_dir, file_path))

Utilizo Google BigQuery para hacer la consulta.

In [57]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# BigQuery client authenticated with the service-account key file.
client = bigquery.Client.from_service_account_json(keyfile_path, project=project_id)

# Dataset that will hold the tweets table.
dataset_id = "tweets"
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
dataset = bigquery.Dataset(dataset_ref)

# get_dataset() raises NotFound when the dataset is missing (it never returns
# a falsy value), so existence has to be detected through the exception.
try:
    client.get_dataset(dataset_ref, retry=bigquery.DEFAULT_RETRY)
except NotFound:
    client.create_dataset(dataset)
    print(f"Dataset {project_id}.{dataset_id} creado con éxito.")
else:
    print(f"El dataset {project_id}.{dataset_id} ya existe.")

# Load the newline-delimited JSON from GCS, letting BigQuery infer the schema
# and replacing any previous contents of the table.
# NOTE(review): with max_bad_records=10 the job still succeeds when a few
# records cannot be parsed, so the loaded row count can be lower than the
# number of lines in the file - check load_job.errors if totals look off.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    write_disposition="WRITE_TRUNCATE",
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    max_bad_records=10,
)

# Source object and destination table, derived from the settings above so the
# bucket, file and dataset names are defined in a single place.
uri = f"gs://{bucket_name}/{file_path}"
table_id = f"{project_id}.{dataset_id}.farmers_protest_tweets"

load_job = client.load_table_from_uri(
    uri,
    table_id,
    location="US",  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print(f"Loaded {destination_table.num_rows} rows.")

El dataset dechallenge.tweets ya existe.
Loaded 117405 rows.


In [84]:
# Top 10 dates by total number of tweets, each with its most active user.
#
# TweetCounts keeps one row per tweet and annotates it with the total tweets
# of that day and the tweets of that user on that day. TweetCountsRank then
# numbers the rows of each day by the per-user count, so rank 1 is a row of
# the day's most active user.
#
# Username is a secondary sort key in ROW_NUMBER, and Date in the final
# ORDER BY: without them, users (or dates) tied on the count could come back
# in a different order on every run.
query = """
WITH TweetCounts AS (
SELECT user.username, CAST(date as Date) as Date,
    Count(1) OVER (PARTITION BY CAST(date as Date)) as DailyTweets,
    Count(1) OVER (PARTITION BY CAST(date as Date), user.username) as DailyTweetsByUser
FROM `dechallenge.tweets.farmers_protest_tweets`
), TweetCountsRank AS (
SELECT Date, Username, DailyTweets,
    ROW_NUMBER() OVER (PARTITION BY Date ORDER BY DailyTweetsByUser DESC, Username) as UserDailyRank
FROM TweetCounts
)
SELECT * FROM TweetCountsRank
where UserDailyRank = 1
order by DailyTweets desc, Date
limit 10
"""

# Run the query and print each result row as it is returned.
query_job = client.query(query)
for row in query_job.result():
    print(row)

Row((datetime.date(2021, 2, 12), 'RanbirS00614606', 12347, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 13), 'MaanDee08215437', 11296, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 17), 'RaaJVinderkaur', 11086, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 16), 'jot__b', 10443, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 14), 'rebelpacifist', 10249, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 18), 'neetuanjle_nitu', 9625, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 15), 'jot__b', 9197, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRank': 3})
Row((datetime.date(2021, 2, 20), 'MangalJ23056160', 8502, 1), {'Date': 0, 'Username': 1, 'DailyTweets': 2, 'UserDailyRan

Ejemplo local utilizando pandas.

In [13]:
# Read the extracted file as newline-delimited JSON (one record per line).
df_tweets = pd.read_json(extracted_dir+file_path, lines=True)

# The 'user' column holds nested objects; flatten it to reach the username.
users_flat = json_normalize(df_tweets['user'])
df_tweets['userName'] = users_flat.username

# Reduce each timestamp to its calendar day.
tweet_days = pd.to_datetime(df_tweets['date']).dt.date
df_tweets['date'] = tweet_days

# One row per (user, day) with the number of tweets that user posted that day.
df_tweets = (
    df_tweets
    .groupby(['userName','date'])
    .size()
    .reset_index(name='countByUserByDay')
)

# Per-day statistics, both computed from the per-user counts before being
# attached as new columns.
daily_counts = df_tweets.groupby('date')['countByUserByDay']
day_totals = daily_counts.transform('sum')
# Rank users within each day by tweet count; method='first' breaks ties by
# row order, so exactly one user per day gets rank 1.
user_ranks = daily_counts.rank(ascending=False, method='first')
df_tweets['countByDay'] = day_totals
df_tweets['dailyUserRank'] = user_ranks

# Keep the top user of every day, then show the 10 busiest days.
top_user_per_day = df_tweets[df_tweets['dailyUserRank'] == 1]
top_user_per_day.sort_values(by='countByDay', ascending=False).head(10)[['date','userName','countByDay']]

Unnamed: 0,userName,date,countByDay
23626,RanbirS00614606,2021-02-12,12347
16761,MaanDee08215437,2021-02-13,11296
22650,RaaJVinderkaur,2021-02-17,11087
40318,jot__b,2021-02-16,10443
46423,rebelpacifist,2021-02-14,10249
44085,neetuanjle_nitu,2021-02-18,9625
40317,jot__b,2021-02-15,9197
17343,MangalJ23056160,2021-02-20,8502
28820,Surrypuria,2021-02-23,8417
21929,Preetm91,2021-02-19,8204


Voy a probar a leer los datos usando BigQuery para ver si mejora el rendimiento.