In [1]:
# Location of the tweets dump handed to the q1_* functions in this notebook,
# which load it with pd.read_json(..., lines=True) (one JSON object per line).
# The path is relative, so it resolves against the kernel's working directory.
file_path = "../data/farmers-protest-tweets-2021-2-4.json"

In [2]:
import pandas as pd
import json
import time
import datetime
import memory_profiler
#import concurrent

from datetime import datetime
from memory_profiler import memory_usage
from collections import defaultdict, Counter
#from concurrent.futures import ProcessPoolExecutor

### First version of q1_time

In [3]:
def q1_time_pandas(file_path: str) -> list:
    """Return the 10 busiest dates, each paired with that date's most active user.

    Args:
        file_path: Path to a newline-delimited JSON file. Every record must
            carry a 'date' field and a nested 'user' object that has a
            'username' key.

    Returns:
        A list of up to 10 ``(date, username)`` tuples, ordered from the date
        with the most tweets to the date with the fewest. ``date`` is a
        ``datetime.date``. An empty list is returned when the file holds no
        records.

    Notes:
        When several users tie for the most tweets on a date, the first value
        returned by ``Series.mode()`` is used.
    """
    # Read the JSON file into a DataFrame
    tweets_df = pd.read_json(file_path, lines=True)

    # Nothing to rank when the file holds no tweets
    if tweets_df.empty:
        return []

    # Convert the 'date' column to datetime and keep only the calendar date
    tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.date

    # Group by 'date' and count tweets, also get the most active user per date.
    # The mode is taken over usernames, not over the whole nested 'user' dicts:
    # two records of the same account that differ in any other field (for
    # example a follower count) must still count as the same user.
    top_dates_df = tweets_df.groupby('date').agg(
        total_tweets=('date', 'size'),
        most_active_user=(
            'user',
            lambda users: users.apply(lambda user: user['username']).mode()[0],
        ),
    ).sort_values('total_tweets', ascending=False).head(10)

    # Convert the result to the desired format [(date, most_active_user), ...]
    return list(zip(top_dates_df.index, top_dates_df['most_active_user']))

In [9]:
# Run q1_time_pandas once and print the (date, username) pairs it returns.
top_dates = q1_time_pandas(file_path)
print(top_dates)

# Run q1_time_pandas a second time under memory_profiler: memory_usage() is
# handed the (function, args) tuple and the max() of the values it returns is
# kept as the peak (printed below as MiB). The wall-clock interval is taken
# around that profiled call, so it also covers whatever work memory_usage()
# itself does.
# NOTE(review): the variable name and the printed labels say "q1_memory", but
# the function measured here is q1_time_pandas — confirm and rename if unintended.
start_time = time.time()
peak_memory_memory = max(memory_usage((q1_time_pandas, (file_path,))))
end_time = time.time()

print(f"q1_memory - Execution Time: {end_time - start_time} seconds")
print(f"q1_memory - Peak Memory Usage: {peak_memory_memory} MiB")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_memory - Execution Time: 34.3178768157959 seconds
q1_memory - Peak Memory Usage: 3126.28125 MiB


### Second version of q1_time, optimizing the DataFrame manipulation

In [11]:
def q1_time_pandas_revised(file_path: str):
    """Find the 10 dates with the most tweets and each date's most active user.

    The file at ``file_path`` is read as newline-delimited JSON. The username
    is pulled out of the nested 'user' object once, up front, so the per-date
    aggregation works on plain strings.

    Returns a list of ``(date, username)`` tuples ordered by descending tweet
    count, at most 10 entries long.
    """
    def _username_of(user_record):
        # Each 'user' cell is a nested dict; only its 'username' is needed
        return user_record['username']

    def _most_common(usernames):
        # First entry of the mode = the name that appears most often
        return usernames.mode()[0]

    # Whole dataset in memory
    tweets = pd.read_json(file_path, lines=True)

    # Reduce timestamps to calendar dates and flatten the username column
    tweets['date'] = pd.to_datetime(tweets['date']).dt.date
    tweets['username'] = tweets['user'].apply(_username_of)

    # One row per date: how many tweets, and who tweeted most
    per_day = tweets.groupby('date').agg(
        total_tweets=('date', 'size'),
        most_active_user=('username', _most_common),
    )

    # Ten busiest dates, busiest first
    busiest = per_day.sort_values('total_tweets', ascending=False).head(10)

    # Pair each date (the index) with its most active user
    return list(zip(busiest.index, busiest['most_active_user']))

In [13]:
# Run q1_time_pandas_revised once and print the (date, username) pairs it
# returns. This rebinds the notebook-level name `top_dates`.
top_dates = q1_time_pandas_revised(file_path)
print(top_dates)

# Run q1_time_pandas_revised a second time under memory_profiler:
# memory_usage() is handed the (function, args) tuple and the max() of the
# values it returns is kept as the peak (printed below as MiB). The wall-clock
# interval is taken around that profiled call, so it also covers whatever work
# memory_usage() itself does.
start_time = time.time()
peak_memory_time = max(memory_usage((q1_time_pandas_revised, (file_path,))))
end_time = time.time()

print(f"q1_time_pandas_revised - Execution Time: {end_time - start_time} seconds")
print(f"q1_time_pandas_revised - Peak Memory Usage: {peak_memory_time} MiB")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_time_pandas_revised - Execution Time: 7.779937028884888 seconds
q1_time_pandas_revised - Peak Memory Usage: 3133.21484375 MiB
