In [27]:
file_path = "../data/farmers-protest-tweets-2021-2-4.json"

### Imports

In [40]:
import pandas as pd
import json
import time
import datetime
import memory_profiler
import re
from typing import List, Tuple
#import concurrent

from datetime import datetime
from memory_profiler import memory_usage
from collections import defaultdict, Counter
#from concurrent.futures import ProcessPoolExecutor

## Q1 Time and Memory solution

### First version of q1_time

In [8]:
def q1_time_pandas(file_path: str):
    # Read the JSON file into a DataFrame
    tweets_df = pd.read_json(file_path, lines=True)

    # Convert the 'date' column to datetime and extract the date
    tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.date

    # Group by 'date' and count tweets, also get the most active user per date
    top_dates_df = tweets_df.groupby('date').agg(
        total_tweets=('date', 'size'),
        most_active_user=('user', lambda x: x.mode()[0]['username'])
    ).sort_values('total_tweets', ascending=False).head(10)

    # Convert the result to the desired format
    top_dates = [(row.name, row['most_active_user']) for _, row in top_dates_df.iterrows()]

    return top_dates

In [9]:
#Executing q1_time_pandas and printing the result
top_dates = q1_time_pandas(file_path)
print(top_dates)

# Timing and memory testing for q1_memory
start_time = time.time()
peak_memory_memory = max(memory_usage((q1_time_pandas, (file_path,))))
end_time = time.time()

print(f"q1_memory - Execution Time: {end_time - start_time} seconds")
print(f"q1_memory - Peak Memory Usage: {peak_memory_memory} MiB")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_memory - Execution Time: 33.76640963554382 seconds
q1_memory - Peak Memory Usage: 2968.328125 MiB


### Second version of q1_time optimizing the dataframe manipulation

In [10]:
def q1_time_pandas_revised(file_path: str):
    # Load the entire dataset into a DataFrame
    df = pd.read_json(file_path, lines=True)

    # Convert 'date' to datetime and extract the date
    df['date'] = pd.to_datetime(df['date']).dt.date

    # Extract username from the nested 'user' dictionary
    df['username'] = df['user'].apply(lambda x: x['username'])

    # Group by 'date' and count tweets, get the most active user per date
    top_dates_df = df.groupby('date').agg(
        total_tweets=('date', 'size'),
        most_active_user=('username', lambda x: x.mode()[0])
    ).sort_values('total_tweets', ascending=False).head(10).drop(columns=['total_tweets'])

    # Convert to the desired format [(date, most_active_user), ...]
    top_dates = list(top_dates_df.itertuples(index=True, name=None))

    return top_dates

In [11]:
#Executing q1_time_pandas and printing the result
top_dates = q1_time_pandas_revised(file_path)
print(top_dates)

# Timing and memory testing for q1_time
start_time = time.time()
peak_memory_time = max(memory_usage((q1_time_pandas_revised, (file_path,))))
end_time = time.time()

print(f"q1_time_pandas_revised - Execution Time: {end_time - start_time} seconds")
print(f"q1_time_pandas_revised - Peak Memory Usage: {peak_memory_time} MiB")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_time_pandas_revised - Execution Time: 7.851444482803345 seconds
q1_time_pandas_revised - Peak Memory Usage: 2991.9140625 MiB


### First version of q1_memory using line by line reading and abandoning pandas

In [22]:
def q1_memory(file_path: str):
    tweet_counts = Counter()
    user_activity = defaultdict(Counter)

    # Read and process the JSON file
    with open(file_path, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            date = datetime.fromisoformat(tweet['date']).date()
            username = tweet['user']['username']

            # Count tweets per date and track user activity
            tweet_counts[date] += 1
            user_activity[date][username] += 1

    # Identify top 10 dates
    top_dates = tweet_counts.most_common(10)

    # Find the most active user for each top date
    top_dates = [(date, user_activity[date].most_common(1)[0][0]) for date, _ in top_dates]

    return top_dates

In [23]:
#Executing q1_time_pandas and printing the result
top_dates = q1_memory(file_path)
print(top_dates)

# Timing and memory testing for q1_memory
start_time = time.time()
peak_memory_memory = max(memory_usage((q1_memory, (file_path,))))
end_time = time.time()

print(f"q1_memory - Execution Time: {end_time - start_time} seconds")
print(f"q1_memory - Peak Memory Usage: {peak_memory_memory} MiB")

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'rebelpacifist'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 15), 'jot__b'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 19), 'Preetm91')]
q1_memory - Execution Time: 3.816939115524292 seconds
q1_memory - Peak Memory Usage: 59.87109375 MiB


## Q2 Time and Memory solution

### Q2 Pandas approach

In [44]:
def q2_pandas(file_path: str) -> List[Tuple[str, int]]:
    # Define the emoji pattern
    emoji_pattern = re.compile('[\U0001F600-\U0001F64F]')

    # Load the dataset into a DataFrame
    df = pd.read_json(file_path, lines=True)

    # Extract emojis from the 'content' field
    df['emojis'] = df['content'].apply(lambda x: emoji_pattern.findall(x))

    # Explode the DataFrame to have one emoji per row
    df_emojis = df.explode('emojis')

    # Count the occurrences of each emoji and get the top 10
    top_emojis = df_emojis['emojis'].value_counts().head(10)

    return top_emojis.reset_index().rename(columns={'index': 'emoji', 'emojis': 'count'})


In [46]:
#Executing q2_pandas and printing the result
top_emojis_df = q2_pandas(file_path)
print(top_emojis_df)
# Timing and memory testing for q1_memory
start_time = time.time()
peak_memory_memory = max(memory_usage((q2_pandas, (file_path,))))
end_time = time.time()

print(f"q2_pandas - Execution Time: {end_time - start_time} seconds")
print(f"q2_pandas - Peak Memory Usage: {peak_memory_memory} MiB")

   count  count
0      🙏   7286
1      😂   3072
2      😡    378
3      😁    280
4      😊    259
5      😢    225
6      🙄    219
7      🙌    215
8      😀    213
9      😜    202
q2_pandas - Execution Time: 8.545934915542603 seconds
q2_pandas - Peak Memory Usage: 3149.39453125 MiB


### Q2 Time, using regex to reduce the execution time

In [50]:
def q2_time(file_path: str) -> List[Tuple[str, int]]:
    emoji_counter = Counter()
    emoji_pattern = re.compile('[\U0001F600-\U0001F64F]')  # Basic emoji pattern

    with open(file_path, 'r') as file:
        data = file.readlines()

    for line in data:
        tweet = json.loads(line)
        content = tweet.get('content', '')
        emojis = emoji_pattern.findall(content)
        emoji_counter.update(emojis)

    top_emojis = emoji_counter.most_common(10)
    return top_emojis

In [54]:
#Executing q2_time and printing the result
top_emojis = q2_time(file_path)
print(top_emojis)
# Timing and memory testing for q1_memory
start_time = time.time()
peak_memory_memory = max(memory_usage((q2_time, (file_path,))))
end_time = time.time()

print(f"q2_time - Execution Time: {end_time - start_time} seconds")
print(f"q2_time - Peak Memory Usage: {peak_memory_memory} MiB")

[('🙏', 7286), ('😂', 3072), ('😡', 378), ('😁', 280), ('😊', 259), ('😢', 225), ('🙄', 219), ('🙌', 215), ('😀', 213), ('😜', 202)]
q2_time - Execution Time: 4.186179161071777 seconds
q2_time - Peak Memory Usage: 532.83984375 MiB
