In [1]:
#!pip install emoji_extractor

In [2]:
from io import IOBase
from emoji_extractor.extract import Extractor
from collections import defaultdict
import os, gzip
import json
import time
import pickle
from datetime import datetime
import pandas as pd

extract = Extractor()

In [3]:
from typing import Iterable
from collections import deque
from tqdm import tqdm

def count_lines(fd: IOBase):
    current_position = fd.tell()
    fd.seek(0)
    num_lines = sum(1 for _ in fd)
    fd.seek(current_position, 0)
    return num_lines


def separate_emoji_components(emoji: str) -> Iterable[str]:
    yield from component in emoji:

def count_emojis(data_path: str, output_path: str=None, ignore_retweets: bool=True) -> defaultdict(tuple):
    """
    creates a dict of time series emoji counts of the form {date: (daily_tweet_count, {emoji: emoji_count})}
    """
    res = defaultdict(list)
    for file in sorted([os.listdir(data_path)[0]]):
        if os.path.isfile(os.path.join(data_path, file)):
            with gzip.open(os.path.join(data_path, file), 'rb') as f:
                total_tweets = count_lines(f)
                print(f"Start Processing {total_tweets} tweets")
                for tweet in tqdm(f, total=total_tweets):
                    tweet_json = json.loads(tweet)
                    if 'limit' in tweet_json.keys():
                        continue
                    created_at = tweet_json["created_at"] 
                    date_struct = time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
                    date = datetime(date_struct.tm_year, date_struct.tm_mon, date_struct.tm_mday).strftime('%Y-%m-%d')
                    if 'retweeted_status' in tweet_json.keys():
                        if ignore_retweets:
                            continue
                        if tweet_json['retweeted_status']['truncated']:
                            text = tweet_json['retweeted_status']['extended_tweet']['full_text']
                        else:
                            text = tweet_json['retweeted_status']['text']
                    elif tweet_json['truncated']:
                        text = tweet_json['extended_tweet']['full_text']
                    else:
                        text = tweet_json['text']
                    if date not in res.keys():
                        res[date] = [0, defaultdict(int)]
                    res[date][0] += 1
                    count_emojis_dict = dict(extract.count_emoji(text, check_first=True))
                    for emoji, count in count_emojis_dict.items():
                        for component in separate_emoji_components(emoji):
                            res[date][1][component] += count

    if output_path:
        with open(os.path.join(output_path, "emoji_count_dict.pkl"), "wb") as fout:
            pickle.dump(res, fout)
    return res

In [4]:
import os


data_path = os.path.join("data")
count_moji = count_emojis(data_path)

100%|██████████| 192751/192751 [02:31<00:00, 1273.08it/s]


Start Processing 192751 tweets


In [5]:
holding_hands =  ['🙏']#, '🙏🏽', '🙏🏼', '🙏🏾']
claping_hands = ['👏']#,'👏🏼']
shaking_hands = ['🤝']
names = ['holding_hands', 'claping_hands', 'shaking_hands']

emojis = [holding_hands, claping_hands, shaking_hands]

In [6]:
def emoji_statistics(emoji_groups:list, counts:dict) -> pd.DataFrame:
    data = []
    for date in counts.keys():
        posts_in_day = counts[date][0]
        count = 0
        day_data = {'date': date}
        for i, emoji_group in enumerate(emoji_groups):
            for moji in emoji_group:
                count += counts[date][1][moji]
            norm_count = (count / posts_in_day) * 100
            day_data.update({names[i]: norm_count})
        data.append(day_data)
    return pd.DataFrame(data)
        

In [7]:
df = emoji_statistics(emojis, count_moji)

In [8]:
df.head()

Unnamed: 0,date,holding_hands,claping_hands,shaking_hands
0,2020-04-04,1.063363,1.578601,1.622451
1,2020-04-05,1.034383,1.421827,1.432639
