In [1]:
import pandas as pd
import csv
import re
import concurrent.futures
import tqdm
import numpy as np
from collections import Counter 

# Lexicon

In [2]:
# cols = ['Emoji', 'Position', 'Unicode name']
# lexicon_df = pd.read_csv('Emoji_Sentiment_Data_v1.0.csv',  usecols = cols)
lexicon_df = pd.read_csv('Emoji_Sentiment_Data_v1.0.csv')
# lexicon_df['Occurrence'] = 0
lexicon_df.head()

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


## Only keep Top 751 out of 969 

In [3]:
lexicon_df.sort_values('Occurrences', ascending=False)
lexicon_df = lexicon_df[:751]
lexicon_df['Score'] =  (lexicon_df['Positive']/lexicon_df['Occurrences']) - (lexicon_df['Negative']/lexicon_df['Occurrences'])
lexicon_df

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,Score
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.220968
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,0.746087
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,0.657615
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.677937
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,-0.093377
...,...,...,...,...,...,...,...,...,...,...
746,♮,0x266e,5,0.936640,0,4,1,MUSIC NATURAL SIGN,Miscellaneous Symbols,0.200000
747,🅾,0x1f17e,5,0.977469,2,2,1,NEGATIVE SQUARED LATIN CAPITAL LETTER O,Enclosed Alphanumeric Supplement,-0.200000
748,🔄,0x1f504,5,0.971014,0,5,0,ANTICLOCKWISE DOWNWARDS AND UPWARDS OPEN CIRCL...,Miscellaneous Symbols and Pictographs,0.000000
749,☄,0x2604,5,0.435374,0,5,0,COMET,Miscellaneous Symbols,0.000000


# Emoji processing

In [4]:
# create a hashmap of emojis and the emoji sentiment score
# for faster lookup
emo_score = lexicon_df[['Emoji', 'Score']]
emo_score = emo_score.sort_values('Emoji')
emo_score = emo_score.set_index('Emoji')
emo_score

Unnamed: 0_level_0,Score
Emoji,Unnamed: 1_level_1
¦,0.625000
©,0.117788
®,0.284672
۞,0.000000
۩,0.000000
...,...
🚹,0.769231
🚺,0.200000
🚼,0.666667
🚿,0.705882


In [5]:
es_dict = emo_score.to_dict()
es_dict

{'Score': {'¦': 0.625,
  '©': 0.11778846153846154,
  '®': 0.2846715328467153,
  '۞': 0.0,
  '۩': 0.0,
  '↪': 0.125,
  '↳': 0.0,
  '↾': 0.6666666666666666,
  '↿': 0.6666666666666666,
  '⇧': 0.14285714285714285,
  '⇨': 0.5263157894736842,
  '⇩': 0.0,
  '⌒': 0.7,
  '⌚': 0.23529411764705885,
  '⌛': 0.14285714285714285,
  '⏩': 0.16666666666666666,
  '⏰': 0.5384615384615384,
  '⏳': 0.0,
  'Ⓐ': -0.14285714285714285,
  'Ⓔ': 0.5,
  'Ⓛ': 0.5,
  'Ⓜ': 0.39999999999999997,
  '─': 0.14893617021276595,
  '━': 0.17948717948717946,
  '│': 0.35074626865671643,
  '┃': 0.5,
  '┈': -0.7142857142857142,
  '┊': 1.0,
  '┐': -0.2,
  '┓': 0.6666666666666666,
  '┛': 0.75,
  '┣': 0.6666666666666666,
  '┳': -0.4,
  '┻': -0.5,
  '┼': 0.0,
  '═': 0.016129032258064516,
  '║': 0.1506849315068493,
  '╔': 0.3076923076923077,
  '╗': 0.42857142857142855,
  '╚': 0.3333333333333333,
  '╝': 0.5384615384615384,
  '╠': 0.23076923076923078,
  '╣': 0.0,
  '╥': 0.125,
  '╦': 0.45454545454545453,
  '╩': 0.22727272727272727,
  '╬':

### Convert Emoji Variants to base form

### Variant Selector-16
VS-16 is used added to the unicode which modifies ☹ to ☹️
also does the same for some other emojis such as from ❤ to ❤️. What we need is the unmodified version, because that is the one in the lexicon

In [6]:
sad_face_unicode = '\U00002639'
yellow_sad_face_unicode = sad_face_unicode + '\U0000FE0F'
print('Sad face without Variant 16 modifier: ', sad_face_unicode)
print(es_dict['Score'].get(sad_face_unicode))
print('Sad face with Variant 16 modifier: ', yellow_sad_face_unicode)
print(es_dict['Score'].get(yellow_sad_face_unicode))
# None means the emoji does not have a sentiment score in the Emoji Sentiment Ranking

Sad face without Variant 16 modifier:  ☹
-0.6
Sad face with Variant 16 modifier:  ☹️
None


# Re-start from here
# OPEN TWEETS FILE, CHANGE FILENAME VAR

In [35]:
### tweets1.csv is a copy of tweets3 but not encoded in utf-8 and without full quotes
### I used it for all testings

fn = 'tweets5utf8_cleaned' # open the CLEANED version of the file
extension = '.csv'
tweets_df = pd.read_csv(fn+extension, index_col=0)
tweets_df['score'] = 0.0 # add score row
tweets_df['contains_emoji'] = False 
tweets_df.head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
0,hindi maling tao. maling pagkakataon siguro <l...,2020-04-25T07:16:39.000Z,tl,1008799086038769665,0.0,False
1,resign or loa??,2020-04-25T07:16:42.000Z,en,871745373823352832,0.0,False
2,happy birthday!! <mention> 🥳,2020-04-25T07:16:43.000Z,en,1123217470553149440,0.0,False
3,<mention> wait aabot ako diyan mamaya ep6 nko.,2020-04-25T07:16:46.000Z,tl,190977737,0.0,False
4,haha muka nga🤣🤣🤣 <link>,2020-04-25T07:16:47.000Z,tl,1026836291025915907,0.0,False


In [36]:
# REGEX for emoji modifiers

SKIN_MODS = r'[\U0001F3FB-\U0001F3FF]' # unicode range for light to dark skin tone
MOD1 = r'\U0000FE0F'                   # variant selector-16
MOD_PATTERN = SKIN_MODS + "|" + MOD1

In [37]:
%%time

# convert to record array, then iterrate is faster than df.iterrows
df_rec = tweets_df.to_records()
for row in df_rec:
    text = str(row[1])
    tweets_df.at[row[0], 'cleaned'] = re.sub(MOD_PATTERN, '', text)

Wall time: 8.9 s


# Emoji Mapping (Single-Thread)

In [38]:
# # EMOJI REGEX
# EMOJIS1 = r'[\U000021aa-\U0000fffd]' # ↪ [0x21aa] until � [0xfffd]
# EMOJIS2 = r'[\U0000fffd-\U0001F6c0]' # 🃏 [0x1f0cf] until 🛀 [0x1f6c0]
# PATTERN = EMOJIS1 + "|" + EMOJIS2 
# BY SIR ED

In [39]:
# EMOJI REGEX
EMOJIS_REG = [
    #r'[\U000000a6-\U000000ae]',
    r'[\U000006de-\U000006e9]',
    r'[\U000021aa-\U00002b50]',
    r'[\U0000fffc-\U0000fffd]',
    r'[\U0001f0cf-\U0001f4fb]',
    r'[\U0001f504-\U0001f64f]',
    r'[\U0001f680-\U0001f6c0]',
]

PATTERN = r'[\U000000a6-\U000000ae]'
for reg in EMOJIS_REG:
    PATTERN = PATTERN + '|' + reg 
    
# more precise range
# marami parin tong bungi


In [40]:
%%time
# tweet ind, row (ti, tr)
# emoji ind, row (ei, er)
# es_dict: dictionary of emoji-score pair for faster lookup
Ctr = Counter()
Unique_ctr = Counter()

for row in df_rec:
    text = str(row[1]) # row[1] is tweet text
    emoji_found = re.findall(PATTERN, text) 
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                Unique_ctr[ue] += 1
        # Score Emojis
        tweets_df.at[row[0], 'contains_emoji'] = True # row[0] is index
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                Ctr[emoji] += 1
                tweets_df.at[row[0], 'score'] = tweets_df.at[row[0], 'score'] + es_dict['Score'].get(emoji)

# iterating on records is faster than df.iterrows()
# 9 to 10 secs on full file

Wall time: 10 s


In [41]:
tweets_df.sort_values('score', ascending=False).head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
214451,💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛...,2020-04-28T10:36:00.000Z,und,839650871268958208,96.028972,True
19308,<mention> 💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜...,2020-04-25T12:19:21.000Z,und,1084114463924543489,91.842386,True
167857,<mention> bts 4ever <foreign>(<foreign>-<forei...,2020-04-27T14:58:12.000Z,ja,275932807,65.581039,True
64169,<mention> sorrry naaa❤🍑❤❤🍑❤❤❤❤❤❤🍑❤🍑❤🍑❤🍑❤🍑❤🍑❤🍑❤...,2020-04-26T06:26:32.000Z,tl,1078183783835197440,64.41402,True
340983,<mention> <mention> ☁😊☁☁😊☁😁☁\n☁😊☁☁😊☁☁☁\n☁😊😊😊😊☁...,2020-04-30T17:37:27.000Z,und,1039130475162292225,60.178563,True


In [42]:
tweets_df.sort_values('score').head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
187286,☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹ ☹☹...,2020-04-28T00:46:55.000Z,und,931878433721745410,-82.8,True
340608,<mention> lambs attack 🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑...,2020-04-30T17:30:24.000Z,en,69569649,-25.333333,True
219008,lahat naman naging mali sa inyo! ang selfish s...,2020-04-28T11:34:36.000Z,tl,3224013132,-10.364821,True
257163,nag eat ako burger &amp; fries ha mcdo huhuhuh...,2020-04-29T08:52:50.000Z,tl,391768245,-8.161542,True
297273,laguta pud aning gitagaan kog dunkin donut per...,2020-04-30T05:07:28.000Z,tl,1002532744289611776,-7.8,True


### Separate tweets with emoji and tweets without emoji
If a tweet only has emoji that is not in the lexicon, it is counted as tweet w/out emoji

In [43]:
tweets_emoji_df = tweets_df[tweets_df['contains_emoji'] == True].copy()
tweets_emoji_df.head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
8,me: dy paliti ko mangga kay kalami ikaon mangg...,2020-04-25T07:16:51.000Z,tl,3259768975,1.289391,True
9,<mention> wow good news fren sana maakuwi sya ...,2020-04-25T07:16:52.000Z,tl,1015148826569596928,1.253411,True
12,"wa najud koy paki sa mga manglibak nako, mu ju...",2020-04-25T07:16:55.000Z,tl,2570177227,0.928314,True
13,dont missed to watch the <mention> myx it up e...,2020-04-25T07:16:58.000Z,en,869057489962057730,0.0,True
14,summer heat 😓 <link>,2020-04-25T07:17:03.000Z,en,939284930,-0.080586,True


### Map score into positive or negative sentiment

In [44]:
tweets_emoji_rec = tweets_emoji_df.to_records()
tweets_emoji_df['is_positive'] = True
tweets_emoji_rec[0]

(8, 'me: dy paliti ko mangga kay kalami ikaon mangga\nmommy: dy pag grocery anay jari an lista ng imo <link>\ndaddy: yesss!! makapanaw na ako! uno paman?? dali kay ganahan ako mag panaw panaw 😊😊', '2020-04-25T07:16:51.000Z', 'tl', 3259768975, 1.28939109, True)

In [45]:
for row in tweets_emoji_rec:
    if row[5] < 0: 
        tweets_emoji_df.at[row[0], 'is_positive'] = False

tweets_emoji_df.sort_values('score', ascending=False)

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
214451,💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛💚💛...,2020-04-28T10:36:00.000Z,und,839650871268958208,96.028972,True,True
19308,<mention> 💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜💜...,2020-04-25T12:19:21.000Z,und,1084114463924543489,91.842386,True,True
167857,<mention> bts 4ever <foreign>(<foreign>-<forei...,2020-04-27T14:58:12.000Z,ja,275932807,65.581039,True,True
64169,<mention> sorrry naaa❤🍑❤❤🍑❤❤❤❤❤❤🍑❤🍑❤🍑❤🍑❤🍑❤🍑❤🍑❤...,2020-04-26T06:26:32.000Z,tl,1078183783835197440,64.414020,True,True
340983,<mention> <mention> ☁😊☁☁😊☁😁☁\n☁😊☁☁😊☁☁☁\n☁😊😊😊😊☁...,2020-04-30T17:37:27.000Z,und,1039130475162292225,60.178563,True,True
...,...,...,...,...,...,...,...
297273,laguta pud aning gitagaan kog dunkin donut per...,2020-04-30T05:07:28.000Z,tl,1002532744289611776,-7.800000,True,False
257163,nag eat ako burger &amp; fries ha mcdo huhuhuh...,2020-04-29T08:52:50.000Z,tl,391768245,-8.161542,True,False
219008,lahat naman naging mali sa inyo! ang selfish s...,2020-04-28T11:34:36.000Z,tl,3224013132,-10.364821,True,False
340608,<mention> lambs attack 🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑...,2020-04-30T17:30:24.000Z,en,69569649,-25.333333,True,False


In [46]:
tweets_no_emoji_df = tweets_df[tweets_df['contains_emoji'] == False]
tweets_no_emoji_df

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
0,hindi maling tao. maling pagkakataon siguro <l...,2020-04-25T07:16:39.000Z,tl,1008799086038769665,0.0,False
1,resign or loa??,2020-04-25T07:16:42.000Z,en,871745373823352832,0.0,False
2,happy birthday!! <mention> 🥳,2020-04-25T07:16:43.000Z,en,1123217470553149440,0.0,False
3,<mention> wait aabot ako diyan mamaya ep6 nko.,2020-04-25T07:16:46.000Z,tl,190977737,0.0,False
4,haha muka nga🤣🤣🤣 <link>,2020-04-25T07:16:47.000Z,tl,1026836291025915907,0.0,False
...,...,...,...,...,...,...
555908,paalala ko lang!? \npagod kana ba ? laban lang...,2020-05-05T05:21:41.000Z,tl,1245183333673254912,0.0,False
555910,hapakainit!!!!!,2020-05-05T05:21:43.000Z,tl,1138027057122689024,0.0,False
555912,<mention> wag na lng dhen ako tatawa sa susunod,2020-05-05T05:21:48.000Z,tl,1246792944247398401,0.0,False
555913,ambata pa pala <link>,2020-05-05T05:21:49.000Z,tl,719335525740511232,0.0,False


### Count how many times an emoji appeared in a positive/negative tweet

In [47]:
tweets_positive = tweets_emoji_df[tweets_emoji_df['is_positive'] == True]
tweets_positive

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
8,me: dy paliti ko mangga kay kalami ikaon mangg...,2020-04-25T07:16:51.000Z,tl,3259768975,1.289391,True,True
9,<mention> wow good news fren sana maakuwi sya ...,2020-04-25T07:16:52.000Z,tl,1015148826569596928,1.253411,True,True
12,"wa najud koy paki sa mga manglibak nako, mu ju...",2020-04-25T07:16:55.000Z,tl,2570177227,0.928314,True,True
13,dont missed to watch the <mention> myx it up e...,2020-04-25T07:16:58.000Z,en,869057489962057730,0.000000,True,True
15,<mention> hahahahahahaha tse. isa lang gusto m...,2020-04-25T07:17:05.000Z,tl,1071653745702780928,0.220968,True,True
...,...,...,...,...,...,...,...
555905,ang <link>☀🌡,2020-05-05T05:21:38.000Z,tl,1140509448751833089,0.466921,True,True
555906,tawang tawa ko napakarandom ng tooic namin kag...,2020-05-05T05:21:39.000Z,tl,1023742873,0.662905,True,True
555909,<mention> thank you!! ingat din kayo ng fam mo...,2020-05-05T05:21:42.000Z,tl,1319421044,0.713381,True,True
555911,amen 😇 <link>,2020-05-05T05:21:46.000Z,en,26676581,0.600000,True,True


In [48]:
tweets_negative = tweets_emoji_df[tweets_emoji_df['is_positive'] == False]
tweets_negative

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
14,summer heat 😓 <link>,2020-04-25T07:17:03.000Z,en,939284930,-0.080586,True,False
49,😒,2020-04-25T07:17:45.000Z,und,787452237740056576,-0.374729,True,False
56,una palang wala na pano pa kaya sa susunod? 🙄☹...,2020-04-25T07:18:01.000Z,tl,948134457742991360,-0.600000,True,False
62,ayoko na ngang umiyak sabi eh pero yung mga wo...,2020-04-25T07:18:08.000Z,tl,1146943014859788289,-0.093377,True,False
69,skl gitan-aw nako ang the grudge (2020) kagabi...,2020-04-25T07:18:13.000Z,tl,2262349260,-0.177460,True,False
...,...,...,...,...,...,...,...
555855,bored na bored na gyud ko ba 🙄😪,2020-05-05T05:20:32.000Z,tl,1201264575993090048,-0.080913,True,False
555863,"what's happening, i can't get heart beats sinc...",2020-05-05T05:20:43.000Z,en,2296258884,-0.280130,True,False
555867,"taasa na gyd sa ako buhok, taas pas ako suweld...",2020-05-05T05:20:50.000Z,tl,2800037532,-0.600000,True,False
555868,na bother ko sa reactions nila. bat maraming t...,2020-05-05T05:20:54.000Z,tl,760682162798600192,-0.314381,True,False


In [49]:
tweets_pos_rec = tweets_positive.to_records()
tweets_pos_rec[0]

(8, 'me: dy paliti ko mangga kay kalami ikaon mangga\nmommy: dy pag grocery anay jari an lista ng imo <link>\ndaddy: yesss!! makapanaw na ako! uno paman?? dali kay ganahan ako mag panaw panaw 😊😊', '2020-04-25T07:16:51.000Z', 'tl', 3259768975, 1.28939109, True, True)

In [50]:
%%time
pos_ctr = Counter()
pos_unique_ctr = Counter()

tweets_pos_rec = tweets_positive.to_records()
for row in tweets_pos_rec:
    emoji_found = re.findall(PATTERN, row[1]) # row[1] is text
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                pos_unique_ctr[ue] += 1
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                pos_ctr[emoji] += 1

Wall time: 1.03 s


In [51]:
pos_ctr

Counter({'😊': 5889,
         '🙏': 6444,
         '😉': 1707,
         '✌': 1363,
         '😂': 36502,
         '😍': 14699,
         '😢': 2327,
         '☺': 2296,
         '🎶': 1389,
         '👇': 407,
         '😎': 599,
         '🌹': 190,
         '😅': 6708,
         '🍺': 134,
         '😘': 5692,
         '🌊': 277,
         '😩': 886,
         '💓': 1556,
         '❤': 25292,
         '💖': 3991,
         '♥': 3296,
         '🍻': 383,
         '💋': 495,
         '🍕': 179,
         '😳': 1097,
         '💚': 2077,
         '👩': 249,
         '💕': 3650,
         '💯': 2550,
         '👌': 1510,
         '♂': 3386,
         '💦': 768,
         '😄': 633,
         '💜': 3452,
         '😭': 4073,
         '🙈': 1713,
         '☹': 347,
         '😟': 249,
         '😁': 3452,
         '💆': 131,
         '😹': 405,
         '🐷': 229,
         '🐽': 68,
         '🙉': 112,
         '♡': 378,
         '😌': 1614,
         '😆': 2693,
         '💗': 1600,
         '💘': 456,
         '👀': 809,
         '🔥': 2269,


In [52]:
%%time
neg_ctr = Counter()
neg_unique_ctr = Counter()

tweets_neg_rec = tweets_negative.to_records()
for row in tweets_neg_rec:
    emoji_found = re.findall(PATTERN, row[1]) # row[1] is text
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                neg_unique_ctr[ue] += 1
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                neg_ctr[emoji] += 1

Wall time: 205 ms


### Save Tweets into csv

In [53]:
# to add
#tweets_df.to_csv(filename+'_cleaned'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)
tweets_emoji_df.to_csv(fn+'_emoji'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)
tweets_no_emoji_df.to_csv(fn+'_no_emoji'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)

### Save counter into csv

In [54]:
count_df = pd.DataFrame.from_dict(Ctr, orient='index')
count_df = count_df.rename(columns={0:'total_count'})

unique_count_df = pd.DataFrame.from_dict(Unique_ctr, orient='index')
unique_count_df = unique_count_df.rename(columns={0:'unique_count'})

pos_unique_count_df = pd.DataFrame.from_dict(pos_unique_ctr, orient='index')
pos_unique_count_df = pos_unique_count_df.rename(columns={0:'unique_count_in_positive_tweets'})

pos_count_df = pd.DataFrame.from_dict(pos_ctr, orient='index')
pos_count_df = pos_count_df.rename(columns={0:'total_count_in_positive_tweets'})

neg_unique_count_df = pd.DataFrame.from_dict(neg_unique_ctr, orient='index')
neg_unique_count_df = neg_unique_count_df.rename(columns={0:'unique_count_in_negative_tweets'})

neg_count_df = pd.DataFrame.from_dict(neg_ctr, orient='index')
neg_count_df = neg_count_df.rename(columns={0:'total_count_in_negative_tweets'})

In [55]:
count_df

Unnamed: 0,total_count
😊,5890
🙏,6461
😉,1709
✌,1363
😓,439
...,...
🚓,1
🚐,1
🎻,1
🆒,1


In [56]:
unique_count_df

Unnamed: 0,unique_count
😊,5104
🙏,4480
✌,1203
😉,1578
😓,388
...,...
🚐,1
🚓,1
🎻,1
🆒,1


In [57]:
metrics_df = count_df.join(unique_count_df, how="outer")
mpos_df = pos_count_df.join(pos_unique_count_df, how="outer")
mneg_df = neg_count_df.join(neg_unique_count_df, how="outer")
metrics_df = metrics_df.join(mpos_df, how="outer")
metrics_df = metrics_df.join(mneg_df, how="outer")
metrics_df = metrics_df.fillna(0).sort_values('total_count', ascending=False)#.applymap(np.int64)

In [58]:
metrics_df

Unnamed: 0,total_count,unique_count,total_count_in_positive_tweets,unique_count_in_positive_tweets,total_count_in_negative_tweets,unique_count_in_negative_tweets
😂,36747,24974,36502.0,24758.0,245.0,216.0
❤,25303,18340,25292.0,18330.0,11.0,10.0
😭,21352,11490,4073.0,2846.0,17279.0,8644.0
😍,14704,9690,14699.0,9685.0,5.0,5.0
😅,6770,5842,6708.0,5782.0,62.0,60.0
...,...,...,...,...,...,...
👡,1,1,1.0,1.0,0.0,0.0
👛,1,1,1.0,1.0,0.0,0.0
┐,1,1,0.0,0.0,1.0,1.0
🚂,1,1,0.0,0.0,1.0,1.0


In [59]:
metrics_df.to_csv(fn+'_metrics'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [32]:
# df1 = pd.read_csv(fn+'_metrics.csv').rename(columns={'Unnamed: 0':'emoji'})
# df2 = pd.read_csv(fn+'_metrics.csv').rename(columns={'Unnamed: 0':'emoji'})

In [33]:
# df1.head()

In [34]:
# pd.concat([df1, df2]).groupby(['emoji']).sum().sort_values('total_count', ascending=False)