In [57]:
import pandas as pd
import csv
import re
import concurrent.futures
import tqdm
import numpy as np
from collections import Counter 

# Lexicon

In [2]:
# Load the Emoji Sentiment Ranking lexicon with all of its columns.
# The narrower `usecols` variant below stays disabled because the Score
# computed in a later cell needs Occurrences, Negative and Positive.
# cols = ['Emoji', 'Position', 'Unicode name']
# lexicon_df = pd.read_csv('Emoji_Sentiment_Data_v1.0.csv',  usecols = cols)
lexicon_df = pd.read_csv('Emoji_Sentiment_Data_v1.0.csv')
# lexicon_df['Occurrence'] = 0
lexicon_df.head()

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


## Only keep Top 751 out of 969 

In [3]:
# Keep only the most frequently observed emoji and derive a sentiment score.
# sort_values() returns a new frame, so its result must be assigned for the
# slice below to really be the "top N by Occurrences". mergesort is stable,
# so rows with equal Occurrences keep the order they have in the file.
TOP_N_EMOJIS = 751
lexicon_df = (
    lexicon_df
    .sort_values('Occurrences', ascending=False, kind='mergesort')
    .head(TOP_N_EMOJIS)
    .copy()  # own the data so adding 'Score' does not write into a slice
)
# Score = share of positive occurrences minus share of negative occurrences.
lexicon_df['Score'] =  (lexicon_df['Positive']/lexicon_df['Occurrences']) - (lexicon_df['Negative']/lexicon_df['Occurrences'])
lexicon_df

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,Score
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.220968
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,0.746087
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,0.657615
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.677937
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,-0.093377
...,...,...,...,...,...,...,...,...,...,...
746,♮,0x266e,5,0.936640,0,4,1,MUSIC NATURAL SIGN,Miscellaneous Symbols,0.200000
747,🅾,0x1f17e,5,0.977469,2,2,1,NEGATIVE SQUARED LATIN CAPITAL LETTER O,Enclosed Alphanumeric Supplement,-0.200000
748,🔄,0x1f504,5,0.971014,0,5,0,ANTICLOCKWISE DOWNWARDS AND UPWARDS OPEN CIRCL...,Miscellaneous Symbols and Pictographs,0.000000
749,☄,0x2604,5,0.435374,0,5,0,COMET,Miscellaneous Symbols,0.000000


# Emoji processing

In [4]:
# Emoji -> sentiment score table, indexed by the emoji character and ordered
# by code point; it is turned into a dict afterwards for fast lookups.
emo_score = (
    lexicon_df[['Emoji', 'Score']]
    .sort_values('Emoji')
    .set_index('Emoji')
)
emo_score

Unnamed: 0_level_0,Score
Emoji,Unnamed: 1_level_1
¦,0.625000
©,0.117788
®,0.284672
۞,0.000000
۩,0.000000
...,...
🚹,0.769231
🚺,0.200000
🚼,0.666667
🚿,0.705882


In [5]:
# Nested lookup table: es_dict['Score'][emoji] -> sentiment score.
es_dict = {'Score': emo_score['Score'].to_dict()}
es_dict

{'Score': {'¦': 0.625,
  '©': 0.11778846153846154,
  '®': 0.2846715328467153,
  '۞': 0.0,
  '۩': 0.0,
  '↪': 0.125,
  '↳': 0.0,
  '↾': 0.6666666666666666,
  '↿': 0.6666666666666666,
  '⇧': 0.14285714285714285,
  '⇨': 0.5263157894736842,
  '⇩': 0.0,
  '⌒': 0.7,
  '⌚': 0.23529411764705885,
  '⌛': 0.14285714285714285,
  '⏩': 0.16666666666666666,
  '⏰': 0.5384615384615384,
  '⏳': 0.0,
  'Ⓐ': -0.14285714285714285,
  'Ⓔ': 0.5,
  'Ⓛ': 0.5,
  'Ⓜ': 0.39999999999999997,
  '─': 0.14893617021276595,
  '━': 0.17948717948717946,
  '│': 0.35074626865671643,
  '┃': 0.5,
  '┈': -0.7142857142857142,
  '┊': 1.0,
  '┐': -0.2,
  '┓': 0.6666666666666666,
  '┛': 0.75,
  '┣': 0.6666666666666666,
  '┳': -0.4,
  '┻': -0.5,
  '┼': 0.0,
  '═': 0.016129032258064516,
  '║': 0.1506849315068493,
  '╔': 0.3076923076923077,
  '╗': 0.42857142857142855,
  '╚': 0.3333333333333333,
  '╝': 0.5384615384615384,
  '╠': 0.23076923076923078,
  '╣': 0.0,
  '╥': 0.125,
  '╦': 0.45454545454545453,
  '╩': 0.22727272727272727,
  '╬':

### Convert Emoji Variants to base form

### Variation Selector-16
VS-16 (U+FE0F) is appended to an emoji's code point and changes how it is rendered, e.g. it turns ☹ into ☹️.
It does the same for some other emojis, such as ❤ into ❤️. What we need is the unmodified version, because that is the one in the lexicon.

In [6]:
# Show that the lexicon only knows the bare code point: appending VS-16
# (U+FE0F) produces a different string, so the lookup misses.
sad_face_unicode = '\U00002639'
yellow_sad_face_unicode = sad_face_unicode + '\U0000FE0F'

lexicon_scores = es_dict['Score']
for label, glyph in (
    ('Sad face without Variant 16 modifier: ', sad_face_unicode),
    ('Sad face with Variant 16 modifier: ', yellow_sad_face_unicode),
):
    print(label, glyph)
    # None means the emoji does not have a sentiment score in the Emoji Sentiment Ranking
    print(lexicon_scores.get(glyph))

Sad face without Variant 16 modifier:  ☹
-0.6
Sad face with Variant 16 modifier:  ☹️
None


# Re-start from here
# OPEN TWEETS FILE, CHANGE FILENAME VAR

In [7]:
# Input file: the CLEANED tweets export (change `fn` to switch files).
# Note: tweets1.csv is a copy of tweets3 that is not UTF-8 encoded and has no
# full quoting; it was used for all testing.
fn = 'tweets1utf8_cleaned'
extension = '.csv'

# Load the tweets and attach the two columns the scoring step fills in:
# 'score' starts at 0.0 and 'contains_emoji' starts as False for every row.
tweets_df = pd.read_csv(fn + extension, index_col=0).assign(
    score=0.0,
    contains_emoji=False,
)
tweets_df.head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
0,<mention> yonnn ikaw ang alay tsong lol,2020-03-25T01:50:43.000Z,tl,76292344,0.0,False
1,panoorin ko ulit gangnam beauty,2020-03-25T01:50:44.000Z,tl,836197243757592578,0.0,False
2,<mention> hahahahahahaha i know u have,2020-03-25T01:50:46.000Z,tl,2918002014,0.0,False
3,<mention> no more hotel accomm. puno na tanan....,2020-03-25T01:50:46.000Z,tl,339543717,0.0,False
4,way kahumanang throwback lang <link>,2020-03-25T01:50:47.000Z,tl,74405457,0.0,False


In [8]:
# Regex fragments for the emoji modifiers that get stripped from tweet text.
SKIN_MODS = r'[\U0001F3FB-\U0001F3FF]'  # skin-tone modifiers, light to dark
MOD1 = r'\U0000FE0F'                    # variant selector-16

# Alternation matching either kind of modifier.
MOD_PATTERN = '|'.join((SKIN_MODS, MOD1))

In [9]:
%%time

# convert to record array, then iterrate is faster than df.iterrows
df_rec = tweets_df.to_records()
for row in df_rec:
    tweets_df.at[row[0], 'cleaned'] = re.sub(MOD_PATTERN, '', row[1])

Wall time: 8.9 s


# Emoji Mapping (Single-Thread)

In [10]:
# # EMOJI REGEX
# EMOJIS1 = r'[\U000021aa-\U0000fffd]' # ↪ [0x21aa] until � [0xfffd]
# EMOJIS2 = r'[\U0000fffd-\U0001F6c0]' # 🃏 [0x1f0cf] until 🛀 [0x1f6c0]
# PATTERN = EMOJIS1 + "|" + EMOJIS2 
# BY SIR ED

In [11]:
# EMOJI REGEX
# Code-point ranges that follow the leading U+00A6..U+00AE range in PATTERN.
EMOJIS_REG = [
    r'[\U000006de-\U000006e9]',
    r'[\U000021aa-\U00002b50]',
    r'[\U0000fffc-\U0000fffd]',
    r'[\U0001f0cf-\U0001f4fb]',
    r'[\U0001f504-\U0001f64f]',
    r'[\U0001f680-\U0001f6c0]',
]

# Single alternation of one-character classes: the first range, then every
# entry of EMOJIS_REG in order.
PATTERN = '|'.join([r'[\U000000a6-\U000000ae]'] + EMOJIS_REG)

# These ranges are more precise than one wide span,
# but they still have a lot of gaps.


In [12]:
%%time
# tweet ind, row (ti, tr)
# emoji ind, row (ei, er)
# es_dict: dictionary of emoji-score pair for faster lookup
Ctr = Counter()
Unique_ctr = Counter()

for row in df_rec:
    emoji_found = re.findall(PATTERN, row[1]) # row[1] is text
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                Unique_ctr[ue] += 1
        # Score Emojis
        tweets_df.at[row[0], 'contains_emoji'] = True # row[0] is index
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                Ctr[emoji] += 1
                tweets_df.at[row[0], 'score'] = tweets_df.at[row[0], 'score'] + es_dict['Score'].get(emoji)

# iterating on records is faster than df.iterrows()
# 9 to 10 secs on full file

Wall time: 9.24 s


In [13]:
# Five tweets with the highest summed emoji score.
tweets_df.sort_values('score', ascending=False).head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
187771,🇱🇦🇱🇧🇱🇨🇱🇮🇱🇰🇱🇷🇱🇸🇱🇹🇱🇺🇱🇻🇱🇾🇲🇦🇲🇨🇲🇩🇲🇪🇲🇫🇲🇬🇲🇭🇲🇰🇲🇱🇲🇲🇲🇳🇲🇴...,2020-03-27T12:16:55.000Z,und,977237372105453568,89.326708,True
152403,<mention> <mention> 💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕...,2020-03-27T03:22:01.000Z,und,736078177,88.608333,True
187655,we heal as one🙏🇦🇨🇦🇩🇦🇪🇦🇫🇦🇬🇦🇮🇦🇱🇦🇲🇦🇴🇦🇶🇦🇷🇦🇸🇦🇹🇦🇺🇦🇼🇦...,2020-03-27T12:15:45.000Z,en,977237372105453568,79.256046,True
231057,<mention> 😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍...,2020-03-28T06:53:13.000Z,und,779890766952804352,56.94669,True
389188,#godhealourland\n\n💙💙💙💙💙💙💙💙💙💙\n⭐ 💙💙💙💙💙💙💙💙💙\n ...,2020-03-30T10:55:02.000Z,und,1075956891396669440,47.288363,True


In [14]:
# Five tweets with the lowest summed emoji score.
tweets_df.sort_values('score').head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
267445,☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹☹ <link>,2020-03-28T13:36:42.000Z,und,848381430002401280,-23.4,True
42705,😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶...,2020-03-25T12:30:26.000Z,und,1008479406,-20.314485,True
213669,lami kaayo sa tnan mag teleport dung davao rn ...,2020-03-27T20:40:23.000Z,tl,78216350,-14.485714,True
236796,┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻...,2020-03-28T08:16:48.000Z,et,2998554614,-14.4,True
449555,"hindi maintindihan yung google translate, hind...",2020-03-31T07:19:31.000Z,tl,1181218051280134147,-12.506606,True


### Separate tweets with emoji and tweets without emoji
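A tweet counts as containing emoji whenever the emoji regex (`PATTERN`) matches at least one character in it, even if none of the matches is in the lexicon (its score then stays 0). A tweet whose only emoji fall outside the `PATTERN` ranges is counted as a tweet w/out emoji.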

In [15]:
# Independent copy of the rows where the emoji regex matched something, so
# columns can be added to it without touching tweets_df.
emoji_mask = tweets_df['contains_emoji'] == True
tweets_emoji_df = tweets_df.loc[emoji_mask].copy()
tweets_emoji_df.head()

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
8,<mention> wow nauumay sa babae jiba na😆🤧,2020-03-25T01:50:50.000Z,tl,1031902770,0.411765,True
10,momsh angge <mention> ung kita tlga ung kilig ...,2020-03-25T01:50:52.000Z,tl,831308328,1.379691,True
14,happy happy birthday mommy!!! i love you very ...,2020-03-25T01:50:54.000Z,en,939403524688900096,5.15293,True
15,"eto na naman tayo, umaga na naman 😊",2020-03-25T01:50:55.000Z,tl,971561419970830336,0.644696,True
18,you so hot💙 <link>,2020-03-25T01:50:56.000Z,en,1185625215599927297,0.732456,True


### Map score into positive or negative sentiment

In [16]:
# Snapshot the emoji tweets as a record array BEFORE adding 'is_positive',
# so each record is (index, cleaned, date, lang, author, score, contains_emoji)
# and the score sits at position 5.
tweets_emoji_rec = tweets_emoji_df.to_records()
# Default every tweet to positive; the next cell flips the negative ones.
tweets_emoji_df['is_positive'] = True
# Peek at the first record to confirm the field order.
tweets_emoji_rec[0]

(8, '<mention> wow nauumay sa babae jiba na😆🤧', '2020-03-25T01:50:50.000Z', 'tl', 1031902770, 0.41176471, True)

In [17]:
# Flag tweets whose summed emoji score is below zero as negative. A score of
# exactly 0 keeps the is_positive=True default assigned in the previous cell.
# Selecting by the 'score' column name replaces the positional row[5] lookup
# on the record array and updates all rows in one vectorized step.
tweets_emoji_df.loc[tweets_emoji_df['score'] < 0, 'is_positive'] = False

tweets_emoji_df.sort_values('score', ascending=False)

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
187771,🇱🇦🇱🇧🇱🇨🇱🇮🇱🇰🇱🇷🇱🇸🇱🇹🇱🇺🇱🇻🇱🇾🇲🇦🇲🇨🇲🇩🇲🇪🇲🇫🇲🇬🇲🇭🇲🇰🇲🇱🇲🇲🇲🇳🇲🇴...,2020-03-27T12:16:55.000Z,und,977237372105453568,89.326708,True,True
152403,<mention> <mention> 💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕💕...,2020-03-27T03:22:01.000Z,und,736078177,88.608333,True,True
187655,we heal as one🙏🇦🇨🇦🇩🇦🇪🇦🇫🇦🇬🇦🇮🇦🇱🇦🇲🇦🇴🇦🇶🇦🇷🇦🇸🇦🇹🇦🇺🇦🇼🇦...,2020-03-27T12:15:45.000Z,en,977237372105453568,79.256046,True,True
231057,<mention> 😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍😍...,2020-03-28T06:53:13.000Z,und,779890766952804352,56.946690,True,True
389188,#godhealourland\n\n💙💙💙💙💙💙💙💙💙💙\n⭐ 💙💙💙💙💙💙💙💙💙\n ...,2020-03-30T10:55:02.000Z,und,1075956891396669440,47.288363,True,True
...,...,...,...,...,...,...,...
449555,"hindi maintindihan yung google translate, hind...",2020-03-31T07:19:31.000Z,tl,1181218051280134147,-12.506606,True,False
236796,┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻┳|\n┳┻|\n┻...,2020-03-28T08:16:48.000Z,et,2998554614,-14.400000,True,False
213669,lami kaayo sa tnan mag teleport dung davao rn ...,2020-03-27T20:40:23.000Z,tl,78216350,-14.485714,True,False
42705,😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶😶...,2020-03-25T12:30:26.000Z,und,1008479406,-20.314485,True,False


In [18]:
# Rows in which the emoji regex found nothing.
no_emoji_mask = tweets_df['contains_emoji'] == False
tweets_no_emoji_df = tweets_df.loc[no_emoji_mask]
tweets_no_emoji_df

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji
0,<mention> yonnn ikaw ang alay tsong lol,2020-03-25T01:50:43.000Z,tl,76292344,0.0,False
1,panoorin ko ulit gangnam beauty,2020-03-25T01:50:44.000Z,tl,836197243757592578,0.0,False
2,<mention> hahahahahahaha i know u have,2020-03-25T01:50:46.000Z,tl,2918002014,0.0,False
3,<mention> no more hotel accomm. puno na tanan....,2020-03-25T01:50:46.000Z,tl,339543717,0.0,False
4,way kahumanang throwback lang <link>,2020-03-25T01:50:47.000Z,tl,74405457,0.0,False
...,...,...,...,...,...,...
542447,<mention> <mention> isnt that how you should n...,2020-04-01T08:56:15.000Z,en,880044986431021056,0.0,False
542448,"di ako sinuyo, naghanap ng iba. hahaha kasi na...",2020-04-01T08:56:16.000Z,tl,1208114580,0.0,False
542450,<mention> account is temporarily unavailable b...,2020-04-01T08:56:17.000Z,tl,1010871456303964160,0.0,False
542451,gi kapoy nakos akong kinabuhi 🥺,2020-04-01T08:56:17.000Z,tl,851267043017895936,0.0,False


### Count how many times an emoji appeared in a positive/negative tweet

In [28]:
# Emoji tweets labelled positive (summed score not below zero).
positive_mask = tweets_emoji_df['is_positive'] == True
tweets_positive = tweets_emoji_df.loc[positive_mask]
tweets_positive

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
8,<mention> wow nauumay sa babae jiba na😆🤧,2020-03-25T01:50:50.000Z,tl,1031902770,0.411765,True,True
10,momsh angge <mention> ung kita tlga ung kilig ...,2020-03-25T01:50:52.000Z,tl,831308328,1.379691,True,True
14,happy happy birthday mommy!!! i love you very ...,2020-03-25T01:50:54.000Z,en,939403524688900096,5.152930,True,True
15,"eto na naman tayo, umaga na naman 😊",2020-03-25T01:50:55.000Z,tl,971561419970830336,0.644696,True,True
18,you so hot💙 <link>,2020-03-25T01:50:56.000Z,en,1185625215599927297,0.732456,True,True
...,...,...,...,...,...,...,...
542432,😭😘😍😡🥺☺🍑🇫🇷♒🐥🐶👻😣👉😗👦❤🥰🙄🍱🤣💕👈☀🤦‍♂🍌🤬😪📷💯 <link>,2020-04-01T08:56:03.000Z,und,629254076,8.888621,True,True
542433,thankyou po ❤ <mention> <link>,2020-04-01T08:56:04.000Z,en,825364488220090368,0.746087,True,True
542443,hindi ko napigilan antok ko 🤦‍♂🤦‍♂,2020-04-01T08:56:11.000Z,tl,2954367480,0.400000,True,True
542444,<mention> salamat dad. 😘,2020-04-01T08:56:12.000Z,tl,945550163476545536,0.701754,True,True


In [29]:
# Emoji tweets labelled negative (summed score below zero).
negative_mask = tweets_emoji_df['is_positive'] == False
tweets_negative = tweets_emoji_df.loc[negative_mask]
tweets_negative

Unnamed: 0,cleaned,date,lang,author,score,contains_emoji,is_positive
37,tangna nararanasan kona hirap at pagod ng buha...,2020-03-25T01:51:11.000Z,tl,4241957205,-0.093377,True,False
66,<mention> same feels same feels. its just like...,2020-03-25T01:51:36.000Z,en,843447828122763264,-0.093377,True,False
138,huyyy itom jud lagiii kooo driii 🤣😭💩💩💩,2020-03-25T01:52:54.000Z,tl,1110205395128082432,-0.447089,True,False
143,miss kona mga pinsan ko🥺🥺🥺🥺😢😪,2020-03-25T01:53:00.000Z,tl,1029307156934230018,-0.074237,True,False
175,asa naman ang quarantine pass 😞,2020-03-25T01:53:40.000Z,tl,293798196,-0.118421,True,False
...,...,...,...,...,...,...,...
542397,haaay kawawa naman ☹ they just wanted some hel...,2020-04-01T08:55:34.000Z,tl,4777079960,-0.600000,True,False
542399,same nightmare again. 💔,2020-04-01T08:55:35.000Z,en,736122890513354752,-0.121951,True,False
542404,<mention> <mention> 😡🔪i demand kisses,2020-04-01T08:55:44.000Z,en,880044986431021056,-0.102242,True,False
542434,😩 <link>,2020-04-01T08:56:04.000Z,und,941794889297051648,-0.368363,True,False


In [35]:
# Record-array view of the positive tweets; the counting cell below rebuilds
# it, so this cell only serves to inspect the field order of one record.
tweets_pos_rec = tweets_positive.to_records()
tweets_pos_rec[0]

(8, '<mention> wow nauumay sa babae jiba na😆🤧', '2020-03-25T01:50:50.000Z', 'tl', 1031902770, 0.41176471, True, True)

In [39]:
%%time
pos_ctr = Counter()
pos_unique_ctr = Counter()

tweets_pos_rec = tweets_positive.to_records()
for row in tweets_pos_rec:
    emoji_found = re.findall(PATTERN, row[1]) # row[1] is text
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                pos_unique_ctr[ue] += 1
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                pos_ctr[emoji] += 1

Wall time: 1.04 s


In [40]:
pos_ctr

Counter({'😆': 2199,
         '😍': 11590,
         '😘': 5825,
         '❤': 23294,
         '🎊': 172,
         '🎉': 902,
         '😊': 4865,
         '💙': 2608,
         '😋': 1898,
         '💕': 3597,
         '🙈': 1466,
         '😇': 1588,
         '❗': 454,
         '♂': 3921,
         '😭': 3462,
         '😂': 34806,
         '💖': 3555,
         '😥': 830,
         '💓': 1509,
         '🍔': 28,
         '🍟': 51,
         '😬': 973,
         '✨': 2033,
         '💁': 336,
         '💜': 1742,
         '🌺': 37,
         '😅': 6028,
         '😁': 2686,
         '😢': 2846,
         '👏': 2247,
         '☺': 2281,
         '✌': 1483,
         '😚': 815,
         '💪': 1464,
         '👌': 1220,
         '🔥': 2237,
         '😟': 280,
         '👇': 335,
         '😎': 545,
         '🙏': 8654,
         '☑': 142,
         '🙌': 1031,
         '👍': 1035,
         '😹': 289,
         '😜': 787,
         '👀': 674,
         '💋': 522,
         '🌙': 155,
         '💗': 1372,
         '♥': 2913,
         '😩': 770,


In [41]:
%%time
neg_ctr = Counter()
neg_unique_ctr = Counter()

tweets_neg_rec = tweets_negative.to_records()
for row in tweets_neg_rec:
    emoji_found = re.findall(PATTERN, row[1]) # row[1] is text
    if len(emoji_found) > 0:
        # Count Unique Emojis
        unique_emojis = set(emoji_found)
        for ue in unique_emojis:
            if es_dict['Score'].get(ue) != None:
                neg_unique_ctr[ue] += 1
        for emoji in emoji_found:
            if es_dict['Score'].get(emoji) != None:
                neg_ctr[emoji] += 1

Wall time: 211 ms


### Save Tweets into csv

In [19]:
# Write the two tweet subsets next to the input file, fully quoted, as
# <fn>_emoji.csv and <fn>_no_emoji.csv.
# TODO: also export the full frame:
#tweets_df.to_csv(filename+'_cleaned'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)
for suffix, frame in (
    ('_emoji', tweets_emoji_df),
    ('_no_emoji', tweets_no_emoji_df),
):
    frame.to_csv(fn + suffix + extension, encoding='utf-8', quoting=csv.QUOTE_ALL)

### Save counter into csv

In [20]:

# NOTE(review): count_df is only built in the cell that follows this one, so
# on a fresh kernel this display cell has to run after it.
count_df

Unnamed: 0,total_count
😆,2203
😍,11596
😘,5827
❤,23311
🎊,173
...,...
🏇,1
🚌,2
👡,1
💷,1


In [51]:
def _counter_frame(counter, column):
    """Return a one-column DataFrame built from an emoji Counter.

    The index holds the emoji (in the counter's insertion order) and the
    single column, named `column`, holds the counts. Going through a named
    int64 Series keeps the column present even when the counter is empty,
    so the later joins and sort_values('total_count') still find it.
    """
    return pd.Series(counter, name=column, dtype='int64').to_frame()


# Totals over all tweets.
count_df = _counter_frame(Ctr, 'total_count')
unique_count_df = _counter_frame(Unique_ctr, 'unique_count')

# Counts restricted to tweets labelled positive.
pos_unique_count_df = _counter_frame(pos_unique_ctr, 'unique_count_in_positive_tweets')
pos_count_df = _counter_frame(pos_ctr, 'total_count_in_positive_tweets')

# Counts restricted to tweets labelled negative.
neg_unique_count_df = _counter_frame(neg_unique_ctr, 'unique_count_in_negative_tweets')
neg_count_df = _counter_frame(neg_ctr, 'total_count_in_negative_tweets')

In [21]:

unique_count_df

Unnamed: 0,unique_count
😆,1810
😍,7682
😘,4589
🎊,162
❤,16971
...,...
🏇,1
🚌,2
👡,1
💷,1


In [59]:
# Per-sentiment tables: total and per-tweet counts side by side.
mpos_df = pos_count_df.join(pos_unique_count_df, how="outer")
mneg_df = neg_count_df.join(neg_unique_count_df, how="outer")

# One row per emoji with all six counters; an emoji absent from one side
# gets 0 there. Most frequent emoji come first.
metrics_df = (
    count_df
    .join(unique_count_df, how="outer")
    .join(mpos_df, how="outer")
    .join(mneg_df, how="outer")
    .fillna(0)
    .sort_values('total_count', ascending=False)
)  #.applymap(np.int64)

In [60]:
metrics_df

Unnamed: 0,total_count,unique_count,total_count_in_positive_tweets,unique_count_in_positive_tweets,total_count_in_negative_tweets,unique_count_in_negative_tweets
😂,35019,23790,34806,23598,213,192
❤,23311,16971,23294,16956,17,15
😭,18776,10541,3462,2478,15314,8063
😍,11596,7682,11590,7676,6,6
🙏,8699,6068,8654,6027,45,41
...,...,...,...,...,...,...
🎽,1,1,1,1,0,0
🏪,1,1,1,1,0,0
👝,1,1,1,1,0,0
📯,1,1,0,0,1,1


In [61]:
# Write the per-emoji metrics table to <fn>_metrics.csv, fully quoted.
metrics_df.to_csv(fn+'_metrics'+extension, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [83]:
# df1 = pd.read_csv(fn+'_metrics.csv').rename(columns={'Unnamed: 0':'emoji'})
# df2 = pd.read_csv(fn+'_metrics.csv').rename(columns={'Unnamed: 0':'emoji'})

In [84]:
# df1.head()

Unnamed: 0,emoji,total_count,unique_count,total_count_in_positive_tweets,unique_count_in_positive_tweets,total_count_in_negative_tweets,unique_count_in_negative_tweets
0,😂,35019,23790,34806,23598,213,192
1,❤,23311,16971,23294,16956,17,15
2,😭,18776,10541,3462,2478,15314,8063
3,😍,11596,7682,11590,7676,6,6
4,🙏,8699,6068,8654,6027,45,41


In [86]:
# pd.concat([df1, df2]).groupby(['emoji']).sum().sort_values('total_count', ascending=False)

Unnamed: 0_level_0,total_count,unique_count,total_count_in_positive_tweets,unique_count_in_positive_tweets,total_count_in_negative_tweets,unique_count_in_negative_tweets
emoji,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
😂,70038,47580,69612,47196,426,384
❤,46622,33942,46588,33912,34,30
😭,37552,21082,6924,4956,30628,16126
😍,23192,15364,23180,15352,12,12
🙏,17398,12136,17308,12054,90,82
...,...,...,...,...,...,...
🎽,2,2,2,2,0,0
🏪,2,2,2,2,0,0
👝,2,2,2,2,0,0
📯,2,2,0,0,2,2
