In [12]:
from tokenize import String
from textblob import TextBlob as tb
import pandas as pd
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer as Sia
from gensim.parsing.preprocessing import remove_stopwords
from pathlib import Path
from collections import defaultdict

%matplotlib inline

nltk.download('vader_lexicon')




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gunin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Data

In [13]:
sia = Sia()
emoji_raw = pd.read_html("https://kt.ijs.si/data/Emoji_sentiment_ranking")
emoji_lookup_df = emoji_raw[0][['Char', 'Sentiment score[-1...+1]']]
emoji_lookup_df.rename(columns={'Sentiment score[-1...+1]': 'sentiment_score'}, inplace = True)


csv_path = Path("stockerbot-export.csv")
tweet_df = pd.read_csv(csv_path, error_bad_lines=False)



b'Skipping line 731: expected 8 fields, saw 13\nSkipping line 2836: expected 8 fields, saw 15\nSkipping line 3058: expected 8 fields, saw 12\nSkipping line 3113: expected 8 fields, saw 12\nSkipping line 3194: expected 8 fields, saw 17\nSkipping line 3205: expected 8 fields, saw 17\nSkipping line 3255: expected 8 fields, saw 17\nSkipping line 3520: expected 8 fields, saw 17\nSkipping line 4078: expected 8 fields, saw 17\nSkipping line 4087: expected 8 fields, saw 17\nSkipping line 4088: expected 8 fields, saw 17\nSkipping line 4499: expected 8 fields, saw 12\n'


## Funnction definitions

In [14]:
def clean_text(input_str: str) -> str:
    input_str = input_str.lower()
    input_str = re.sub("[^A-Za-z0-9]"," ",input_str)
    input_str = re.sub(r'^https?:\/\/.*[\r\n]*', '', input_str, flags=re.MULTILINE)
    input_str = re.sub(r"www.\S+",'',input_str)
    input_str = remove_stopwords(input_str)
    return input_str

def get_tb_score(text: str) -> float:
    return tb(text).sentiment[0]

def get_nltk_score(text: str) -> float:
    return sia.polarity_scores(text)['compound']

def build_sentiment_df(tweet_dataframe, target_column):
    # getting the corresponding data in lists
    raw_tweets = tweet_dataframe[target_column].tolist()
    cleaned_tweets = [clean_text(i) for i in raw_tweets]
    tb_polarity = [tb(i).sentiment[0] for i in cleaned_tweets]
    nltk_polarity = [sia.polarity_scores(i)['compound'] for i in cleaned_tweets]
    
    # building the dataframe
    final_df = pd.DataFrame()
    final_df['tweets'] = raw_tweets
    final_df['cleaned_tweets'] = cleaned_tweets
    final_df['textblob_polarity'] = tb_polarity
    final_df['nltk_polarity'] = nltk_polarity

    final_df = final_df[final_df['tweets'].apply(lambda x: len(x.split(',')) < 280)]
    final_df = final_df.drop_duplicates()
    final_df = final_df[final_df['tweets'].str.contains("RT ") == False] 

    return final_df

def get_emoji_count(tweet_df):
    emoji_count = defaultdict(int)
    for i in tweet_df['tweets']:
        for emoji in re.findall(u'[\U0001f300-\U0001f650]', i):
            emoji_count[emoji] += 1

    return emoji_count


In [16]:
emoji_data = get_emoji_count(build_sentiment_df(tweet_df, 'text'))


In [17]:
sort_count = sorted(emoji_data.items(), key=lambda x: x[1], reverse=True)
sort_count

[('💰', 420),
 ('🙌', 262),
 ('😍', 261),
 ('💸', 256),
 ('😊', 223),
 ('💎', 216),
 ('📈', 165),
 ('📉', 54),
 ('🔥', 54),
 ('🌕', 43),
 ('📢', 34),
 ('🎉', 32),
 ('💵', 21),
 ('👀', 18),
 ('👍', 18),
 ('🌑', 18),
 ('😂', 18),
 ('👇', 15),
 ('😎', 12),
 ('📣', 12),
 ('🔹', 11),
 ('💕', 11),
 ('🏻', 10),
 ('😉', 9),
 ('💼', 7),
 ('💳', 7),
 ('🕘', 6),
 ('🔔', 6),
 ('💪', 5),
 ('👈', 5),
 ('🙄', 5),
 ('😘', 5),
 ('🔰', 5),
 ('💲', 4),
 ('🍾', 4),
 ('💯', 4),
 ('🏼', 4),
 ('🐂', 4),
 ('😋', 4),
 ('🏽', 3),
 ('📊', 3),
 ('😀', 3),
 ('😳', 3),
 ('💥', 3),
 ('😜', 3),
 ('🔌', 2),
 ('🗣', 2),
 ('🎼', 2),
 ('🙏', 2),
 ('🕵', 2),
 ('😅', 2),
 ('👁', 2),
 ('😈', 2),
 ('👽', 2),
 ('🔐', 2),
 ('💩', 2),
 ('👉', 2),
 ('🌊', 2),
 ('👏', 2),
 ('😁', 2),
 ('🖐', 2),
 ('😝', 2),
 ('👙', 2),
 ('💜', 2),
 ('🌭', 2),
 ('😱', 2),
 ('🍑', 2),
 ('😄', 2),
 ('🗡', 1),
 ('🍪', 1),
 ('🐔', 1),
 ('😭', 1),
 ('🍕', 1),
 ('🍺', 1),
 ('👟', 1),
 ('💞', 1),
 ('💘', 1),
 ('🙆', 1),
 ('📅', 1),
 ('🎲', 1),
 ('📡', 1),
 ('🐻', 1),
 ('😆', 1),
 ('🔝', 1),
 ('🔍', 1),
 ('💛', 1),
 ('🎊', 1),
 ('😨', 1),
 (

In [18]:
emoji_sample = sort_count[0:20]
q = emoji_sample[0][0]

In [19]:
print(emoji_lookup_df.loc[emoji_lookup_df.Char == q, 'sentiment_score'])

147    0.251
Name: sentiment_score, dtype: float64


In [22]:
def group_emojis(counts):
    pos_lst=[]
    neg_lst=[]
    for emoji in counts:
        score = emoji_lookup_df.loc[emoji_lookup_df.Char == emoji[0], 'sentiment_score']
        if score.values < 0:
            neg_lst.append(emoji)
        else:
            pos_lst.append(emoji)
    return pos_lst[0:7], neg_lst

In [23]:
group_emojis(sort_count)

  


([('💰', 420),
  ('🙌', 262),
  ('😍', 261),
  ('💸', 256),
  ('😊', 223),
  ('💎', 216),
  ('📈', 165)],
 [('🔌', 2), ('💩', 2), ('😭', 1), ('😨', 1), ('😲', 1), ('😤', 1), ('😒', 1)])