In [1]:
import pandas as pd
import numpy as np
import emoji
import regex
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize

In [2]:
def loadFile(name):
    """Load a UTF-16 encoded message CSV into a typed DataFrame.

    Parameters
    ----------
    name : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with the dtypes listed in ``column_dtypes``, the
        ``"Unnamed: 0"`` column removed when it is present, and the
        ``"Datetime"`` column converted with ``pd.to_datetime``.
    """
    column_dtypes = {
        "Message ID": int,
        "Message type": "category",
        "Text": str,
        "Country": "category",
        "Source type": "category",
        "Source name": str,
        "Source ID": str,
        "Datetime": str,
        "From": str,
        "From ID": str,
        "Reply ID": str,
        "Forwarded from": str,
        "Message with media": "bool",
        "Media type": "category",
        "Different fonts": "bool",
        "Action if service": "category",
        "Has link": "bool",
    }
    data = pd.read_csv(name, dtype=column_dtypes, encoding="utf-16")
    # "Unnamed: 0" only exists when the CSV was written together with its
    # index; errors="ignore" keeps files saved without an index loadable
    # instead of failing with a KeyError.
    data = data.drop(columns="Unnamed: 0", errors="ignore")
    data["Datetime"] = pd.to_datetime(data["Datetime"])
    return data

In [3]:
# Read the CSV into the working DataFrame `data`
csv_path = "Reformated/UkraineGroups.csv"
data = loadFile(csv_path)

## Emoji extraction


In [4]:
# Accumulators filled by the emoji-extraction loop:
#   emojis            - emoji -> number of messages containing it
#   errors            - number of messages that could not be processed
#   error_texts       - the values of those unprocessable messages
#   texts_with_emojis - processed messages containing at least one emoji
#   texts_all         - every processed message
emojis, errors = dict(), 0
error_texts, texts_with_emojis, texts_all = [], [], []

In [5]:
# Count, per emoji, the number of messages that contain it, and collect the
# processed texts for the later tokenisation step.
# NOTE: the accumulators are only appended to, so re-running this cell without
# re-running the initialisation cell counts every message twice.
for text in tqdm(data["Text"]):
    # Non-string values cannot be scanned for emojis (presumably these are the
    # NaN placeholders of messages without text). They are recorded explicitly
    # instead of relying on a bare `except:`, which would also swallow
    # KeyboardInterrupt and genuine bugs.
    if not isinstance(text, str):
        error_texts.append(text)
        errors += 1
        continue
    emojis_list = emoji.distinct_emoji_list(text)
    if emojis_list:
        texts_with_emojis.append(text)
    texts_all.append(text)
    # distinct_emoji_list yields each emoji once per message, so this counts
    # messages rather than individual occurrences.
    for e in emojis_list:
        emojis[e] = emojis.get(e, 0) + 1

100%|█████████████████████████████████████████████████████████████████████| 3713436/3713436 [01:48<00:00, 34284.51it/s]


#### Statistics

In [6]:
# Rank the (emoji, count) pairs by count, highest first, and keep the first 50
emoji_ranking = sorted(emojis.items(), key=lambda pair: pair[1], reverse=True)
top50emojis = emoji_ranking[:50]
print(top50emojis)

[('😂', 110182), ('🤣', 40590), ('🇺🇦', 32336), ('😁', 32123), ('👍', 23178), ('🤕', 21630), ('🌚', 20807), ('😅', 19078), ('🤔', 15901), ('🙏', 15806), ('❤️', 15727), ('❗', 13904), ('😆', 11328), ('😉', 10207), ('😭', 10130), ('😡', 10045), ('😏', 8918), ('🥺', 8771), ('🗿', 8616), ('😄', 8201), ('😳', 8161), ('👉', 8153), ('✅', 7745), ('💙', 7455), ('💛', 7436), ('⚡', 7308), ('🤬', 7253), ('😔', 6783), ('😢', 6557), ('😊', 6435), ('🙄', 6432), ('🔥', 6428), ('🇷🇺', 6357), ('🥰', 6314), ('💪', 5836), ('🙈', 5446), ('😑', 5260), ('👌', 5132), ('😍', 4659), ('☢️', 4598), ('🤡', 4494), ('🤮', 4369), ('‼️', 4349), ('🤗', 4294), ('😎', 4276), ('☺️', 4069), ('😀', 4061), ('\U0001f972', 3712), ('🤦\u200d♀️', 3573), ('💬', 3527)]


In [7]:
# Number of messages skipped by the emoji-extraction loop (the `errors`
# counter); the original note attributes these to messages whose text is NaN.
print(errors)

265092


## Word2Vec

In [8]:
def text_tokenize(texts):
    """Tokenize every text with ``word_tokenize``, showing a progress bar.

    Parameters
    ----------
    texts : iterable
        The texts to tokenize; each item is passed to ``word_tokenize``.

    Returns
    -------
    list
        One ``word_tokenize`` result per input text, in input order.
    """
    return [word_tokenize(document) for document in tqdm(texts)]

In [9]:
# Tokenize both corpora: first the messages containing emojis, then all messages
texts_with_emojis_sen, texts_all_sen = [
    text_tokenize(corpus) for corpus in (texts_with_emojis, texts_all)
]

100%|████████████████████████████████████████████████████████████████████████| 645846/645846 [02:09<00:00, 4968.19it/s]
100%|██████████████████████████████████████████████████████████████████████| 3448344/3448344 [09:54<00:00, 5800.22it/s]


In [10]:
# Sanity check: each tokenized corpus should be as long as its source list
for collection in (texts_all, texts_all_sen, texts_with_emojis, texts_with_emojis_sen):
    print(len(collection))

3448344
3448344
645846
645846


In [11]:
# Word2Vec trained only on the messages that contain at least one emoji
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=texts_with_emojis_sen, **w2v_params)

In [12]:
# Word2Vec trained on every successfully processed message
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
model2 = Word2Vec(sentences=texts_all_sen, **w2v_params)

In [59]:
# Pick the query emoji: position 40 (zero-based) of the frequency ranking,
# i.e. the 41st most frequent emoji rather than the single most frequent one
query_position = 40
top_emoji = top50emojis[query_position][0]
print(top_emoji)

🤡


In [62]:
# 20 nearest neighbours of `top_emoji` in the model trained on emoji messages
neighbours_emoji_model = model.wv.most_similar(top_emoji, topn=20)
print(neighbours_emoji_model)

[('😂😂😂😂', 0.777965247631073), ('🤣🤣', 0.7725412845611572), ('🤣🤣🤣', 0.7670748829841614), ('🤦\u200d♂️', 0.7639925479888916), ('🤣', 0.7631723284721375), ('🤣🤣🤣🤣', 0.7521170377731323), ('😂😂😂', 0.743286669254303), ('😂😂', 0.7352780699729919), ('😆', 0.7122071385383606), ('😂', 0.7058059573173523), ('😁😁😁', 0.705274760723114), ('😏', 0.6950556039810181), ('🤣🤣🤣🤣🤣', 0.6936535835266113), ('😀', 0.6832446455955505), ('🤦\u200d♀', 0.6764146685600281), ('🤦', 0.6757794618606567), ('🧐', 0.6744260191917419), ('😁', 0.6738578677177429), ('🤭', 0.6716986894607544), ('🤦🏻\u200d♀️', 0.6709883809089661)]


In [64]:
# 20 nearest neighbours of `top_emoji` in the model trained on all messages
neighbours_full_model = model2.wv.most_similar(top_emoji, topn=20)
print(neighbours_full_model)

[('😂😂😂😂', 0.7047255635261536), ('🤣', 0.6800478100776672), ('🤣🤣🤣', 0.6687999367713928), ('😂😂😂', 0.657667338848114), ('😂😂', 0.6532021760940552), ('🤣🤣', 0.6367610692977905), ('🤣🤣🤣🤣', 0.6323269009590149), ('😂', 0.6307872533798218), ('😂😂😂😂😂', 0.6270245313644409), ('диванный', 0.5938810110092163), ('😀', 0.591812014579773), ('😁😁😁', 0.5916918516159058), ('Батутный', 0.5884559750556946), ('😆', 0.5842610001564026), ('😄', 0.5828202962875366), ('😁', 0.5762217044830322), ('😂😂😂😂😂😂', 0.5689116716384888), ('🤡🤡', 0.5672885179519653), ('клоун', 0.5595901608467102), ('🤦\u200d♂️', 0.5557526350021362)]
