In [68]:
import crawler
import graph_builder
import re
import json
import warnings

import emoji
import wget
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx

# Silence every warning for the rest of the notebook (this also hides library deprecation notices).
warnings.filterwarnings('ignore')
# One-off download of the emoji category reference that a later cell reads from
# 'data/categories.json'; presumably left commented out so it is not re-fetched on every run.
# wget.download('https://raw.githubusercontent.com/chalda-pnuzig/emojis.json/refs/heads/master/src/categories.json', 'data/categories.json')
# Short aliases for the two project classes (the classes themselves, not instances).
hsc = crawler.HollyshopCrawling
gb = graph_builder.GraphBuilder

### Clear & Clean

In [69]:
# Load the crawled reviews, pull ':name:' emoji codes out of the review text and
# merge them into the existing 'emojis' column before saving the frame.
# NOTE(review): if read_reviews_from_csv() loads the same file that is written below,
# re-running this cell would append the in-text codes again — confirm.

df_hollyshop = hsc().read_reviews_from_csv()

emoji_code_pattern = re.compile(r'\:[a-zA-Z_0-9]+?\:')
review_text = df_hollyshop['review'].fillna('')
in_text_codes = review_text.apply(lambda text: ' '.join(emoji_code_pattern.findall(text)))

# Existing codes first, in-text codes second; strip the separator left over when either side is empty.
merged_codes = df_hollyshop['emojis'].fillna('') + ' ' + in_text_codes
df_hollyshop['emojis'] = merged_codes.str.strip()

df_hollyshop.to_csv('data/hollyshop_reviews.csv', index=False)

### Basic Visualization

In [70]:
# Flag the reviews that carry at least one emoji code and chart how many do / do not

df_hollyshop['is_emoji'] = df_hollyshop['emojis'] != ''

fig = plt.figure(figsize=(10, 6))
ax = sns.barplot(df_hollyshop.is_emoji.value_counts(), color='#990000')
ax.set_title('Number of Hollyshop Reviews Containing Emojis', fontsize=16)
ax.set_xlabel('Contains Emoji', fontsize=14)
ax.set_ylabel('Number of Reviews', fontsize=14)
# NOTE(review): the fixed 'No'/'Yes' labels assume the bar for False is drawn first —
# confirm against the rendered figure.
plt.xticks([0, 1], ['No', 'Yes'])
ax.bar_label(ax.containers[0], fmt='%d', fontsize=12)
fig.tight_layout()
fig.savefig('src/hollyshop_emoji_reviews_count.png', format='PNG')
plt.close(fig)

In [71]:
# Convert emoji strings to actual emojis

def emoji_convert(emoji_str: str) -> list[str]:
    '''Convert emoji strings to actual emojis.

    Every code is looked up with ``emoji.emojize(..., language='alias')``. A code is
    first tried exactly as written, so aliases that contain a hyphen (such as
    ``:-1:``, which occurs in this dataset) are not rewritten before the lookup.
    Only when that lookup leaves the code unchanged is the hyphen-to-underscore
    spelling tried (e.g. ``:woman-shrugging:`` -> ``:woman_shrugging:``).

    Args:
        emoji_str (str): A string containing emoji codes separated by whitespace.
    Returns:
        list[str]: A list of actual emojis, one per code, in input order. The
            ':https:' token (a URL fragment, not an emoji) and empty tokens are
            skipped. A code that cannot be resolved is returned as text, with
            hyphens replaced by underscores.
    '''
    converted_emojis = []
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so no empty token is ever passed on as if it were an emoji.
    for code in emoji_str.split():
        if code == ':https:':
            continue
        if code == ':smiling_face_with_3_hearts:':
            # Map this spelling onto the alias the lookup is known to accept.
            code = ':smiling_face_with_hearts:'
        glyph = emoji.emojize(code, language='alias')
        if glyph == code and '-' in code:
            # Not resolved as written: retry with the underscore spelling.
            glyph = emoji.emojize(code.replace('-', '_'), language='alias')
        converted_emojis.append(glyph)
    return converted_emojis

# Per-review list of converted emojis; reviews whose 'emojis' string is empty get an empty list.
df_hollyshop['emojis_converted'] = df_hollyshop['emojis'].apply(emoji_convert)

In [72]:
# Tally every converted emoji across the emoji-bearing reviews and persist the 15 most common

emojis_number = (
    df_hollyshop.loc[df_hollyshop['is_emoji'], 'emojis_converted']
    .explode()          # one row per emoji occurrence
    .value_counts()
    .reset_index()
)
emojis_number_top_15 = emojis_number.iloc[:15]
emojis_number_top_15.to_csv('data/hollyshop_top_15_emojis.csv', index=False)

In [73]:
# Chart the 15 most frequent emoji codes (textual ':name:' form) across emoji-bearing reviews.
# Note: this rebinds emojis_number to counts keyed by the textual code rather than the glyph.

emojis_number = (
    df_hollyshop.loc[df_hollyshop['is_emoji'], 'emojis']
    .str.split(' ')
    .explode()
    .value_counts()
    .reset_index()
)
emojis_number_top_15 = emojis_number.iloc[:15]

fig = plt.figure(figsize=(12, 6))
ax = sns.barplot(data=emojis_number_top_15, x='emojis', y='count', color='#990000')
ax.set_title('Top 15 Most Frequent Emojis in Hollyshop Reviews', fontsize=16)
ax.set_xlabel('Emoji', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(rotation=90)
ax.bar_label(ax.containers[0], fmt='%d', fontsize=8)
fig.tight_layout()
fig.savefig('src/top_15_emojis_hollyshop.png', format='PNG')
plt.close(fig)

### (Sub)Categories

In [74]:
# Build a lookup from emoji glyph to [category, subcategory] out of the reference JSON

with open('data/categories.json', 'r', encoding='utf-8') as f:
    categories_data = json.load(f)

emoji_to_category = {}

# The 'emojis' object is nested as category -> subcategory -> list of emoji entries.
for category, subcategories in categories_data['emojis'].items():
    for subcategory, emoji_entries in subcategories.items():
        for emoji_entry in emoji_entries:
            emoji_to_category[emoji_entry['emoji']] = [category, subcategory]

In [75]:
# Assign a category and subcategory to every emoji code, drop the codes that cannot be
# classified, and save the result.
#
# Classification is computed in one pass and the frame is filtered once afterwards,
# instead of writing single cells and dropping rows from the frame while iterating over it.
# The 'category' / 'subcategory' columns therefore always exist, even if nothing matches.

# Ordered fallback rules, applied to the textual code when its converted glyph is not a key
# of emoji_to_category. The first matching rule wins.
# NOTE(review): most rules are substring tests, so e.g. 'tm' also matches any other
# unresolved code that merely contains those letters — confirm this is intended.
_FALLBACK_CATEGORY_RULES = [
    (lambda code: 'heart' in code, ('Smileys & Emotion', 'heart')),
    (lambda code: 'relaxed' in code, ('Smileys & Emotion', 'face-smiling')),
    (lambda code: 'shamrock' in code, ('Animals & Nature', 'plant-other')),
    (lambda code: 'check' in code or 'tm' in code, ('Symbols', 'other-symbol')),
    (lambda code: 'frowning' in code, ('Smileys & Emotion', 'face-concerned')),
    (lambda code: code == ':v:', ('People & Body', 'hand-fingers-partial')),
    (lambda code: 'plane' in code, ('Travel & Places', 'transport-air')),
]


def _categorize_emoji(code, glyph):
    '''Return (category, subcategory) for one emoji, or None if it cannot be classified.

    Args:
        code: The textual emoji code (value of the 'emojis' column).
        glyph: The converted emoji (value of the 'emojis_converted' column).
    '''
    if glyph in emoji_to_category:
        entry = emoji_to_category[glyph]
        return entry[0], entry[1]
    for matches, labels in _FALLBACK_CATEGORY_RULES:
        if matches(code):
            return labels
    return None


emojis_number['emojis_converted'] = emojis_number['emojis'].apply(emoji_convert).str.join(' ')

emoji_labels = [
    _categorize_emoji(code, glyph)
    for code, glyph in zip(emojis_number['emojis'], emojis_number['emojis_converted'])
]
is_classified = [labels is not None for labels in emoji_labels]

# Keep only the classified rows (original index labels are preserved), then attach the labels.
emojis_number = emojis_number.loc[is_classified].copy()
emojis_number['category'] = [labels[0] for labels in emoji_labels if labels is not None]
emojis_number['subcategory'] = [labels[1] for labels in emoji_labels if labels is not None]

emojis_number.to_csv('data/hollyshop_emojis_with_categories.csv', index=False)

In [76]:
# Chart total emoji frequency per category, most frequent first
# (every category present in emojis_number is drawn; no cut-off is applied here)

categories_number = (
    emojis_number.groupby('category')['count']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

fig = plt.figure(figsize=(12, 6))
ax = sns.barplot(data=categories_number, x='category', y='count', color='#990000')
ax.set_title('Top 15 Most Frequent Emoji Categories in Hollyshop Reviews', fontsize=16)
ax.set_xlabel('Emoji Category', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(rotation=90)
ax.bar_label(ax.containers[0], fmt='%d', fontsize=8)
fig.tight_layout()
fig.savefig('src/emoji_categories_hollyshop.png', format='PNG')
plt.close(fig)

In [77]:
# Show every emoji whose category is neither 'Smileys & Emotion' nor 'People & Body'

dominant_categories = ['Smileys & Emotion', 'People & Body']
is_rare_category = ~emojis_number['category'].isin(dominant_categories)
emojis_number.loc[is_rare_category].reset_index(drop=True)

Unnamed: 0,emojis,count,emojis_converted,category,subcategory
0,:fire:,36,🔥,Travel & Places,sky & weather
1,:shamrock:,16,☘️,Animals & Nature,plant-other
2,:heavy_check_mark:,16,✔️,Symbols,other-symbol
3,:sparkles:,13,✨,Activities,event
4,:cherry_blossom:,11,🌸,Animals & Nature,plant-flower
5,:rose:,8,🌹,Animals & Nature,plant-flower
6,:star2:,6,🌟,Travel & Places,sky & weather
7,:hibiscus:,4,🌺,Animals & Nature,plant-flower
8,:tulip:,3,🌷,Animals & Nature,plant-flower
9,:butterfly:,2,🦋,Animals & Nature,animal-bug


In [78]:
# Show only the emojis classified under 'People & Body'

is_people_body = emojis_number['category'] == 'People & Body'
emojis_number.loc[is_people_body].reset_index(drop=True)

Unnamed: 0,emojis,count,emojis_converted,category,subcategory
0,:+1:,190,👍,People & Body,hand-fingers-closed
1,:heart_hands:,28,🫶,People & Body,hands
2,:ok_hand:,23,👌,People & Body,hand-fingers-partial
3,:v:,16,✌️,People & Body,hand-fingers-partial
4,:pray:,15,🙏,People & Body,hands
5,:raised_hands:,14,🙌,People & Body,hands
6,:hand_with_index_finger_and_thumb_crossed:,11,🫰,People & Body,hand-fingers-partial
7,:pinched_fingers:,9,🤌,People & Body,hand-fingers-partial
8,:woman-shrugging:,8,🤷‍♀️,People & Body,person-gesture
9,:-1:,4,👎,People & Body,hand-fingers-closed


In [79]:
# Chart the 15 subcategories with the highest total emoji frequency

subcategories_number = (
    emojis_number.groupby('subcategory')['count']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:15]
)

fig = plt.figure(figsize=(12, 6))
ax = sns.barplot(data=subcategories_number, x='subcategory', y='count', color='#990000')
ax.set_title('Top 15 Most Frequent Emoji Subcategories in Hollyshop Reviews', fontsize=16)
ax.set_xlabel('Emoji Subcategory', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(rotation=90)
ax.bar_label(ax.containers[0], fmt='%d', fontsize=8)
fig.tight_layout()
fig.savefig('src/top_15_emoji_subcategories_hollyshop.png', format='PNG')
plt.close(fig)

### Co-occurrence

In [80]:
# Plot the top 10 most frequent numbers of emojis in reviews

# Count the codes of each review with the element-wise .str.len() accessor.
# Series.agg(len) only produced per-review lengths through a deprecated element-wise
# fallback (its warning is hidden by the global warnings filter); under the announced
# behaviour it would return a single integer and break the .value_counts() call.
n_emojis = df_hollyshop[df_hollyshop.is_emoji]['emojis'].str.split(' ').str.len().value_counts()
# After reset_index() the 'emojis' column holds the number of emojis per review and
# 'count' holds how many reviews have that number.
n_emojis = n_emojis.head(10).reset_index().sort_values(by='count', ascending=False)

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=n_emojis, x='emojis', y='count', color='#990000')
ax.set_xlabel('Number of Emojis', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_title('Top 10 Most Frequent Numbers of Emojis in Hollyshop Reviews', fontsize=16)
plt.xticks(rotation=90)
plt.bar_label(ax.containers[0], fmt='%d', fontsize=8)
plt.tight_layout()
plt.savefig('src/top_10_number_of_emojis_hollyshop.png', format='PNG')
plt.close()

In [81]:
# Build the emoji co-occurrence graph from the reviews and render it to a PNG

G = gb(df_hollyshop).build_graph()

fig = plt.figure(figsize=(20, 20))

# Fixed seed keeps the spring layout reproducible between runs.
pos = nx.spring_layout(G, k=0.5, seed=42)

# Node area scales with the node's 'frequency' attribute (1 when absent);
# edge width scales with the edge's 'weight' attribute.
node_sizes = []
for node in G.nodes:
    node_sizes.append(G.nodes[node].get("frequency", 1) * 30)
edge_widths = []
for source, target in G.edges:
    edge_widths.append(G[source][target]["weight"] * 0.3)

nx.draw_networkx_nodes(G, pos, node_size=node_sizes, alpha=0.8, node_color="#990000")
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.5, edge_color="#990000")
nx.draw_networkx_labels(G, pos, font_size=7)

plt.axis("off")
plt.title("Emoji Co-occurrence Graph in Hollyshop Reviews", fontsize=20)
fig.tight_layout()
fig.savefig("src/emoji_graph.png", format="PNG")
plt.close(fig)

### Sentiment & Emojis

In [82]:
# Chart how many reviews were given each rating value

reviews_per_rating = df_hollyshop.groupby('rating').size()
df_hollyshop_rating_counts = reviews_per_rating.reset_index(name='count')

fig = plt.figure(figsize=(10, 6))

ax = sns.barplot(df_hollyshop_rating_counts, x='rating', y='count', color='#990000')
ax.set_title('Number of Reviews by Rate in Hollyshop Reviews', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Number of Reviews', fontsize=14)
ax.bar_label(ax.containers[0], fmt='%d', fontsize=8)
fig.tight_layout()
fig.savefig('src/hollyshop_rating_reviews_count.png', format='PNG')
plt.close(fig)

In [83]:
# Label each review by sentiment (rating >= 4 is positive) and split the frame accordingly.
# A review whose rating satisfies neither comparison lands in neither subset.

rating = df_hollyshop['rating']
df_hollyshop['is_positive'] = rating >= 4
df_hollyshop_positive = df_hollyshop.loc[rating >= 4]
df_hollyshop_negative = df_hollyshop.loc[rating <= 3]

In [None]:
# Grouped bar chart: review counts per sentiment, split by whether the review has emojis

counts_pos_emoji = (
    df_hollyshop.groupby(['is_positive', 'is_emoji'])
    .size()
    .reset_index(name='count')
)

# Human-readable labels for the two boolean flags.
sentiment_names = {True: 'Positive', False: 'Negative'}
emoji_names = {True: 'Has emoji(s)', False: 'No emoji(s)'}
counts_pos_emoji['sentiment'] = counts_pos_emoji['is_positive'].map(sentiment_names)
counts_pos_emoji['emoji_label'] = counts_pos_emoji['is_emoji'].map(emoji_names)

fig = plt.figure(figsize=(8, 6))

ax = sns.barplot(data=counts_pos_emoji, x='sentiment', y='count', hue='emoji_label', palette=['#990000', '#808000'])

# One container per hue level; label the bars of each.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', label_type='edge', fontsize=10)

ax.set_title('Reviews with/without Emojis by Sentiment', fontsize=14)
ax.set_xlabel('')
ax.set_ylabel('Number of reviews', fontsize=12)
ax.legend(title='Emoji presence')
fig.tight_layout()
fig.savefig('src/emoji_by_sentiment.png', format='PNG')
plt.close(fig)

In [None]:
# For each sentiment subset: save the reviews, save its 15 most frequent converted emojis,
# and chart its 15 most frequent emoji codes

sentiment_frames = {'positive': df_hollyshop_positive, 'negative': df_hollyshop_negative}

for sentiment, df in sentiment_frames.items():
    df.to_csv(f'data/hollyshop_{sentiment}_reviews.csv', index=False)
    reviews_with_emoji = df[df['is_emoji']]

    # Converted-emoji counts go to CSV.
    emojis_number_sent = reviews_with_emoji['emojis_converted'].explode().value_counts()
    emojis_number_top_15_sent = emojis_number_sent.iloc[:15].reset_index()
    emojis_number_top_15_sent.to_csv(f'data/hollyshop_top_15_emojis_{sentiment}.csv', index=False)

    # Textual-code counts feed the bar chart.
    emojis_number_sent = (
        reviews_with_emoji['emojis']
        .str.split(' ')
        .explode()
        .value_counts()
        .reset_index()
    )
    emojis_number_top_15_sent = emojis_number_sent.iloc[:15]

    fig = plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=emojis_number_top_15_sent, x='emojis', y='count', color='#990000')
    ax.set_title(f'Top 15 Most Frequent Emojis in Hollyshop {sentiment.capitalize()} Reviews', fontsize=16)
    ax.set_xlabel('Emoji', fontsize=14)
    ax.set_ylabel('Frequency', fontsize=14)
    plt.xticks(rotation=90)
    ax.bar_label(ax.containers[0], fmt='%d', fontsize=8)
    fig.tight_layout()
    fig.savefig(f'src/top_15_emojis_hollyshop_{sentiment}.png', format='PNG')
    plt.close(fig)

In [26]:
# Which emojis from the negative top 15 also appear in the overall top 15?

top_15_emojis = pd.read_csv('data/hollyshop_top_15_emojis.csv')
top_15_emojis_positive = pd.read_csv('data/hollyshop_top_15_emojis_positive.csv')
top_15_emojis_negative = pd.read_csv('data/hollyshop_top_15_emojis_negative.csv')

overall_top_glyphs = top_15_emojis['emojis_converted']
also_in_overall_top = top_15_emojis_negative['emojis_converted'].isin(overall_top_glyphs)
top_15_emojis_negative.loc[also_in_overall_top].reset_index(drop=True)

Unnamed: 0,emojis_converted,count
0,😅,4
1,😭,4
2,😊,1


In [97]:

# Count the reviews containing at least one emoji from the negative top 15, and report how
# they split by the is_positive flag.
top_emojis = set(top_15_emojis_negative['emojis_converted'].astype(str))

df_score = df_hollyshop.copy()

# True when the review's converted emojis share at least one element with top_emojis.
df_score['has_top_negative_emoji'] = df_score['emojis_converted'].apply(
    lambda emojis: not top_emojis.isdisjoint(emojis)
)

matched = df_score[df_score['has_top_negative_emoji']]
total = len(matched)
positive = matched[matched['is_positive']].shape[0]
negative = matched[~matched['is_positive']].shape[0]

print(f"Total reviews with at least one top-15 negative emoji: {total}")
print(f"Negative (rating <= 3): {negative}")
print(f"Positive (rating >= 4): {positive}")

Total reviews with at least one top-15 negative emoji: 121
Negative (rating <= 3): 26
Positive (rating >= 4): 95
