In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from os import walk
import seaborn as sns
sns.set()

# Data Import and overview

In [2]:
# Folder whose files are read by essemble_dataset; the path is relative to the notebook's working directory.
DATASETS_PATH = "../../data/processed_tweets/"

In [3]:
def essemble_dataset(folder_path):
    """Read every file directly inside ``folder_path`` as CSV and stack them into one DataFrame.

    Each file is sorted by its ``timestamp`` column (ascending), has its
    ``index`` and ``Unnamed: 0`` columns removed, and is re-indexed from 0
    before being stacked, so the index of the returned frame restarts at 0
    for every file.

    Args:
        folder_path: Directory to read. Only its top level is scanned;
            files in sub-directories are ignored.

    Returns:
        pandas.DataFrame holding the rows of all files, with files taken in
        alphabetical filename order. An empty DataFrame when the folder does
        not exist or contains no files.

    Raises:
        KeyError: If a file lacks the ``timestamp``, ``index`` or
            ``Unnamed: 0`` column.
    """
    # The first tuple yielded by walk() describes the top level and its third
    # item lists the file names; the default covers a missing folder.
    filenames = next(walk(folder_path), (None, None, []))[2]

    frames = []
    # walk() returns names in arbitrary, OS-dependent order; sort so the row
    # order of the result is reproducible between runs and machines.
    for filename in sorted(filenames):
        frame = pd.read_csv(folder_path + '/' + filename, sep=",")
        frame = frame.sort_values(by='timestamp', ascending=True)
        frame = frame.drop(columns=['index', 'Unnamed: 0'])
        # reset_index replaces the former DataFrame.from_records(frame) call,
        # which is deprecated for DataFrame input but had the same effect.
        frames.append(frame.reset_index(drop=True))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)

In [4]:
# Load and concatenate every file found in DATASETS_PATH.
df = essemble_dataset(DATASETS_PATH)

In [5]:
# Sanity check: number of loaded rows per year.
df['year'].value_counts()

2021    533375
2019    531272
2020    531227
Name: year, dtype: int64

In [11]:
# NOTE(review): the recorded listing has no 'topics_cleaned' or 'hashtags' column, yet later
# cells select both from df — confirm which dataset version this notebook expects.
df.columns

Index(['tweet_id', 'text', 'timestamp', 'user_id', 'like_count',
       'retweet_count', 'quote_count', 'reply_count', 'reach', 'topics_ids',
       'topics', 'sentiment', 'popularity', 'followers', 'following',
       'tweet_count', 'verified', 'created_at', 'year', 'month', 'day_of_week',
       'day_phase', 'week_idx', 'day_phase_enc', 'day_of_week_enc',
       'month_enc', 'year_enc', 'sentiment_enc', 'verified_enc', 'seniority'],
      dtype='object')

## Tweets performance

In [10]:
# NOTE(review): the recorded run of this cell failed with KeyError: 'topics_cleaned'; that column
# is not created anywhere in the visible notebook — confirm where it is supposed to come from.
# [1:] skips the first unique value — presumably NaN for tweets without a topic; TODO confirm.
topics_categories = df['topics_cleaned'].unique()[1:]
# Category lists below are passed as `cats_sort` to retweets_likes_info_by_year.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
week_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_phases = ['Morning', 'Afternoon', 'Dusk', 'Night', 'Middle of the night']
# Not referenced by any other cell visible in this notebook.
day_phases_old = ['Dawn', 'Morning', 'Afternoon', 'Evening', 'Night']
sentiments = ['Negative', 'Neutral', 'Positive']
hashtags = [True, False]
# Passed as `offline` to multiple_analysis_chart; when True each chart is also written to an HTML file.
offline_graphs = True

KeyError: 'topics_cleaned'

In [9]:
def _pct_with_positive(dfy, cols, metric, totals):
    """Return, per group of ``cols``, the percentage of rows whose ``metric`` is > 0 (2 decimals)."""
    positives = dfy[dfy[metric] > 0].groupby(cols).size()
    # Align on the group keys rather than on row position; groups that have
    # no positive row are absent from `positives` and come out as NaN.
    return (positives.reindex(totals.index) / totals * 100).round(2)


def retweets_likes_info_by_year(source_df, cols, cats_sort):
    """Summarise tweet engagement per year for the groups defined by ``cols``.

    For every distinct ``year`` in ``source_df`` (ascending) the rows are
    grouped by ``cols`` and, for each group, the function reports:

    * ``"count <cols[0]>"``  - number of rows in the group,
    * ``"% <cols[0]>"``      - the group's share of that year's rows (unrounded),
    * ``"% with retweets"``  - % of rows with ``retweet_count > 0`` (2 decimals),
    * ``"% with likes"``     - % of rows with ``like_count > 0`` (2 decimals),
    * ``"retweets mean"`` / ``"likes mean"`` - means rounded to 2 decimals.

    Args:
        source_df: DataFrame with ``year``, ``retweet_count``, ``like_count``
            and every column named in ``cols``.
        cols: List with one or two grouping column names.
        cats_sort: Only used when ``cols`` has one entry: the categories of
            ``cols[0]`` in the desired row order. Categories listed here but
            absent from a year yield a row of NaN. With two grouping columns
            rows are returned in groupby key order and this argument is not
            needed.

    Returns:
        DataFrame with columns ``year``, the grouping column(s) and the
        metrics above, one block of rows per year (the index restarts at 0
        for every year). Percentages are NaN for groups without any
        retweeted / liked row. An empty frame with those columns is returned
        when ``source_df`` has no rows.
    """
    count_col = "count " + cols[0]
    out_cols = (['year'] + list(cols[:2])
                + [count_col, '% ' + cols[0], '% with retweets', '% with likes', 'retweets mean', 'likes mean'])

    yearly = []
    for y in np.sort(source_df['year'].unique()):
        dfy = source_df[source_df['year'] == y]
        stats = dfy.groupby(cols).agg(**{
            count_col: pd.NamedAgg(column=cols[0], aggfunc="count"),
            "retweets mean": pd.NamedAgg(column="retweet_count", aggfunc="mean"),
            "likes mean": pd.NamedAgg(column="like_count", aggfunc="mean"),
        }).round(2)

        # Computed while `stats` is still indexed by the group keys, so every
        # percentage lands on its own group. (Previously an inner merge
        # dropped groups without retweets/likes and the result was assigned
        # back by position, shifting the values onto the wrong rows.)
        stats['% with retweets'] = _pct_with_positive(dfy, cols, 'retweet_count', stats[count_col])
        stats['% with likes'] = _pct_with_positive(dfy, cols, 'like_count', stats[count_col])

        if len(cols) == 1:
            # Impose the caller's category order; rename_axis keeps the key
            # column name even if cats_sort is an unnamed pandas Index.
            stats = stats.reindex(cats_sort).rename_axis(cols[0])
        stats = stats.reset_index()

        # A scalar broadcast works for any number of rows, including when
        # cats_sort does not list every category present in the data.
        stats['year'] = y
        stats['% ' + cols[0]] = (stats[count_col] / stats[count_col].sum()) * 100
        yearly.append(stats)

    if not yearly:
        return pd.DataFrame(columns=out_cols)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(yearly)[out_cols]

In [71]:
def analysis_chart(df, x_col, y_bar, y_line, x_name, y_bar_name, y_line_name, plot_title):
    """Show one bar trace and one line trace per year on a shared x-axis.

    Bars (``y_bar``) are drawn against the primary y-axis and lines
    (``y_line``) against a secondary y-axis. The figure is displayed with
    ``fig.show()``; nothing is returned.

    Args:
        df: DataFrame with a ``year`` column plus ``x_col``, ``y_bar`` and ``y_line``.
        x_col: Column used for the x-axis categories.
        y_bar: Column plotted as bars; its values are also printed inside the bars.
        y_line: Column plotted as a line on the secondary y-axis.
        x_name: x-axis title.
        y_bar_name: Primary y-axis title and prefix of the bar legend entries.
        y_line_name: Secondary y-axis title and prefix of the line legend entries.
        plot_title: Figure title.
    """
    palette = [
        'rgb(136, 204, 238)', 'rgb(204, 102, 119)', 'rgb(221, 204, 119)', 'rgb(51, 34, 136)',
        '#D62728', '#FF9900', 'rgb(170, 68,            153)', 'rgb(68, 170, 153)',
        'rgb(153, 153, 51)', 'rgb(136, 34, 85)', 'rgb(102, 17, 0)', 'rgb(136, 136, 136)',
    ]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    years = np.sort(df['year'].unique())
    years_count = len(years)

    for i, year in enumerate(years):
        dfy = df[df['year'] == year]
        # Bars take the first `years_count` colours and lines the following
        # ones; the modulo wraps around instead of raising IndexError when
        # there are more years than the palette can serve.
        bar_color = palette[i % len(palette)]
        line_color = palette[(i + years_count) % len(palette)]

        fig.add_trace(
            go.Bar(x=dfy[x_col], y=dfy[y_bar], text=[str(v) for v in dfy[y_bar].tolist()],
                   name=y_bar_name + ' ' + str(year), marker_color=bar_color, width=0.28,
                   textposition='inside'),
            secondary_y=False)
        fig.add_trace(
            go.Scatter(x=dfy[x_col], y=dfy[y_line], name=y_line_name + ' ' + str(year),
                       marker_color=line_color),
            secondary_y=True)

    fig.update_yaxes(title_text=y_line_name, secondary_y=True)
    fig.update_yaxes(title_text=y_bar_name, secondary_y=False)
    fig.update_layout(title_text=plot_title, width=900, height=500)
    fig.update_xaxes(title_text=x_name)
    fig.show()

In [97]:
def multiple_analysis_chart(df, x, y, color, text, title, x_title, y_title, offline):
    """Show a grouped bar chart and optionally save it as an HTML file.

    Args:
        df: DataFrame to plot; must contain a ``year`` column. It is not modified.
        x: Column for the x-axis.
        y: Column for the bar heights.
        color: Column that decides the bar colour.
        text: Column whose values are printed on the bars.
        title: Figure title; also used as the HTML file name when ``offline`` is true.
        x_title: x-axis title.
        y_title: y-axis title.
        offline: When truthy, also write the figure to
            ``../../data/charts/<title>.html`` via ``plotly.offline.plot``.
    """
    # Cast `year` to str on a copy. The previous in-place cast silently
    # changed the caller's DataFrame as a side effect of plotting.
    plot_df = df.assign(year=df['year'].astype(str))

    fig = px.bar(plot_df, x=x, y=y, color=color, text=text, title=title, width=900, height=500,
                 barmode="group", color_discrete_sequence=px.colors.qualitative.Safe)
    fig.update_xaxes(title_text=x_title)
    fig.update_yaxes(title_text=y_title)
    fig.show()
    if offline:
        plotly.offline.plot(fig, filename='../../data/charts/' + title + '.html')

In [72]:
# Subset of columns used by the performance analysis below.
# NOTE(review): 'hashtags' and 'topics_cleaned' do not appear in the df.columns listing recorded
# earlier in this notebook — confirm they exist in df before this cell runs.
tweet_analysis = df[['text', 'year', 'day_phase', 'day_of_week', 'month', 'retweet_count', 'quote_count', 'like_count', 'reply_count', 'sentiment', 'hashtags', 'topics_cleaned']]

### Average retweet and like count per phase of the day

In [73]:
# Per-year engagement stats by day phase; rows follow the order of `day_phases`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_phase'], day_phases)
df_analysis

Unnamed: 0,year,day_phase,count day_phase,% day_phase,% with retweets,% with likes,retweets mean,likes mean
0,2019,Morning,105912,19.935551,14.37,44.76,0.41,2.45
1,2019,Afternoon,106273,20.003501,17.78,48.18,0.56,3.4
2,2019,Dusk,106415,20.030229,18.12,50.0,0.56,3.62
3,2019,Night,106312,20.010842,17.73,49.82,0.54,3.64
4,2019,Middle of the night,106360,20.019877,17.32,50.74,0.52,3.68
0,2020,Morning,106274,19.866453,14.31,45.62,0.71,4.49
1,2020,Afternoon,107278,20.054137,17.34,48.07,1.81,10.88
2,2020,Dusk,107116,20.023853,17.36,50.25,1.33,7.87
3,2020,Night,107195,20.038621,17.17,50.73,2.2,11.83
4,2020,Middle of the night,107079,20.016936,16.83,51.44,1.61,9.29


In [74]:
# In each chart the first metric is drawn as bars (primary y-axis) and the second as a line (secondary y-axis).
analysis_chart(df_analysis, 'day_phase', '% with retweets', '% with likes', 'Day phase', '% with retweets', '% with likes', 'Percentage of retweets and likes during the day')
analysis_chart(df_analysis, 'day_phase', 'retweets mean', 'likes mean', 'Day phase', 'Retweets mean', 'Likes mean', 'Average retweets and likes during the day')

### Average retweet and like count during the week

In [17]:
# Per-year engagement stats by weekday; rows follow the order of `week_days`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_of_week'], week_days)
df_analysis

Unnamed: 0,year,day_of_week,count day_of_week,% day_of_week,% with retweets,% with >5 likes,retweets mean,likes mean
0,2020,Monday,75963,14.200231,16.76,49.12,1.0,6.54
1,2020,Tuesday,75952,14.198175,16.88,48.73,2.72,15.23
2,2020,Wednesday,77453,14.478766,16.71,49.2,1.39,7.91
3,2020,Thursday,77487,14.485122,16.54,48.94,1.37,7.84
4,2020,Friday,76047,14.215934,16.73,49.6,1.54,8.34
5,2020,Saturday,76020,14.210886,16.49,49.79,0.96,6.96
6,2020,Sunday,76020,14.210886,16.13,49.22,1.75,9.38
0,2021,Monday,76477,14.229735,16.66,49.68,1.19,9.38
1,2021,Tuesday,76504,14.234759,16.8,49.95,1.78,11.17
2,2021,Wednesday,76592,14.251133,16.61,50.14,2.52,12.11


In [18]:
# First chart: percentage of tweets with retweets (bars) vs. with likes (line); second chart: the means.
analysis_chart(df_analysis, 'day_of_week', '% with retweets', '% with likes', 'Weekday', '% with retweets',
               '% with likes', 'Percentage of retweets and likes during the week')
analysis_chart(df_analysis, 'day_of_week', 'retweets mean', 'likes mean', 'Weekday', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the week')

### Average retweet and like count per month

In [19]:
# Per-year engagement stats by month; rows follow the order of `months`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['month'], months)
df_analysis

Unnamed: 0,year,month,count month,% month,% with retweets,% with >5 likes,retweets mean,likes mean
0,2020,January,45441,8.494566,17.9,50.7,1.51,9.03
1,2020,February,42575,7.958807,18.52,51.29,1.25,7.91
2,2020,March,45494,8.504473,18.07,51.71,1.66,8.91
3,2020,April,44009,8.226873,17.19,50.81,0.99,6.37
4,2020,May,45376,8.482415,17.34,50.68,3.48,17.77
5,2020,June,43815,8.190608,18.09,48.83,1.64,10.18
6,2020,July,45297,8.467647,16.94,48.91,1.38,8.63
7,2020,August,45165,8.442971,16.3,48.34,1.8,9.34
8,2020,September,43789,8.185747,16.0,47.11,2.0,9.58
9,2020,October,45245,8.457926,15.71,47.17,1.32,6.93


In [20]:
# First chart: percentage of tweets with retweets (bars) vs. with likes (line); second chart: the means.
analysis_chart(df_analysis, 'month', '% with retweets', '% with likes', 'Month', '% with retweets',
               '% with likes', 'Percentage of retweets and likes during the year')
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Month', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the year')

### Tweets performance by sentiment

In [21]:
# Per-year engagement stats by sentiment; rows follow the order of `sentiments`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['sentiment'], sentiments)
df_analysis

Unnamed: 0,year,sentiment,count sentiment,% sentiment,% with retweets,% with >5 likes,retweets mean,likes mean
0,2020,Negative,131949,24.666039,16.11,48.4,1.14,6.37
1,2020,Neutral,166992,31.216842,14.59,44.09,1.31,8.08
2,2020,Positive,236001,44.117119,18.31,53.33,1.9,10.84
0,2021,Negative,122139,22.72586,15.65,49.02,2.28,13.5
1,2021,Neutral,182420,33.942078,13.91,43.82,0.95,8.05
2,2021,Positive,232886,43.332062,19.03,55.91,1.61,12.04
0,2019,Negative,251978,23.714594,16.18,46.8,0.52,2.9
1,2019,Neutral,327442,30.816794,15.18,44.15,0.42,2.8
2,2019,Positive,483124,45.468611,18.8,52.78,0.59,3.98


In [22]:
# The first chart plots the '% with retweets' / '% with likes' columns, so its axis and legend
# labels name them as percentages (they were previously labelled as counts).
analysis_chart(df_analysis, 'sentiment', '% with retweets', '% with likes', 'Sentiment', '% with retweets',
               '% with likes', 'Percentage of retweets and likes by sentiment')
analysis_chart(df_analysis, 'sentiment', 'retweets mean', 'likes mean', 'Sentiment', 'Retweets mean', 'Likes mean',
               'Average retweets and likes number by sentiment')

### Tweets performance by topics

In [76]:
# Keep only tweets that have a topic (copy, so the subset is independent of tweet_analysis),
# then show each topic's share of those tweets as a percentage string.
topic_analysis = tweet_analysis[tweet_analysis['topics_cleaned'].notnull()].copy()
topic_analysis['topics_cleaned'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

Person                  20.8%
TV and Movies           18.4%
Brand                   16.4%
Interest and Hobbies    14.5%
Entities                10.3%
Other                    6.5%
Sport                    4.1%
News                     4.0%
Holiday                  1.8%
Video Game               1.2%
Music                    1.0%
Political                0.8%
Book                     0.2%
Name: topics_cleaned, dtype: object

#### Performance of each topic in retweets and likes

In [28]:
# Per-year engagement stats by topic; rows follow the order of `topics_categories`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['topics_cleaned'], topics_categories)
df_analysis

Unnamed: 0,year,topics_cleaned,count topics_cleaned,% topics_cleaned,% with retweets,% with >5 likes,retweets mean,likes mean
0,2019,Holiday,2.0,0.000515,,100.0,0.0,13.0
1,2019,Brand,59608.0,15.363916,16.23,48.83,0.5,3.17
2,2019,Person,73362.0,18.908999,15.45,44.01,0.58,3.3
3,2019,Interest and Hobbies,53902.0,13.893199,15.07,48.49,0.42,3.15
4,2019,Sport,14652.0,3.776542,22.17,59.1,0.66,5.54
5,2019,TV and Movies,68948.0,17.771294,15.33,51.57,0.47,3.91
6,2019,Other,64646.0,16.662457,16.14,46.38,0.52,3.3
7,2019,Video Game,3970.0,1.023264,16.07,55.01,0.4,3.83
8,2019,Entities,41540.0,10.706903,15.13,50.12,0.43,3.21
9,2019,Political,2856.0,0.736132,14.99,31.86,1.05,2.02


In [29]:
# The first chart plots the '% with retweets' / '% with likes' columns, so its axis and legend
# labels name them as percentages (they were previously labelled as counts).
analysis_chart(df_analysis, 'topics_cleaned', '% with retweets', '% with likes', 'Topics', '% with retweets',
               '% with likes', 'Percentage of retweets and likes by topic')
analysis_chart(df_analysis, 'topics_cleaned', 'retweets mean', 'likes mean', 'Topics', 'Retweets mean', 'Likes mean',
               'Average tweets performance by topic')

#### Percentage of tweets with retweets per topic during the day

In [44]:
# Two grouping columns: one row per (day phase, topic) pair and year.
# Rows come back in groupby key order, not in the order of `day_phases`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_phase', 'topics_cleaned'], day_phases)
df_analysis

Unnamed: 0,year,day_phase,topics_cleaned,count day_phase,% day_phase,% with retweets,% with likes,retweets mean,likes mean
0,2019,Afternoon,Book,32,0.016496,37.50,59.38,1.12,8.09
1,2019,Afternoon,Brand,6028,3.107425,17.47,48.59,0.49,3.13
2,2019,Afternoon,Entities,4331,2.232624,16.37,50.13,0.50,3.28
3,2019,Afternoon,Interest and Hobbies,5915,3.049173,16.04,48.45,0.49,3.34
4,2019,Afternoon,Music,383,0.197436,21.41,48.56,0.48,2.92
...,...,...,...,...,...,...,...,...,...
60,2021,Night,Person,6554,3.728885,18.19,51.36,1.97,13.57
61,2021,Night,Political,307,0.174667,22.15,46.58,1.97,13.71
62,2021,Night,Sport,2027,1.153258,24.57,62.75,0.78,9.60
63,2021,Night,TV and Movies,8739,4.972036,16.82,56.59,1.09,12.34


In [45]:
# x = day phase, bar height = % with retweets, bar colour = year, bar label = topic.
multiple_analysis_chart(df_analysis, "day_phase", "% with retweets", "year", "topics_cleaned", "Percentage of retweets by topic during the day",
                            "Day phase", "% with retweets", offline_graphs)

'../../data/charts/Average retweet count per topic during the day in 2020.html'

#### Percentage of tweets with retweets per topic during the week

In [92]:
# Two grouping columns: one row per (weekday, topic) pair and year.
# Rows come back in groupby key order, not in the order of `week_days`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_of_week', 'topics_cleaned'], week_days)
df_analysis

Unnamed: 0,year,day_of_week,topics_cleaned,count day_of_week,% day_of_week,% with retweets,% with likes,retweets mean,likes mean
0,2019,Friday,Book,30,0.015465,20.00,56.67,0.20,2.77
1,2019,Friday,Brand,4227,2.179012,16.63,48.78,0.50,3.23
2,2019,Friday,Entities,2987,1.539794,15.03,49.15,0.40,3.21
3,2019,Friday,Interest and Hobbies,3863,1.991371,13.72,47.04,0.40,2.96
4,2019,Friday,Music,347,0.178878,15.56,49.28,0.26,1.88
...,...,...,...,...,...,...,...,...,...
86,2021,Wednesday,Person,4912,2.794672,17.41,48.94,2.61,15.92
87,2021,Wednesday,Political,246,0.139961,21.95,45.53,1.48,5.16
88,2021,Wednesday,Sport,1100,0.625843,26.45,64.36,1.27,15.08
89,2021,Wednesday,TV and Movies,5427,3.087681,16.10,55.24,1.65,13.46


In [96]:
# x = weekday, bar height = % with retweets, bar colour = year, bar label = topic.
multiple_analysis_chart(df_analysis, "day_of_week", "% with retweets", "year", "topics_cleaned", "Percentage of retweets by topic during the week",
                            "Weekday", "% with retweets", offline_graphs)

#### Percentage of tweets with retweets per topic during the year

In [47]:
# Two grouping columns: one row per (month, topic) pair and year.
# Rows come back in groupby key order, not in the order of `months`.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['month', 'topics_cleaned'], months)
df_analysis

Unnamed: 0,year,month,topics_cleaned,count month,% month,% with retweets,% with likes,retweets mean,likes mean
0,2019,April,Book,24,0.012372,37.50,58.33,1.96,8.12
1,2019,April,Brand,2303,1.187193,15.59,47.37,0.41,3.08
2,2019,April,Entities,1490,0.768093,15.64,49.93,0.36,2.85
3,2019,April,Interest and Hobbies,2183,1.125333,16.31,47.82,0.42,3.06
4,2019,April,Music,174,0.089697,24.14,55.75,0.53,3.22
...,...,...,...,...,...,...,...,...,...
151,2021,September,Person,2571,1.462765,18.28,49.98,2.96,16.27
152,2021,September,Political,118,0.067136,22.03,42.37,0.82,2.72
153,2021,September,Sport,550,0.312921,24.91,59.64,0.86,13.52
154,2021,September,TV and Movies,4051,2.304808,16.19,54.48,1.84,14.86


In [118]:
# x = month, bar height = % with retweets, bar colour = year, bar label = topic.
multiple_analysis_chart(df_analysis, "month", "% with retweets", "year", "topics_cleaned", "Percentage of retweets by topic during the year",
                        "Month", "% with retweets", offline_graphs)

'../../data/charts/Average retweet count per topic during during the year 2020.html'

#### Impact of hashtags on topic popularity

In [89]:
# Uses topic_analysis (tweets with a non-null topic) rather than tweet_analysis.
# One row per (hashtags flag, topic) pair and year.
df_analysis = retweets_likes_info_by_year(topic_analysis, ['hashtags', 'topics_cleaned'], hashtags)
df_analysis

Unnamed: 0,year,hashtags,topics_cleaned,count hashtags,% hashtags,% with retweets,% with likes,retweets mean,likes mean
0,2019,False,Book,131,0.067530,24.43,58.78,0.68,6.40
1,2019,False,Brand,23448,12.087408,15.09,49.17,0.46,3.09
2,2019,False,Entities,15986,8.240758,14.14,50.78,0.39,3.14
3,2019,False,Interest and Hobbies,18669,9.623841,14.95,52.38,0.43,3.42
4,2019,False,Music,1394,0.718605,19.15,54.23,0.50,3.49
...,...,...,...,...,...,...,...,...,...
21,2021,True,Person,4939,2.810034,25.67,53.67,1.64,10.49
22,2021,True,Political,253,0.143944,30.04,47.43,1.35,4.59
23,2021,True,Sport,2919,1.660759,27.10,63.58,1.06,9.79
24,2021,True,TV and Movies,14596,8.304364,19.53,58.45,1.06,11.85


In [90]:
# x = topic, bar height = % with retweets, bar colour = year, bar label = hashtags flag.
multiple_analysis_chart(df_analysis, "topics_cleaned", "% with retweets", "year", "hashtags", "Hashtags presence by topic and corresponding % retweet count",
                        "Topics", "% with retweets", offline_graphs)

#### Tweet sentiment per topic

In [58]:
# Two grouping columns: one row per (sentiment, topic) pair and year.
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['sentiment', 'topics_cleaned'], sentiments)
df_analysis

Unnamed: 0,year,sentiment,topics_cleaned,count sentiment,% sentiment,% with retweets,% with likes,retweets mean,likes mean
0,2019,Negative,Book,44,0.022682,15.91,47.73,0.25,1.86
1,2019,Negative,Brand,6595,3.399712,15.71,49.78,0.64,3.09
2,2019,Negative,Entities,4063,2.094470,15.46,50.11,0.58,3.62
3,2019,Negative,Interest and Hobbies,5987,3.086289,14.26,44.95,0.42,2.84
4,2019,Negative,Music,388,0.200013,19.07,49.74,0.52,3.17
...,...,...,...,...,...,...,...,...,...
34,2021,Positive,Person,13711,7.800845,18.57,52.70,2.13,16.26
35,2021,Positive,Political,571,0.324869,24.87,51.66,2.41,12.08
36,2021,Positive,Sport,4285,2.437942,27.58,69.40,1.00,11.63
37,2021,Positive,TV and Movies,19115,10.875440,17.92,58.45,1.29,13.13


In [None]:
# x = topic, bar height = % with retweets, bar colour = year, bar label = sentiment.
multiple_analysis_chart(df_analysis, "topics_cleaned", "% with retweets", "year", "sentiment", "Tweet sentiment by topic and corresponding % retweet count",
                        "Topics", "% with retweets", offline_graphs)