In [58]:
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from os import walk
import datetime
from statistics import mean
from dateutil.relativedelta import relativedelta
import seaborn as sns
sns.set()

# Data Import and overview

In [31]:
DATASETS_PATH = "../../data/processed_tweets/"
DATASETS_RETWEETS_PATH = "../../data/processed_retweets/"

processed_filenames = next(walk(DATASETS_PATH), (None, None, []))[2]
filenames = [DATASETS_PATH + "/" + filename for filename in processed_filenames]

processed_retweets_filenames = next(walk(DATASETS_RETWEETS_PATH), (None, None, []))[2]
retweets_filenames = [DATASETS_RETWEETS_PATH + "/" + filename for filename in processed_retweets_filenames]

In [32]:
def essemble_dataset(filenames):
    df = pd.DataFrame()
    for filename in filenames:
        df_temp = pd.read_csv(filepath_or_buffer=filename, sep=",", engine=None)
        df_temp['timestamp'] = pd.to_datetime(df_temp['timestamp'])
        df_temp['timestamp'] = [i.replace(tzinfo=datetime.timezone.utc) for i in df_temp['timestamp']]
        df_temp = df_temp.sort_values(by='timestamp', ascending=True)
        df_temp = df_temp.drop(['index', 'Unnamed: 0'], axis=1)
        df = pd.concat([df, pd.DataFrame.from_records(df_temp)])
    return df.reset_index()

In [33]:
df = essemble_dataset(filenames)
df

Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,reply_count,reach,...,day_phase,week_idx,day_phase_enc,day_of_week_enc,month_enc,year_enc,sentiment_enc,verified_enc,hashtags_enc,seniority
0,0,1079904885246705665,I’ll never salt you down 😌,2019-01-01 00:59:22,497001647,0,0,0,0,1468,...,Middle of the night,2019-01,2,5,4,0,2,0,0,10
1,1,1079904884445581312,You know who you are; but some of you I’ve fol...,2019-01-01 00:59:22,112542289,2,1,0,0,2969,...,Middle of the night,2019-01,2,5,4,0,2,0,0,12
2,2,1079904885175476224,Oh she a freak freak https://t.co/uJqYxJUGgg,2019-01-01 00:59:22,2372981097,0,0,0,0,809,...,Middle of the night,2019-01,2,5,4,0,0,0,0,8
3,3,1079904885531791360,Don’t forget your red thong ladies,2019-01-01 00:59:23,2446281619,2,0,0,0,112,...,Middle of the night,2019-01,2,5,4,0,0,0,0,8
4,4,1079904886513430528,Mirage Mirror Hour of Devastation Artist Greg ...,2019-01-01 00:59:23,71949265,0,0,0,0,49,...,Middle of the night,2019-01,2,5,4,0,0,0,0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595869,533370,1477036834261770241,…. them crawfish was so good yall lmfao,2021-12-31 21:59:57,795707439219884037,0,0,0,0,1826,...,Night,2021-52,4,0,2,0,2,0,0,5
1595870,533371,1477036835603836932,Damn all the freaks are at the New Bev right n...,2021-12-31 21:59:58,34679503,26,0,0,0,10636,...,Night,2021-52,4,0,2,0,1,0,0,13
1595871,533372,1477036838217035777,Fast And Professional Vehicle Lockout Service ...,2021-12-31 21:59:58,210241982,0,0,0,0,394,...,Night,2021-52,4,0,2,0,0,0,0,11
1595872,533373,1477036840024567809,Very much this https://t.co/b3bgtstLfr,2021-12-31 21:59:59,883576549,3,0,0,0,702,...,Night,2021-52,4,0,2,0,1,0,0,9


In [34]:
df['year'].value_counts()

2021    533375
2019    531272
2020    531227
Name: year, dtype: int64

In [38]:
df_retweets_info = essemble_dataset(retweets_filenames)
df_retweets_info

Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,reply_count,referenced_tweets,followers,following,tweet_count,verified,created_at,ref_tweed_id
0,0,1079904986971156480,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 00:59:47,863014342538776577,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",2589,4948,235553,False,2017-05-12 12:53:48,1079904942729523200
1,1,1079905009767133185,RT @IrwindalePolice: The second group of @Rose...,2019-01-01 00:59:52,67143433,0,10,0,0,"[ReferencedTweet(id: 1079904903596666880, type...",20384,7487,14743,True,2009-08-19 22:29:24,1079904903596666880
2,2,1079905092256624642,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:12,887062199189078020,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",26,89,1617,False,2017-07-17 21:31:24,1079904942729523200
3,3,1079905147898286080,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:25,3069273057,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",262,873,6195,False,2015-03-04 06:11:31,1079904942729523200
4,4,1079905204521250816,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:39,996735592275705856,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",14,140,93,False,2018-05-16 12:54:18,1079904942729523200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401969,105741,1542057750598934528,RT @Bandzzmollydon: Bitches be killing me subb...,2022-06-29 08:09:52,1477447673745657863,0,47,0,0,"[ReferencedTweet(id: 1384990228373557248, type...",32,34,130,False,2022-01-02 01:12:42,1384990228373557248
401970,105742,1542081917050314752,RT @theRAWbussy: Let me spread that pretty pin...,2022-06-29 09:45:54,1105220957839462402,0,257,0,0,"[ReferencedTweet(id: 1400255663394144257, type...",1715,4999,86338,False,2019-03-11 21:36:25,1400255663394144257
401971,105743,1542254711469793280,RT @Evanknxxx: honestly i’m at my horniest aft...,2022-06-29 21:12:31,1537565677908856837,0,180,0,0,"[ReferencedTweet(id: 1401342747819249671, type...",3,30,139,False,2022-06-16 22:40:10,1401342747819249671
401972,105744,1542263990444851201,RT @Tsmckenziee: #𝑅𝐸𝒯𝒲𝐸𝐸𝒯 𝐼𝐹 𝐼 𝒞𝒜𝒩 𝒞𝑅𝐸𝒜𝑀 𝒪𝒩 𝒴𝒪...,2022-06-29 21:49:23,1350913747209117697,0,699,0,0,"[ReferencedTweet(id: 1469064283354972173, type...",138,1854,3849,False,2021-01-17 21:12:15,1469064283354972173


Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,quote_count,reply_count,referenced_tweets,followers,following,tweet_count,verified,created_at,ref_tweed_id
0,0,1079904986971156480,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 00:59:47,863014342538776577,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",2589,4948,235553,False,2017-05-12 12:53:48,1079904942729523200
1,1,1079905009767133185,RT @IrwindalePolice: The second group of @Rose...,2019-01-01 00:59:52,67143433,0,10,0,0,"[ReferencedTweet(id: 1079904903596666880, type...",20384,7487,14743,True,2009-08-19 22:29:24,1079904903596666880
2,2,1079905092256624642,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:12,887062199189078020,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",26,89,1617,False,2017-07-17 21:31:24,1079904942729523200
3,3,1079905147898286080,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:25,3069273057,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",262,873,6195,False,2015-03-04 06:11:31,1079904942729523200
4,4,1079905204521250816,"RT @FaZeClan: January, 2018 https://t.co/bxyyn...",2019-01-01 01:00:39,996735592275705856,0,171,0,0,"[ReferencedTweet(id: 1079904942729523200, type...",14,140,93,False,2018-05-16 12:54:18,1079904942729523200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401969,105741,1542057750598934528,RT @Bandzzmollydon: Bitches be killing me subb...,2022-06-29 08:09:52,1477447673745657863,0,47,0,0,"[ReferencedTweet(id: 1384990228373557248, type...",32,34,130,False,2022-01-02 01:12:42,1384990228373557248
401970,105742,1542081917050314752,RT @theRAWbussy: Let me spread that pretty pin...,2022-06-29 09:45:54,1105220957839462402,0,257,0,0,"[ReferencedTweet(id: 1400255663394144257, type...",1715,4999,86338,False,2019-03-11 21:36:25,1400255663394144257
401971,105743,1542254711469793280,RT @Evanknxxx: honestly i’m at my horniest aft...,2022-06-29 21:12:31,1537565677908856837,0,180,0,0,"[ReferencedTweet(id: 1401342747819249671, type...",3,30,139,False,2022-06-16 22:40:10,1401342747819249671
401972,105744,1542263990444851201,RT @Tsmckenziee: #𝑅𝐸𝒯𝒲𝐸𝐸𝒯 𝐼𝐹 𝐼 𝒞𝒜𝒩 𝒞𝑅𝐸𝒜𝑀 𝒪𝒩 𝒴𝒪...,2022-06-29 21:49:23,1350913747209117697,0,699,0,0,"[ReferencedTweet(id: 1469064283354972173, type...",138,1854,3849,False,2021-01-17 21:12:15,1469064283354972173


In [175]:
topics_categories = ['Brand', 'Holiday', 'Person', 'Interest and Hobbies', 'Sport',
       'TV and Movies', 'Other', 'Video Game', 'Entities', 'Political',
       'Music', 'Book', 'News']
palette = ['#006D77', '#FBD1A2', '#7DCFB6', '#00B2CA', '#1D4E89', '#F79256', '#aed9e0', '#b8f2e6', '#faf3dd', '#ffa69e', '#FE7F2D', '#FCCA46', '#ED7B84', '#F92A82']
offline_charts = False

## Retweets Analysis

### Analysing average followers count between shared tweets and not shared by topic

In [193]:
df_test = pd.DataFrame()
cols = ['topics_cleaned']
cats_sort = topics_categories

for y in np.sort(df['year'].unique()):
    dfy = df[df['year'] == y]
    df_all = dfy.groupby(cols).agg(
                                    **{"count " + cols[0]: pd.NamedAgg(column=cols[0], aggfunc="count")},
                                    **{"retweets count": pd.NamedAgg(column="retweet_count", aggfunc="count")},
                                    **{"retweets mean": pd.NamedAgg(column="retweet_count", aggfunc="mean")},
                                    **{"followers": pd.NamedAgg(column="followers", aggfunc="mean")},
                                    )
    df_rets = dfy[dfy['retweet_count'] > 0].groupby(cols).agg(
                                                                **{"count " + cols[0]: pd.NamedAgg(column=cols[0], aggfunc="count")},
                                                                **{"followers": pd.NamedAgg(column="followers", aggfunc="mean")})
    df_likes = dfy[dfy['like_count'] > 0].groupby(cols).agg(
                                                            **{"count " + cols[0]: pd.NamedAgg(column=cols[0], aggfunc="count")},
                                                            **{"followers": pd.NamedAgg(column="followers", aggfunc="mean")})

    df_all = df_all.reindex(cats_sort).reset_index()
    df_rets = df_rets.reindex(cats_sort).reset_index()
    df_likes = df_likes.reindex(cats_sort).reset_index()

    df_all['year'] = [str(y) for i in range(len(cats_sort))]
    df_all['sum'] = [df_all["count " + cols[0]].sum() for i in range(df_all.shape[0])]
    df_all['% ' + cols[0]] = (df_all["count " + cols[0]] / df_all['sum']) * 100
    df_all['% with retweets'] = np.round((df_rets["count " + cols[0]] / df_all["count " + cols[0]]) * 100, 2)
    df_all['% with likes'] = np.round((df_likes["count " + cols[0]] / df_all["count " + cols[0]]) * 100, 2)

    df_all['followers mean'] = df_all['followers']
    df_all['shared followers mean'] = df_rets['followers']

    df_test = pd.concat([df_test, pd.DataFrame.from_records(df_all)])

df_test[['year', cols[0], "count " + cols[0], '% ' + cols[0], '% with retweets', '% with likes', 'followers mean', 'shared followers mean']]

Unnamed: 0,year,topics_cleaned,count topics_cleaned,% topics_cleaned,% with retweets,% with likes,followers mean,shared followers mean
0,2019,Brand,29804.0,15.363916,16.23,48.83,2360.986277,5057.219099
1,2019,Holiday,1.0,0.000515,,100.0,764.0,
2,2019,Person,36681.0,18.908999,15.45,44.01,2927.879829,7395.9716
3,2019,Interest and Hobbies,26951.0,13.893199,15.07,48.49,2168.93555,4289.90325
4,2019,Sport,7326.0,3.776542,22.17,59.1,2917.076986,6455.002463
5,2019,TV and Movies,34474.0,17.771294,15.33,51.57,2588.880345,6761.92676
6,2019,Other,32323.0,16.662457,16.14,46.38,2533.108406,6039.042937
7,2019,Video Game,1985.0,1.023264,16.07,55.01,2197.337531,5319.523511
8,2019,Entities,20770.0,10.706903,15.13,50.12,2009.895811,4147.76289
9,2019,Political,1428.0,0.736132,14.99,31.86,3175.37535,6970.850467


In [194]:
fig = px.bar(df_test, x="topics_cleaned", y="shared followers mean", color="year", color_discrete_sequence=palette, barmode="group",
             title="Average followers between shared tweets and not shared by topic", width=900, height=500)
fig.update_yaxes(title_text="Average followers count")
fig.show()
#plotly.offline.plot(fig, filename='../../data/charts/Average followers between shared tweets and not shared by topic.html')

### Analysing retweeters characteristics

In [109]:
def get_avg_followers_retweets(og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    return int(matches['followers'].mean())

In [110]:
def time_diff_from_original_old(og_tweet_time, retweet_time):
    return abs(og_tweet_time.replace(tzinfo=datetime.timezone.utc) - retweet_time.replace(tzinfo=datetime.timezone.utc)).total_seconds() / 3600.0

In [111]:
def time_diff_from_original(og_tweet_time, retweet_time):
    return abs(og_tweet_time - retweet_time).total_seconds() / 3600.0

In [112]:
def get_avg_retweets_time(og_tweet_time, og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    t = [time_diff_from_original(og_tweet_time, retweet_time) for retweet_time in matches['timestamp']]
    return mean(t)

In [113]:
def get_avg_retweets_account_age_old(og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    t = [(datetime.datetime.now().replace(tzinfo=datetime.timezone.utc) - created_at.replace(tzinfo=datetime.timezone.utc)) / np.timedelta64(1, 'Y') for created_at in matches['created_at']]
    return mean(t)

In [114]:
def get_avg_retweets_account_age(og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1

    ages = pd.to_datetime(matches['created_at'], utc=True).dt.strftime("%Y-%m-%d")
    ages = pd.to_datetime(ages)
    ages = ages.apply(lambda x: relativedelta(datetime.datetime.now(), x).years)

    return ages.mean()

In [115]:
def get_retweets_half_time(og_tweet_time, og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    fist_half_retweet_time = matches.iloc[int(len(matches)/2)]['timestamp']
    fist_half_retweet_time_diff = time_diff_from_original(og_tweet_time, fist_half_retweet_time)
    return fist_half_retweet_time_diff

In [116]:
def get_retweets_total_time(og_tweet_time, og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    total_retweet_time = matches.iloc[int(len(matches) - 1)]['timestamp']
    total_retweet_time_diff = time_diff_from_original(og_tweet_time, total_retweet_time)
    return total_retweet_time_diff

In [117]:
def get_retweets_followers_split(og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    return matches[:int(len(matches)/2)]['followers'].mean()

In [118]:
def get_retweets_followers_second_split(og_tweet_id):
    matches = df_retweets_info[df_retweets_info['ref_tweed_id'] == og_tweet_id]
    if matches.shape[0] == 0: return -1
    return matches[:int(len(matches) - 1)]['followers'].mean()

In [181]:
def retweeters_characteristics(min_retweets):
    df_final = pd.DataFrame()
    for y in np.sort(df['year'].unique()):

        dfy = df[df['year'] == y]
        cats_vals = []

        topics = dfy[~dfy['topics_cleaned'].isna()]['topics_cleaned'].unique()
        topics = sorted(topics,key=topics_categories.index)

        for topic in topics:
            tweets_by_topic = dfy[(dfy['topics_cleaned'] == topic) & (dfy['retweet_count'] > min_retweets)].copy()

            cat_vals = dict()
            cat_vals['Year'] = str(y)
            cat_vals['Topic'] = topic

            get_basic_analysis(tweets_by_topic, cat_vals)
            get_timing_analysis(tweets_by_topic, cat_vals)

            cats_vals.append(cat_vals)


        df_year = pd.DataFrame(cats_vals, columns=['Topic', 'Average Retweeters Followers', 'Average Retweets Time', 'Average Retweeters Account Age', 'Average Retweeters Half Time', 'Average Retweeters Total Time', '% Time to get 50% retweets', 'Average Retweeters Followers First Half', 'Average Retweeters Followers Second Half'])
        df_year['Year'] = [str(y) for i in range(len(topics))]
        df_final = pd.concat([df_final, pd.DataFrame.from_records(df_year)])

    return df_final


def get_basic_analysis(df, topic_values):
    df['avg_retweeters_followers'] = [get_avg_followers_retweets(x) for x in zip(df['tweet_id'])]
    df['avg_retweeters_time'] = [get_avg_retweets_time(x, y) for x, y in zip(df['timestamp'], df['tweet_id'])]
    df['avg_retweeters_account_age'] = [get_avg_retweets_account_age(x) for x in df['tweet_id']]

    avg_retweeters_followers = df[df['avg_retweeters_followers'] != -1]['avg_retweeters_followers'].mean()
    avg_retweeters_time = df[df['avg_retweeters_time'] != -1]['avg_retweeters_time'].mean()
    avg_retweeters_account_age = df[df['avg_retweeters_account_age'] != -1]['avg_retweeters_account_age'].mean()

    if pd.isna(avg_retweeters_followers):
        avg_retweeters_followers = 0

    if pd.isna(avg_retweeters_time):
        avg_retweeters_time = 0

    if pd.isna(avg_retweeters_account_age):
        avg_retweeters_account_age = 0

    topic_values['Average Retweeters Followers'] = int(avg_retweeters_followers)
    topic_values['Average Retweets Time'] = int(avg_retweeters_time)
    topic_values['Average Retweeters Account Age'] = avg_retweeters_account_age


def get_timing_analysis(df, topic_values):
    df['first_half_time'] = [get_retweets_half_time(x, y) for x, y in zip(df['timestamp'], df['tweet_id'])]
    df['total_time'] = [get_retweets_total_time(x, y) for x, y in zip(df['timestamp'], df['tweet_id'])]
    df['first_half_avg_foll'] = [get_retweets_followers_split(x) for x in zip(df['tweet_id'])]
    df['second_half_avg_foll'] = [get_retweets_followers_second_split(x) for x in zip(df['tweet_id'])]

    avg_first_half_time = df[df['first_half_time'] != -1]['first_half_time'].mean()
    avg_total_time = df[df['total_time'] != -1]['total_time'].mean()
    avg_first_half_avg_foll = df[df['first_half_avg_foll'] != -1]['first_half_avg_foll'].mean()
    avg_second_half_avg_foll = df[df['second_half_avg_foll'] != -1]['second_half_avg_foll'].mean()

    if  pd.isna(avg_first_half_time):
        avg_first_half_time = 0

    if pd.isna(avg_total_time):
        avg_total_time = 0

    if pd.isna(avg_first_half_avg_foll):
        avg_first_half_avg_foll = 0

    if pd.isna(avg_second_half_avg_foll):
        avg_second_half_avg_foll = 0

    topic_values['Average Retweeters Half Time'] = avg_first_half_time
    topic_values['Average Retweeters Total Time'] = avg_total_time
    if avg_first_half_time != 0 and avg_total_time != 0:
        topic_values['% Time to get 50% retweets'] = (np.round(avg_first_half_time / avg_total_time, 2)) * 100
    else:
        topic_values['% Time to get 50% retweets'] = 0
    topic_values['Average Retweeters Followers First Half'] = avg_first_half_avg_foll
    topic_values['Average Retweeters Followers Second Half'] = avg_second_half_avg_foll

In [None]:
def retweeters_info_chart(df, x_col_categories, x_col, y_col, title, offline):
    fig = px.bar(df, x=x_col, y=y_col, color="Year", color_discrete_sequence=palette, barmode="group",
                 category_orders={x_col: x_col_categories,
                              'year': df['Year'].unique()})
    fig.update_layout(title=title)
    fig.show()
    if offline:
        plotly.offline.plot(fig, filename='../../data/charts/' + title + '.html')

In [None]:
def analysis_chart(df, x_col, y_bar, y_line, x_name, y_bar_name, y_line_name, title, offline):
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    years = np.sort(df['Year'].unique())
    years_count = len(df['Year'].unique())

    for i in range(years_count):
        year = years[i]
        dfy = df[df['Year'] == year]
        fig.add_trace(go.Bar(x=dfy[x_col], y=dfy[y_bar], text=list(map(str, dfy[y_bar].tolist())), name=y_bar_name + ' ' + str(year), marker_color=palette[i], width=0.28, textposition='inside'), secondary_y=False)
        fig.add_trace(go.Scatter(x=dfy[x_col], y=dfy[y_line], name=y_line_name + ' ' + str(year), marker_color=palette[i+years_count]), secondary_y=True)

    fig.update_yaxes(title_text=y_line_name, secondary_y=True)
    fig.update_yaxes(title_text=y_bar_name, secondary_y=False)
    fig.update_layout(title_text=title, width=1100, height=500)
    fig.update_xaxes(title_text=x_name)
    fig.show()
    if offline:
        plotly.offline.plot(fig, filename='../../data/charts/' + title + '.html')

In [120]:
df_retweeters_chars = retweeters_characteristics(0)
df_retweeters_chars

avg_retweeters_followers: year 2019 topic: Holiday
avg_retweeters_followers: year 2019 topic: Holiday
avg_retweeters_followers: year 2019 topic: Holiday
avg_first_half_time: year 2019 topic: Holiday
avg_total_time: year 2019 topic: Holiday
avg_first_half_avg_foll: year 2019 topic: Holiday
avg_second_half_avg_foll: year 2019 topic: Holiday


Unnamed: 0,Topic,Average Retweeters Followers,Average Retweets Time,Average Retweeters Account Age,Year
0,Other,6338,139,8.420843,2019
1,Person,7644,177,8.385796,2019
2,TV and Movies,9105,76,8.709884,2019
3,Entities,4797,117,8.412541,2019
4,Interest and Hobbies,4842,143,8.254813,2019
5,Sport,4974,96,8.282345,2019
6,Brand,5996,214,8.345952,2019
7,Music,4646,253,8.280334,2019
8,Video Game,4796,266,7.325862,2019
9,Political,10669,149,8.578509,2019


### Analysing average followers count of retweeters per topic

In [99]:
retweeters_info_chart(df_retweeters_chars, topics_categories, "Topic", 'Average Retweeters Followers', "Analysing average followers count of retweeters per topic", offline_charts)

### Analysing average retweet time ratio of retweeters per topic

In [100]:
retweeters_info_chart(df_retweeters_chars, topics_categories, "Topic", "Average Retweets Time", "Analysing average retweet time ratio of retweeters per topic", offline_charts)

### Analysing average retweeter's account age per topic

In [101]:
retweeters_info_chart(df_retweeters_chars, topics_categories, "Topic", "Average Retweeters Account Age", "Analysing average retweeters account age per topic", offline_charts)

### Analysing influencers impact on tweet sharing in popular tweets

In [156]:
df[df['retweet_count'] > 10].shape[0] / df[df['retweet_count'] > 0].shape[0]

0.04608215921300283

In [182]:
df_retweeters_chars_10 = retweeters_characteristics(10)
df_retweeters_chars_10

Unnamed: 0,Topic,Average Retweeters Followers,Average Retweets Time,Average Retweeters Account Age,Average Retweeters Half Time,Average Retweeters Total Time,% Time to get 50% retweets,Average Retweeters Followers First Half,Average Retweeters Followers Second Half,Year
0,Brand,5900,87,8.234376,23.912603,979.146191,2.0,8397.003758,6161.324954,2019
1,Holiday,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2019
2,Person,9775,138,8.131308,119.471268,494.89523,24.0,16495.659918,11405.933321,2019
3,Interest and Hobbies,4940,39,8.093165,15.103257,363.839553,4.0,8041.262666,6349.449834,2019
4,Sport,4391,162,8.521556,4.961133,1373.115017,0.0,7691.170808,4904.560849,2019
5,TV and Movies,7569,40,8.598237,22.821898,402.81061,6.0,13423.078252,8620.562871,2019
6,Other,6588,85,8.015335,21.085873,872.309358,2.0,9932.26087,7700.045102,2019
7,Video Game,6969,8,7.489973,3.889611,65.510667,6.0,13191.224026,7452.611818,2019
8,Entities,3702,44,8.262815,15.080352,360.926867,4.0,4927.927025,4442.669838,2019
9,Political,5605,189,8.726545,118.089786,1321.691966,9.0,4810.937996,4931.441696,2019


In [183]:
fig = px.bar(df_retweeters_chars_10, x="Topic", y='% Time to get 50% retweets', color="Year", barmode="group", color_discrete_sequence=palette,
             category_orders={"Topic": topics_categories,
                              'year': df['year'].unique()})
fig.show()

In [184]:
fig = make_subplots(1, 3)
for i in range(1, 4):
    fig.add_trace(go.Bar(x=topics_categories, y=df_retweeters_chars_10['Average Retweeters Followers First Half']), 1, i)
    fig.add_trace(go.Bar(x=topics_categories, y=df_retweeters_chars_10['Average Retweeters Followers Second Half']), 1, i)
fig.update_xaxes(matches='x')
fig.show()

In [192]:
analysis_chart(df_retweeters_chars_10, "Topic", 'Average Retweeters Followers First Half', 'Average Retweeters Followers Second Half', "Topics", 'Average Retweeters Followers First Half', 'Average Retweeters Followers Second Half', "Average retweeters followers count split by median", offline_charts)