In [1]:
from Twitter.analysis.data_analysis import *
from os import walk

# Data Import and overview

In [2]:
DATASETS_PATH = "../../data/processed_tweets/"

processed_filenames = next(walk(DATASETS_PATH), (None, None, []))[2]
filenames = [DATASETS_PATH + "/" + filename for filename in processed_filenames]

In [3]:
df = ensemble_dataset(filenames)

## Data characteristics

In [4]:
get_data_characteristics(df)


Data characteristics
Average nº of tweets per day of the week: 227982
Nº of tweets with known topics 557315 and are 34.92% of the data
Nº of tweets do collect retweeters information: 64878
16.45% have retweets



In [5]:
df['year'].value_counts()

2021    533375
2019    531272
2020    531227
Name: year, dtype: int64

## Tweets performance

In [6]:
topics_categories = df['topics_cleaned'].unique()[1:]
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
week_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_phases = ['Morning', 'Afternoon', 'Dusk', 'Night', 'Middle of the night']
sentiments = ['Negative', 'Neutral', 'Positive']
hashtags = [True, False]
offline_graphs = False

In [7]:
tweet_analysis = df[['text', 'year', 'day_phase', 'day_of_week', 'month', 'retweet_count', 'quote_count', 'like_count', 'reply_count', 'sentiment', 'hashtags', 'topics_cleaned', 'reach']]

### Average retweet and like count during the day

In [8]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_phase'], day_phases)
df_analysis

Unnamed: 0,year,day_phase,count day_phase,% day_phase,% with retweets,% with likes,retweets mean,likes mean
0,2019,Morning,105912,19.935551,14.37,44.76,0.41,2.45
1,2019,Afternoon,106273,20.003501,17.78,48.18,0.56,3.4
2,2019,Dusk,106415,20.030229,18.12,50.0,0.56,3.62
3,2019,Night,106312,20.010842,17.73,49.82,0.54,3.64
4,2019,Middle of the night,106360,20.019877,17.32,50.74,0.52,3.68
0,2020,Morning,105633,19.88472,13.95,45.36,0.41,2.63
1,2020,Afternoon,106420,20.032867,16.94,47.8,0.54,3.67
2,2020,Dusk,106397,20.028538,16.95,49.96,0.53,3.84
3,2020,Night,106408,20.030608,16.73,50.41,0.51,3.91
4,2020,Middle of the night,106369,20.023267,16.44,51.16,0.48,3.88


In [9]:
get_correlations_2_vars(df_analysis, 'retweets mean', 'likes mean')

Correlation between retweets mean and likes mean:
pearson=0.69
spearman=0.39
kendall=0.20



In [None]:
analysis_chart(df_analysis, 'day_phase', '% with retweets', '% with likes', 'Day phase', '% with retweets', '% with likes', 'Percentage of retweets and likes during the day', offline_graphs)
analysis_chart(df_analysis, 'day_phase', 'retweets mean', 'likes mean', 'Day phase', 'Retweets mean', 'Likes mean', 'Average retweets and likes during the day', offline_graphs)

### Average retweet and like count during the week

In [10]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_of_week'], week_days)
df_analysis

Unnamed: 0,year,day_of_week,count day_of_week,% day_of_week,% with retweets,% with likes,retweets mean,likes mean
0,2019,Monday,75745,14.257292,17.17,48.71,0.53,3.33
1,2019,Tuesday,77311,14.552056,17.41,48.8,0.53,3.3
2,2019,Wednesday,75810,14.269527,17.65,49.1,0.55,3.4
3,2019,Thursday,75831,14.273479,17.23,48.65,0.55,3.35
4,2019,Friday,74646,14.05043,17.12,48.64,0.5,3.3
5,2019,Saturday,76095,14.323172,16.62,48.85,0.49,3.44
6,2019,Sunday,75834,14.274044,16.24,48.16,0.49,3.39
0,2020,Monday,75439,14.200897,16.39,48.86,0.52,3.6
1,2020,Tuesday,75374,14.188661,16.45,48.44,0.49,3.5
2,2020,Wednesday,76868,14.469897,16.27,48.87,0.49,3.49


In [11]:
get_correlations_2_vars(df_analysis, 'retweets mean', 'likes mean')

Correlation between retweets mean and likes mean:
pearson=-0.04
spearman=-0.19
kendall=-0.11



In [11]:
analysis_chart(df_analysis, 'day_of_week', '% with retweets', '% with likes', 'Weekday', '% with retweets',
               '% with likes', 'Percentage of retweets and likes during the week', offline_graphs)
analysis_chart(df_analysis, 'day_of_week', 'retweets mean', 'likes mean', 'Weekday', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the week', offline_graphs)

### Average retweet count per month

In [13]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['month'], months)
df_analysis

Unnamed: 0,year,month,count month,% month,% with retweets,% with likes,retweets mean,likes mean
0,2019,January,45257,8.518612,16.24,46.04,0.49,2.9
1,2019,February,40908,7.700011,16.79,48.17,0.48,2.98
2,2019,March,45287,8.524259,16.83,47.29,0.51,3.3
3,2019,April,43801,8.244553,17.28,48.78,0.54,3.38
4,2019,May,44208,8.321161,17.23,48.57,0.53,3.41
5,2019,June,43778,8.240223,16.89,48.14,0.5,3.19
6,2019,July,45211,8.509953,16.27,47.02,0.47,3.09
7,2019,August,45179,8.50393,17.09,49.7,0.51,3.43
8,2019,September,43741,8.233259,17.53,49.9,0.54,3.53
9,2019,October,45060,8.481531,17.69,50.22,0.55,3.6


In [14]:
get_correlations_2_vars(df_analysis, 'retweets mean', 'likes mean')

Correlation between retweets mean and likes mean:
pearson=0.43
spearman=0.45
kendall=0.33



In [13]:
analysis_chart(df_analysis, 'month', '% with retweets', '% with likes', 'Month', '% with retweets',
               '% with likes', 'Percentage of retweets and likes during the year', offline_graphs)
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Month', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the year', offline_graphs)

### Tweets performance by sentiment

In [15]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['sentiment'], sentiments)
df_analysis

Unnamed: 0,year,sentiment,count sentiment,% sentiment,% with retweets,% with likes,retweets mean,likes mean
0,2019,Negative,125989,23.714594,16.18,46.8,0.52,2.9
1,2019,Neutral,163721,30.816794,15.18,44.15,0.42,2.8
2,2019,Positive,241562,45.468611,18.8,52.78,0.59,3.98
0,2020,Negative,131147,24.687563,15.74,48.13,0.5,3.15
1,2020,Neutral,165832,31.216787,14.23,43.83,0.4,2.98
2,2020,Positive,234248,44.09565,17.86,53.02,0.55,4.26
0,2021,Negative,121368,22.754722,15.23,48.73,0.5,3.83
1,2021,Neutral,181050,33.944223,13.51,43.54,0.4,3.73
2,2021,Positive,230957,43.301055,18.52,55.59,0.6,5.48


In [16]:
get_correlations_2_vars(df_analysis, 'retweets mean', 'likes mean')

Correlation between retweets mean and likes mean:
pearson=0.68
spearman=0.67
kendall=0.51



In [15]:
analysis_chart(df_analysis, 'sentiment', '% with retweets', '% with likes', 'Sentiment', 'Retweet count',
               'Likes count', 'Percentage of retweets and likes by sentiment', offline_graphs)
analysis_chart(df_analysis, 'sentiment', 'retweets mean', 'likes mean', 'Sentiment', 'Retweets mean', 'Likes mean',
               'Average retweets and likes number by sentiment', offline_graphs)

### Tweets performance by topics

#### Performance of each topic in retweets and likes

In [17]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['topics_cleaned'], topics_categories)
df_analysis

Unnamed: 0,year,topics_cleaned,count topics_cleaned,% topics_cleaned,% with retweets,% with likes,retweets mean,likes mean
0,2019,Other,32323.0,16.662457,16.14,46.38,0.52,3.3
1,2019,Person,36681.0,18.908999,15.45,44.01,0.58,3.3
2,2019,TV and Movies,34474.0,17.771294,15.33,51.57,0.47,3.91
3,2019,Entities,20770.0,10.706903,15.13,50.12,0.43,3.21
4,2019,Interest and Hobbies,26951.0,13.893199,15.07,48.49,0.42,3.15
5,2019,Sport,7326.0,3.776542,22.17,59.1,0.66,5.54
6,2019,Brand,29804.0,15.363916,16.23,48.83,0.5,3.17
7,2019,Music,2036.0,1.049555,19.5,52.9,0.47,2.94
8,2019,Video Game,1985.0,1.023264,16.07,55.01,0.4,3.83
9,2019,Political,1428.0,0.736132,14.99,31.86,1.05,2.02


In [18]:
get_correlations_2_vars(df_analysis, 'retweets mean', 'likes mean')

Correlation between retweets mean and likes mean:
pearson=-0.25
spearman=0.22
kendall=0.15



In [17]:
analysis_chart(df_analysis, 'topics_cleaned', '% with retweets', '% with likes', 'Topics', 'Retweet count',
               'Likes count', 'Percentage of retweets and likes by topic', offline_graphs)
analysis_chart(df_analysis, 'topics_cleaned', 'retweets mean', 'likes mean', 'Topics', 'Retweets mean', 'Likes mean',
               'Average tweets performance by topic', offline_graphs)

#### Average tweet reach per topic

In [183]:
df_analysis = reach_by_topic(tweet_analysis, ['topics_cleaned'], topics_categories)
df_analysis

Unnamed: 0,year,topics_cleaned,count topics_cleaned,% topics_cleaned,reach mean
0,2019,Other,32323.0,16.662457,4423.33
1,2019,Person,36681.0,18.908999,4537.81
2,2019,TV and Movies,34474.0,17.771294,4498.5
3,2019,Entities,20770.0,10.706903,3034.35
4,2019,Interest and Hobbies,26951.0,13.893199,2992.07
5,2019,Sport,7326.0,3.776542,4603.02
6,2019,Brand,29804.0,15.363916,3822.88
7,2019,Music,2036.0,1.049555,3370.83
8,2019,Video Game,1985.0,1.023264,3029.05
9,2019,Political,1428.0,0.736132,4828.05


In [189]:
multi_label_chart_v2(df_analysis, 'year', df_analysis['year'].unique(), 'year', 'reach mean', "Year", 'Reach mean', "topics_cleaned", 5, "group", 0.32,  'Average tweet reach by topic', offline_graphs)

In [19]:
multi_label_chart(df_analysis, 'topics_cleaned', topics_categories, 'topics_cleaned', 'reach mean', 'Topics', 'Reach mean', 'Average tweet reach by topic', offline_graphs)

#### Average retweet count per topic during the day

In [142]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_phase', 'topics_cleaned'], day_phases)
df_analysis.head(60)

Unnamed: 0,year,day_phase,topics_cleaned,count day_phase,% day_phase,% with retweets,% with likes,retweets mean,likes mean
0,2019,Afternoon,Book,32,0.016496,37.5,59.38,1.12,8.09
1,2019,Afternoon,Brand,6028,3.107425,17.47,48.59,0.49,3.13
2,2019,Afternoon,Entities,4331,2.232624,16.37,50.13,0.5,3.28
3,2019,Afternoon,Interest and Hobbies,5915,3.049173,16.04,48.45,0.49,3.34
4,2019,Afternoon,Music,383,0.197436,21.41,48.56,0.48,2.92
5,2019,Afternoon,Other,7761,4.000784,17.6,46.4,0.51,3.27
6,2019,Afternoon,Person,8588,4.427101,15.8,42.23,0.55,3.27
7,2019,Afternoon,Political,395,0.203622,14.18,30.89,1.06,2.77
8,2019,Afternoon,Sport,1335,0.68819,26.59,62.32,0.85,7.02
9,2019,Afternoon,TV and Movies,5238,2.700181,18.0,53.53,0.6,4.38


In [167]:
multi_label_chart_v2(df_analysis, 'year', day_phases, 'day_phase', '% with retweets', "Day phase", "% with retweets", "topics_cleaned", 5, "group", 0.45,  "Percentage of retweets by topic during the day", offline_graphs)

In [29]:
multi_label_chart(df_analysis, "topics_cleaned", day_phases, "day_phase", "% with retweets", "Day phase", "% with retweets", "Percentage of retweets by topic during the day", offline_graphs)

#### Average retweet count per topic during the week

In [181]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['day_of_week', 'topics_cleaned'], week_days)
df_analysis

Unnamed: 0,year,day_of_week,topics_cleaned,count day_of_week,% day_of_week,% with retweets,% with likes,retweets mean,likes mean
0,2019,Friday,Book,30,0.015465,20.00,56.67,0.20,2.77
1,2019,Friday,Brand,4227,2.179012,16.63,48.78,0.50,3.23
2,2019,Friday,Entities,2987,1.539794,15.03,49.15,0.40,3.21
3,2019,Friday,Interest and Hobbies,3863,1.991371,13.72,47.04,0.40,2.96
4,2019,Friday,Music,347,0.178878,15.56,49.28,0.26,1.88
...,...,...,...,...,...,...,...,...,...
86,2021,Wednesday,Person,4851,2.786202,16.47,48.34,0.62,4.32
87,2021,Wednesday,Political,243,0.139569,20.99,44.86,1.40,4.97
88,2021,Wednesday,Sport,1083,0.622028,25.58,63.80,0.79,7.53
89,2021,Wednesday,TV and Movies,5374,3.086590,15.41,54.82,0.52,5.67


In [182]:
multi_label_chart_v2(df_analysis, 'year', week_days, "day_of_week", '% with retweets', "Weekdays", "% with retweets", "topics_cleaned", 4, "group", 0.32, "Percentage of retweets by topic during the week", offline_graphs)

In [31]:
multi_label_chart(df_analysis, "topics_cleaned", week_days, "day_of_week", "% with retweets", "Weekday", "% with retweets", "Percentage of retweets by topic during the week", offline_graphs)

#### Average retweet count per topic during the year

In [177]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['month', 'topics_cleaned'], months)
df_analysis

Unnamed: 0,year,month,topics_cleaned,count month,% month,% with retweets,% with likes,retweets mean,likes mean
0,2019,April,Book,24,0.012372,37.50,58.33,1.96,8.12
1,2019,April,Brand,2303,1.187193,15.59,47.37,0.41,3.08
2,2019,April,Entities,1490,0.768093,15.64,49.93,0.36,2.85
3,2019,April,Interest and Hobbies,2183,1.125333,16.31,47.82,0.42,3.06
4,2019,April,Music,174,0.089697,24.14,55.75,0.53,3.22
...,...,...,...,...,...,...,...,...,...
151,2021,September,Person,2535,1.455993,17.16,49.31,0.58,4.36
152,2021,September,Political,118,0.067774,22.03,42.37,0.82,2.72
153,2021,September,Sport,545,0.313024,24.22,59.27,0.57,5.83
154,2021,September,TV and Movies,4022,2.310060,15.61,54.18,0.49,5.74


In [180]:
multi_label_chart_v2(df_analysis, 'year', months, "month", '% with retweets', "Months", "% with retweets", "topics_cleaned", 4, "group", 0.32, "Percentage of retweets by topic during the year", offline_graphs)

In [33]:
multi_label_chart(df_analysis, "topics_cleaned", months, "month", "% with retweets", "Months", "% with retweets", "Percentage of retweets by topic during the year", offline_graphs)

#### Impact of hashtags in topic popularity

In [173]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['hashtags', 'topics_cleaned'], hashtags)
df_analysis

Unnamed: 0,year,hashtags,topics_cleaned,count hashtags,% hashtags,% with retweets,% with likes,retweets mean,likes mean
0,2019,False,Book,131,0.067530,24.43,58.78,0.68,6.40
1,2019,False,Brand,23448,12.087408,15.09,49.17,0.46,3.09
2,2019,False,Entities,15986,8.240758,14.14,50.78,0.39,3.14
3,2019,False,Interest and Hobbies,18669,9.623841,14.95,52.38,0.43,3.42
4,2019,False,Music,1394,0.718605,19.15,54.23,0.50,3.49
...,...,...,...,...,...,...,...,...,...
21,2021,True,Person,4843,2.781607,24.45,52.84,0.98,5.23
22,2021,True,Political,250,0.143589,29.20,46.80,1.22,4.33
23,2021,True,Sport,2896,1.663335,26.66,63.36,0.92,8.05
24,2021,True,TV and Movies,14448,8.298298,18.77,58.02,0.59,6.29


In [176]:
multi_label_chart_v2(df_analysis, 'year', hashtags, "hashtags", '% with retweets', "Hashtags", "% with retweets", "topics_cleaned", 5, "group", 0.32, "Hashtags presence by topic and corresponding % retweet count", offline_graphs)

In [35]:
multi_label_chart(df_analysis, "topics_cleaned", topics_categories, "hashtags", "% with retweets", "hashtags", "% with retweets", "Hashtags presence by topic and corresponding % retweet count", offline_graphs)

#### Tweet sentiment per topic

In [168]:
df_analysis = retweets_likes_info_by_year(tweet_analysis, ['sentiment', 'topics_cleaned'], sentiments)
df_analysis

Unnamed: 0,year,sentiment,topics_cleaned,count sentiment,% sentiment,% with retweets,% with likes,retweets mean,likes mean
0,2019,Negative,Book,44,0.022682,15.91,47.73,0.25,1.86
1,2019,Negative,Brand,6595,3.399712,15.71,49.78,0.64,3.09
2,2019,Negative,Entities,4063,2.094470,15.46,50.11,0.58,3.62
3,2019,Negative,Interest and Hobbies,5987,3.086289,14.26,44.95,0.42,2.84
4,2019,Negative,Music,388,0.200013,19.07,49.74,0.52,3.17
...,...,...,...,...,...,...,...,...,...
34,2021,Positive,Person,13507,7.757828,17.54,52.06,0.63,5.02
35,2021,Positive,Political,564,0.323937,23.94,51.06,1.41,6.21
36,2021,Positive,Sport,4250,2.441014,27.13,69.20,0.80,8.21
37,2021,Positive,TV and Movies,18928,10.871413,17.23,58.05,0.52,6.30


In [172]:
multi_label_chart_v2(df_analysis, 'year', sentiments, "sentiment", '% with retweets', "Sentiment", "% with retweets", "topics_cleaned", 5, "group", 0.32, "Tweet sentiment by topic and corresponding % retweet count", offline_graphs)

In [37]:
multi_label_chart(df_analysis, "topics_cleaned", topics_categories, "sentiment", "% with retweets", "sentiment", "% with retweets", "Tweet sentiment by topic and corresponding % retweet count", offline_graphs)

## Case study: Covid-19

In [None]:
tweets_2020, tweets_2020_covid = get_covid_datasets(tweet_analysis)

In [None]:
df_analysis = retweets_likes_info_by_year(tweets_2020, ['month'], months)
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Meses', 'Média de retweets',
               'Média de gostos', 'Média de retweets e gostos durante o ano de 2020', offline_graphs)

In [None]:
df_analysis = retweets_likes_info_by_year(tweets_2020_covid, ['month'], months)
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Meses', 'Média de retweets',
               'Média de gostos', 'Média de retweets e gostos em tweets sobre covid durante o ano de 2020',
               offline_graphs)

In [None]:
df_analysis = retweets_likes_info_by_year(tweets_2020, ['sentiment'], sentiments)
analysis_chart(df_analysis, 'sentiment', 'retweets mean', 'likes mean', 'Sentimentos', 'Média de retweets',
               'Média de gostos', 'Média de retweets e likes por sentimento em 2020', offline_graphs)

In [None]:
df_analysis = retweets_likes_info_by_year(tweets_2020_covid, ['sentiment'], sentiments)
analysis_chart(df_analysis, 'sentiment', 'retweets mean', 'likes mean', 'Sentimentos', 'Média de retweets',
               'Média de gostos', 'Média de retweets e likes por sentimento em tweets sobre covid em 2020',
               offline_graphs)

In [23]:
test_2020 = tweet_analysis[tweet_analysis['year'] == 2020].copy()
test_2021 = tweet_analysis[tweet_analysis['year'] == 2021].copy()

In [24]:
covid_keywords = ['Coronavirus', 'Corona', 'CDC', 'Ncov', 'Wuhan', 'Outbreak', 'China', 'Koronavirus', 'Wuhancoronavirus', 'Wuhanlockdown', 'N95', 'Kungflu', 'Epidemic', 'Sinophobia', 'Covid-19', 'Corona virus', 'Covid19', 'Sars-cov-2', 'COVID–19', 'COVD', 'Pandemic', 'Coronapocalypse', 'CancelEverything', 'Coronials', 'SocialDistancing', 'Panic buying', 'DuringMy14DayQuarantine', 'Panic shopping', 'InMyQuarantineSurvivalKit', 'chinese virus', 'stayhomechallenge', 'DontBeASpreader', 'lockdown', 'shelteringinplace', 'staysafestayhome', 'trumppandemic', 'flatten the curve', 'GetMePPE', 'covidiot', 'epitwitter', 'Pandemie']

In [27]:
def find_keywords_in_tweets(keywords, tweets):
    res = []
    for text in tweets:
        tweet_words = text.split(' ')
        intersections = [i for i in tweet_words if i in keywords]
        if len(intersections) > 0:
           res.append(1)
        else:
            res.append(0)
    return res

In [28]:
test_2020['covid_matches'] = find_keywords_in_tweets(covid_keywords, test_2020['text'])
test_2021['covid_matches'] = find_keywords_in_tweets(covid_keywords, test_2021['text'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
test_2020_covid = test_2020[test_2020['covid_matches'] == 1]
test_2021_covid = test_2021[test_2021['covid_matches'] == 1]

In [32]:
print(test_2020_covid.shape[0])
print(test_2021_covid.shape[0])

1422
605


In [36]:
test_2020_covid['month'].value_counts(normalize=True, sort=False)

January      0.020394
February     0.036568
March        0.287623
April        0.155415
May          0.113221
June         0.073840
July         0.079466
August       0.046414
September    0.059072
October      0.045007
November     0.037271
December     0.045710
Name: month, dtype: float64

In [37]:
test_2021_covid['month'].value_counts(normalize=True, sort=False)

January      0.105785
February     0.084298
March        0.074380
April        0.082645
May          0.112397
June         0.046281
July         0.094215
August       0.090909
September    0.047934
October      0.056198
November     0.059504
December     0.145455
Name: month, dtype: float64

In [44]:
df_analysis = retweets_likes_info_by_year(test_2020, ['month'], months)
df_analysis

Unnamed: 0,year,month,count month,% month,% with retweets,% with likes,retweets mean,likes mean
0,2020,January,45117,8.492979,17.5,50.44,0.55,3.89
1,2020,February,42280,7.958933,18.08,50.98,0.57,3.97
2,2020,March,45167,8.502392,17.67,51.43,0.55,3.68
3,2020,April,43725,8.230945,16.83,50.54,0.49,3.59
4,2020,May,45038,8.478108,16.9,50.39,0.52,3.55
5,2020,June,43479,8.184637,17.64,48.52,0.59,3.72
6,2020,July,44979,8.467002,16.48,48.59,0.52,3.63
7,2020,August,44893,8.450813,15.92,48.06,0.47,3.47
8,2020,September,43503,8.189155,15.6,46.81,0.45,3.38
9,2020,October,44961,8.463613,15.34,46.91,0.48,3.48


In [46]:
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Month', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the year', offline_graphs)

In [47]:
df_analysis = retweets_likes_info_by_year(test_2020_covid, ['month'], months)
df_analysis

Unnamed: 0,year,month,count month,% month,% with retweets,% with likes,retweets mean,likes mean
0,2020,January,29,2.039381,31.03,51.72,0.38,1.62
1,2020,February,52,3.656821,25.0,53.85,0.94,2.62
2,2020,March,409,28.762307,22.98,52.08,0.79,3.93
3,2020,April,221,15.541491,20.81,45.25,0.73,3.0
4,2020,May,161,11.322082,19.88,45.34,0.82,3.39
5,2020,June,105,7.383966,14.29,38.1,0.3,1.19
6,2020,July,113,7.946554,23.01,46.02,0.5,1.35
7,2020,August,66,4.64135,15.15,43.94,0.27,1.71
8,2020,September,84,5.907173,21.43,33.33,1.35,3.4
9,2020,October,64,4.500703,17.19,35.94,0.23,1.31


In [48]:
analysis_chart(df_analysis, 'month', 'retweets mean', 'likes mean', 'Month', 'Retweets mean', 'Likes mean',
               'Average retweets and likes during the year', offline_graphs)