In [227]:
import math
import numpy as np
import pandas as pd
from itertools import cycle
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime
from sklearn.metrics import mean_absolute_error
import plotly.express as px
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import plotly

In [228]:
# Data Import and overview
DATASET_PATH = "../../data/tweets_2020_2021_v2.csv"
df = pd.read_csv(filepath_or_buffer=DATASET_PATH, sep=",")
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['timestamp'] = [i.replace(tzinfo=datetime.timezone.utc) for i in df['timestamp']]
df = df.sort_values(by='timestamp', ascending=True)
df.tail()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,tweet_id,text,timestamp,user_id,like_count,retweet_count,...,day_phase,week_idx,day_phase_enc,day_of_week_enc,month_enc,year_enc,sentiment_enc,verified_enc,seniority,topics_cleaned
612261,612261,612261,612261,612261,1477036834261770241,…. them crawfish was so good yall lmfao,2021-12-31 21:59:57+00:00,795707439219884037,0,0,...,Night,2021-52,4,0,2,1,2,0,5,
98252,98252,98252,98252,98252,1477036835603836932,Damn all the freaks are at the New Bev right n...,2021-12-31 21:59:58+00:00,34679503,26,0,...,Night,2021-52,4,0,2,1,1,0,13,
1072384,1072384,1072384,1072384,1072384,1477036838217035777,Fast And Professional Vehicle Lockout Service ...,2021-12-31 21:59:58+00:00,210241982,0,0,...,Night,2021-52,4,0,2,1,0,0,11,
1072385,1072385,1072385,1072385,1072385,1477036840024567809,Very much this https://t.co/b3bgtstLfr,2021-12-31 21:59:59+00:00,883576549,3,0,...,Night,2021-52,4,0,2,1,1,0,9,
1072386,1072386,1072386,1072386,1072386,1477036841933185025,Best thread ever https://t.co/EBo4JkTCx4,2021-12-31 21:59:59+00:00,131371694,0,0,...,Night,2021-52,4,0,2,1,2,0,12,


In [229]:
variables_to_keep = ['followers', 'following', 'tweet_count', 'seniority', 'verified_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'topics_ids', 'sentiment_enc', 'timestamp', 'retweet_count']
variables_to_predict = ['followers', 'following', 'tweet_count', 'seniority', 'verified_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'topics_ids', 'sentiment_enc', 'timestamp', 'retweet_count', 'popularity', 'year']

In [230]:
df = df[(~df['topics'].isnull()) & (df['topics_cleaned'] == 'Person')]
df = df[variables_to_predict]

In [231]:
df_2020 = df[df['year'] == 2020]
df_2021 = df[df['year'] == 2021]

In [232]:
df_2020.shape

(48349, 14)

In [233]:
df_2021.shape

(31801, 14)

# Machine Learning

### Predicting tweet popularity in 2021 using 2020 data with topics

## Data spliting

In [234]:
num_folds = 7
seed = 7
scoring = 'accuracy'
validation_size = 0.70

In [235]:
df_2020

Unnamed: 0,followers,following,tweet_count,seniority,verified_enc,day_phase_enc,day_of_week_enc,month_enc,topics_ids,sentiment_enc,timestamp,retweet_count,popularity,year
35,653,732,33826,6,0,1,6,4,10,0,2020-01-01 00:59:15+00:00,1,1,2020
41,26108,20584,83121,9,0,1,6,4,10,2,2020-01-01 00:59:15+00:00,0,0,2020
92,2070,2932,102563,12,0,1,6,4,10,2,2020-01-01 00:59:17+00:00,0,0,2020
103,477,737,15717,11,0,1,6,4,10,0,2020-01-01 00:59:17+00:00,0,0,2020
1753,2372,2124,148383,11,0,1,6,4,10,2,2020-01-01 00:59:22+00:00,0,0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522751,1993,3349,29050,4,0,4,4,2,10,2,2020-12-31 21:59:50+00:00,0,0,2020
68748,267,401,25757,6,0,4,4,2,10,0,2020-12-31 21:59:53+00:00,0,0,2020
737164,822,261,39834,7,0,4,4,2,10,0,2020-12-31 21:59:53+00:00,0,0,2020
505196,294,120,22251,4,0,4,4,2,10,2,2020-12-31 21:59:53+00:00,0,0,2020


### Only data from selected topic

In [236]:
topic_performance_2020 = df_2020[variables_to_keep].resample('D', on='timestamp').mean()
topic_performance_2020

Unnamed: 0_level_0,followers,following,tweet_count,seniority,verified_enc,day_phase_enc,day_of_week_enc,month_enc,topics_ids,sentiment_enc,retweet_count
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-01 00:00:00+00:00,3592.684211,1980.960526,38844.802632,8.763158,0.039474,1.486842,6.0,4.0,10.0,1.197368,9.539474
2020-01-02 00:00:00+00:00,5823.604396,1921.208791,51005.384615,8.857143,0.032967,2.010989,4.0,4.0,10.0,1.384615,0.483516
2020-01-03 00:00:00+00:00,8985.808000,1539.608000,39222.360000,8.680000,0.040000,2.016000,0.0,4.0,10.0,0.904000,0.528000
2020-01-04 00:00:00+00:00,3480.369565,2265.652174,56841.021739,9.119565,0.032609,1.652174,2.0,4.0,10.0,1.097826,0.760870
2020-01-05 00:00:00+00:00,3007.260870,2804.934783,49752.010870,9.076087,0.010870,2.086957,3.0,4.0,10.0,0.978261,0.858696
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-27 00:00:00+00:00,11939.162162,2716.594595,42720.675676,8.837838,0.040541,2.013514,3.0,2.0,10.0,1.108108,0.162162
2020-12-28 00:00:00+00:00,3527.244681,2320.234043,67314.393617,8.510638,0.042553,2.021277,1.0,2.0,10.0,1.148936,1.393617
2020-12-29 00:00:00+00:00,6804.795276,1938.133858,52752.771654,8.251969,0.062992,2.070866,5.0,2.0,10.0,1.039370,1.031496
2020-12-30 00:00:00+00:00,4109.027273,2044.700000,47877.227273,8.272727,0.045455,1.990909,6.0,2.0,10.0,1.127273,0.363636


In [237]:
topic_performance_2021 = df_2021[variables_to_keep].resample('D', on='timestamp').mean()
topic_performance_2021

Unnamed: 0_level_0,followers,following,tweet_count,seniority,verified_enc,day_phase_enc,day_of_week_enc,month_enc,topics_ids,sentiment_enc,retweet_count
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00+00:00,3395.384615,2900.615385,33079.446154,7.138462,0.046154,1.984615,0.0,4.0,10.0,1.092308,0.476923
2021-01-02 00:00:00+00:00,2043.580645,1757.666667,47857.537634,8.129032,0.043011,1.827957,2.0,4.0,10.0,1.204301,1.043011
2021-01-03 00:00:00+00:00,2076.752000,2079.456000,74253.472000,8.776000,0.016000,2.328000,3.0,4.0,10.0,1.120000,0.400000
2021-01-04 00:00:00+00:00,5803.210084,2826.512605,49427.084034,8.974790,0.033613,1.983193,1.0,4.0,10.0,1.159664,8.436975
2021-01-05 00:00:00+00:00,2514.198413,2185.119048,44411.126984,8.349206,0.015873,2.277778,5.0,4.0,10.0,1.150794,0.365079
...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27 00:00:00+00:00,2842.238095,1692.730159,90983.841270,8.539683,0.047619,2.079365,1.0,2.0,10.0,1.142857,0.253968
2021-12-28 00:00:00+00:00,3839.534247,2171.082192,55867.397260,7.150685,0.041096,2.054795,5.0,2.0,10.0,1.054795,0.561644
2021-12-29 00:00:00+00:00,3352.465753,1841.931507,38750.547945,7.698630,0.054795,2.219178,6.0,2.0,10.0,1.369863,0.602740
2021-12-30 00:00:00+00:00,2082.090909,2174.506494,62405.415584,8.584416,0.000000,2.038961,4.0,2.0,10.0,1.194805,1.389610


In [238]:
X_train = topic_performance_2020.drop('retweet_count', axis=1)
y_train = topic_performance_2020['retweet_count']
print(X_train.shape)
print(y_train.shape)

(366, 10)
(366,)


In [239]:
X_test = topic_performance_2021.drop('retweet_count', axis=1)
y_test = topic_performance_2021['retweet_count']
print(X_test.shape)
print(y_test.shape)

(365, 10)
(365,)


In [249]:
def train_time_series_with_folds(X_train, X_test, y_train, y_test):
    #create, train and do inference of the model
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    #calculate MAE
    mae = np.round(mean_absolute_error(y_test, predictions), 3)

    res = []
    for i in range(365):
        d = dict()
        d['timestamp'] = topic_performance_2021.index[i]
        d['type'] = 'real'
        d['value'] = y_test.iloc[i]
        res.append(d)

        d = dict()
        d['timestamp'] = topic_performance_2021.index[i]
        d['type'] = 'prediction'
        d['value'] = predictions[i]
        res.append(d)

    df_results = pd.DataFrame(res)

    fig = px.line(df_results, x="timestamp", y="value", color='type', title=("Predictions of topic retweet count average for 2021 with MAE:" + str(mae)),
             color_discrete_sequence=px.colors.qualitative.Safe, width=900, height=500)
    #fig.add_trace()
    fig.update_xaxes(title_text="Time")
    fig.update_yaxes(title_text="Average retweet count")
    fig.show()
    plotly.offline.plot(fig, filename='../../data/charts/Predictions of topic retweet count average for 2021 with MAE.html')


    #create a dataframe with the variable importances of the model
    df_importances = pd.DataFrame({
        'feature': model.feature_names_in_,
        'importance': model.feature_importances_
    }).sort_values(by='importance', ascending=False)

    fig = px.bar(df_importances, x="feature", y="importance", title=("Variable Importances"),
             color_discrete_sequence=px.colors.qualitative.Safe, width=900, height=500)
    fig.show()

In [250]:
train_time_series_with_folds(X_train, X_test, y_train, y_test)

In [245]:
def train_time_series_with_folds_with_standartization(X_train, X_test, y_train, y_test):

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)


    #create, train and do inference of the model
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train_scaled, y_train)

    predictions = model.predict(X_test_scaled)

    #calculate MAE
    mae = np.round(mean_absolute_error(y_test, predictions), 3)

    res = []
    for i in range(365):
        d = dict()
        d['timestamp'] = topic_performance_2021.index[i]
        d['type'] = 'real'
        d['value'] = y_test.iloc[i]
        res.append(d)

        d = dict()
        d['timestamp'] = topic_performance_2021.index[i]
        d['type'] = 'prediction'
        d['value'] = predictions[i]
        res.append(d)

    df_results = pd.DataFrame(res)

    fig = px.line(df_results, x="timestamp", y="value", color='type', title=("Predictions of topic retweet count average for 2021 with MAE:" + str(mae)),
             color_discrete_sequence=px.colors.qualitative.Safe, width=900, height=500)
    #fig.add_trace()
    fig.update_xaxes(title_text="Time")
    fig.update_yaxes(title_text="Average retweet count")
    fig.show()
    plotly.offline.plot(fig, filename='../../data/charts/Predictions of topic retweet count average for 2021 with MAE.html')


    #create a dataframe with the variable importances of the model
    df_importances = pd.DataFrame({
        'feature': model.feature_names_in_,
        'importance': model.feature_importances_
    }).sort_values(by='importance', ascending=False)

    fig = px.bar(df_importances, x="feature", y="importance", title=("Variable Importances"),
             color_discrete_sequence=px.colors.qualitative.Safe, width=900, height=500)
    fig.show()

In [251]:
train_time_series_with_folds_with_standartization(X_train, X_test, y_train, y_test)

TypeError: 'NoneType' object is not callable