In [12]:
import os, bz2, json, time
from datetime import timedelta
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.optimize import curve_fit

import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb

from math import ceil

In [2]:
if not os.path.exists('active-dataset.p'):
    print('>>> Converting ACTIVE dataset from JSON format to pickle... might take a while!')
    active_videos = {}
    with bz2.BZ2File('active-dataset.json.bz2') as f:
        dataset = json.loads(f.readline())
    pickle.dump(dataset, open('active-dataset.p', 'wb'))
    
print('>>> Loading the ACTIVE dataset from pickle...')
active_videos = pickle.load(open('active-dataset.p', 'rb'))
df = pd.DataFrame(active_videos)

>>> Loading the ACTIVE dataset from pickle...


In [3]:
df.columns

Index(['YoutubeID', 'numTweet', 'numShare', 'numSubscriber', 'watchTime',
       'dailyViewcount', 'description', 'title', 'channelId', 'channelTitle',
       'category', 'uploadDate', 'duration', 'definition', 'dimension',
       'caption', 'regionRestriction.blocked', 'regionRestriction.allowed',
       'topicIds', 'relevantTopicIds', 'totalShare', 'totalViewcount',
       'totalTweet', 'dailyTweets'],
      dtype='object')

In [4]:
df.shape

(14041, 24)

In [5]:
df.head()

Unnamed: 0,YoutubeID,numTweet,numShare,numSubscriber,watchTime,dailyViewcount,description,title,channelId,channelTitle,...,dimension,caption,regionRestriction.blocked,regionRestriction.allowed,topicIds,relevantTopicIds,totalShare,totalViewcount,totalTweet,dailyTweets
0,00-6OyXVA0M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[254, 1399, 493, 242, 175, 216, 372, 729, 305,...","[4, 19, 19, 7, 2, 5, 8, 17, 6, 3, 2, 1, 3, 1, ...","[59311.9833333, 455426.75, 206253.816667, 1119...","[90158, 695628, 312298, 170348, 82346, 61530, ...",Bill's response to Victoria Osteen's congregat...,Bill Cosby responds to Victoria Osteen,UCpzGw-b-rEPjRs2LT0ZVHTQ,Bryant Harris,...,2d,False,{},{},"[/m/03bxbhz, /m/03ny9x, /m/014zfs]",[/m/06bvp],5206,2174286,3857,"[2588, 455, 157, 100, 86, 76, 102, 64, 28, 23,..."
1,00ATf2HR-FA,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[12, 47, 14, 32, 32, 12, 5, 4, 3, 4, 1, 0, 3, ...","[2, 2, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2937.13333333, 9039.33333333, 5082.71666667, ...","[14939, 40664, 21261, 30914, 31073, 20395, 117...",Adorable Cute South Korean Dog Gets Neck Massa...,Adorable Cute South Korean Dog Gets Neck Massage,UCT7HvytJiHm4BITiIQU1zwg,CloudSheep,...,2d,False,{},{},"[/m/0b_nd, /m/0bt9lr]",[/m/04h3kx],201,227149,189,"[75, 25, 18, 18, 15, 5, 10, 2, 0, 1, 2, 0, 1, ..."
2,00bumpN0Mhw,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[583, 316, 178, 107, 92, 84, 135, 122, 133, 90...","[250, 146, 94, 51, 55, 54, 79, 63, 66, 51, 35,...","[235606.866667, 177160.183333, 133966.883333, ...","[89627, 63829, 46999, 42951, 42902, 36271, 416...","Download ""The Pressure"" on iTunes: http://smar...",Jhené Aiko - The Pressure (Explicit),UCkiXOKnkDO4qz3njQoz86LA,JheneAikoVEVO,...,2d,False,[DE],{},"[/m/011rvmhv, /m/0nhwg69]","[/m/011ccd35, /m/04rlf]",5878,2868185,4533,"[507, 227, 119, 88, 68, 77, 89, 74, 59, 41, 41..."
3,00f0ct78HcU,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[38, 11, 4, 4, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0...","[386, 194, 59, 16, 12, 5, 4, 2, 9, 5, 8, 9, 8,...","[119054.283333, 68698.1166667, 34243.9166667, ...","[91431, 52484, 26459, 14774, 7700, 5314, 4872,...",DON'T WANT TO MISS A PRANK? Subscribe here! \n...,Tank Explosion Accident!,UCuAcfy-GKYd1SPj8iEtA3Zw,Dudesons,...,2d,False,{},{},[/m/04cb12],"[/m/0bqb0z, /m/014zdl, /m/07cmd]",73,444387,192,"[39, 20, 5, 3, 4, 1, 2, 1, 2, 3, 2, 1, 0, 2, 0..."
4,00g2ZbI3ung,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[18, 3, 1, 1, 5, 5, 3, 2, 1, 1, 1, 0, 2, 1, 0,...","[63, 20, 5, 15, 19, 16, 18, 17, 12, 8, 12, 15,...","[11655.7333333, 3733.05, 2094.01666667, 1368.9...","[15512, 5027, 2691, 1861, 2563, 3891, 2751, 22...",These miracles will make you a god damn sexual...,Dark Souls II: Don't use hexes PSA,UCwPnX0XSfxswF0rL78NzNIw,veselekov,...,2d,False,{},{},"[/m/0p8xwrr, /m/0dgs3gt]","[/m/06zm8z, /m/02dpv4]",123,237425,186,"[60, 3, 1, 0, 1, 2, 4, 2, 2, 2, 0, 0, 1, 1, 0,..."


In [6]:
df = df[['YoutubeID', 'watchTime', 'dailyViewcount', 'description', 
        'title', 'channelId', 'channelTitle', 'category', 'uploadDate', 
        'duration', 'definition', 'dimension', 'caption', 'regionRestriction.blocked', 
        'regionRestriction.allowed', 'topicIds', 'relevantTopicIds', 'totalViewcount']]
df.head()

Unnamed: 0,YoutubeID,watchTime,dailyViewcount,description,title,channelId,channelTitle,category,uploadDate,duration,definition,dimension,caption,regionRestriction.blocked,regionRestriction.allowed,topicIds,relevantTopicIds,totalViewcount
0,00-6OyXVA0M,"[59311.9833333, 455426.75, 206253.816667, 1119...","[90158, 695628, 312298, 170348, 82346, 61530, ...",Bill's response to Victoria Osteen's congregat...,Bill Cosby responds to Victoria Osteen,UCpzGw-b-rEPjRs2LT0ZVHTQ,Bryant Harris,Entertainment,2014-08-29 06:19:14,PT39S,sd,2d,False,{},{},"[/m/03bxbhz, /m/03ny9x, /m/014zfs]",[/m/06bvp],2174286
1,00ATf2HR-FA,"[2937.13333333, 9039.33333333, 5082.71666667, ...","[14939, 40664, 21261, 30914, 31073, 20395, 117...",Adorable Cute South Korean Dog Gets Neck Massa...,Adorable Cute South Korean Dog Gets Neck Massage,UCT7HvytJiHm4BITiIQU1zwg,CloudSheep,Pets & Animals,2014-08-29 07:24:00,PT15S,sd,2d,False,{},{},"[/m/0b_nd, /m/0bt9lr]",[/m/04h3kx],227149
2,00bumpN0Mhw,"[235606.866667, 177160.183333, 133966.883333, ...","[89627, 63829, 46999, 42951, 42902, 36271, 416...","Download ""The Pressure"" on iTunes: http://smar...",Jhené Aiko - The Pressure (Explicit),UCkiXOKnkDO4qz3njQoz86LA,JheneAikoVEVO,Music,2014-09-03 02:00:05,PT4M22S,hd,2d,False,[DE],{},"[/m/011rvmhv, /m/0nhwg69]","[/m/011ccd35, /m/04rlf]",2868185
3,00f0ct78HcU,"[119054.283333, 68698.1166667, 34243.9166667, ...","[91431, 52484, 26459, 14774, 7700, 5314, 4872,...",DON'T WANT TO MISS A PRANK? Subscribe here! \n...,Tank Explosion Accident!,UCuAcfy-GKYd1SPj8iEtA3Zw,Dudesons,Entertainment,2014-06-14 03:00:04,PT1M38S,sd,2d,False,{},{},[/m/04cb12],"[/m/0bqb0z, /m/014zdl, /m/07cmd]",444387
4,00g2ZbI3ung,"[11655.7333333, 3733.05, 2094.01666667, 1368.9...","[15512, 5027, 2691, 1861, 2563, 3891, 2751, 22...",These miracles will make you a god damn sexual...,Dark Souls II: Don't use hexes PSA,UCwPnX0XSfxswF0rL78NzNIw,veselekov,Film & Animation,2014-06-14 11:32:27,PT53S,hd,2d,False,{},{},"[/m/0p8xwrr, /m/0dgs3gt]","[/m/06zm8z, /m/02dpv4]",237425


In [7]:
min_days = min(df['watchTime'].apply(lambda x: len(x)))
days = [f'day_{i}' for i in range(1, min_days + 1)]
timeline = pd.DataFrame(index=df['YoutubeID'], columns=days)

In [8]:
for i in tqdm(range(min_days)):
    timeline[f'day_{i + 1}'] = df['dailyViewcount'].str[i].values.astype('int64')

100%|████████████████████████████████████████████████████████████████████████████████| 119/119 [00:05<00:00, 22.48it/s]


In [9]:
timeline.head()

Unnamed: 0_level_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,...,day_110,day_111,day_112,day_113,day_114,day_115,day_116,day_117,day_118,day_119
YoutubeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00-6OyXVA0M,90158,695628,312298,170348,82346,61530,76116,229360,109262,40695,...,493,452,452,423,1056,1407,1153,1081,664,576
00ATf2HR-FA,14939,40664,21261,30914,31073,20395,11713,7690,5526,6259,...,107,476,208,140,102,135,150,120,94,68
00bumpN0Mhw,89627,63829,46999,42951,42902,36271,41669,44901,37811,39228,...,14816,14153,14232,14372,12938,10972,11921,12811,12158,12724
00f0ct78HcU,91431,52484,26459,14774,7700,5314,4872,4604,4065,3843,...,512,578,601,842,1042,951,651,678,605,579
00g2ZbI3ung,15512,5027,2691,1861,2563,3891,2751,2234,2233,2034,...,540,667,693,836,874,937,983,1015,853,860


In [19]:
pred_val = 119
drops = [f'day_{i}' for i in range(pred_val, 120)]
y = timeline[drops[0]]
X = timeline.drop(columns=drops)

In [20]:
def approx(x : np.array, a : float, b : float, c : float, d: float) -> np.array:
    """
    Counts approximate trend line
    """
#     return a / np.exp(x)
    return a / (c + x ** 2)


season = []
trend = []
cnt = 0

for row in tqdm(X.iterrows()):   
    value = X.loc[row[0]].values
#     plt.plot(X.loc[row[0]].values)
#     plt.title('Before decompose')
#     plt.show()
#     serie = seasonal_decompose(value, model='additive', period=30)
    x_line = np.arange(1, pred_val)
    params, trash = curve_fit(approx, x_line, value, maxfev=100000)
    y_line = approx(x_line, *params)
#     season.append(serie.seasonal)
#     trend.append(params)       
#     print(serie.trend)
    X.loc[row[0]] = value - y_line
#     print(y_line)
#     plt.plot(value - y_line)
#     plt.title('After decompose')
#     plt.show()

14041it [01:15, 185.82it/s]


In [21]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(data=scaler.fit_transform(X), index=X.index, columns=X.columns)

In [15]:
def _model(X: pd.DataFrame, y: pd.Series):
    batch = ceil(X.shape[0] / 3)
    pred = []
    test = []
    MAE = []
    R2 = []

    models_sap = []
    cnt = 0
    for i in tqdm(range(1, 4)):
        X_test = X.reset_index(drop=True)[batch * (i - 1) : batch * i]
        if X_test.shape[0] == 0:
            continue
        y_test = y.reset_index(drop=True)[X_test.index]
        X_valid = X.iloc[np.random.choice(X.reset_index(drop=True).drop(index=X_test.index.tolist()).index, 
                                                 batch, replace=False)].reset_index(drop=True)
        y_valid = y.reset_index(drop=True)[X_valid.index]
        X_train = X.reset_index(drop=True).drop(index=[*X_test.index.to_list(), *X_valid.index.to_list()])
        y_train = y.reset_index(drop=True)[X_train.index]


        d = [3, 5]
        alpha = np.logspace(-0.3, -0.001, 5, dtype='float')
        n = [70, 100]
        searcher = GridSearchCV(GradientBoostingRegressor(init=Ridge(), max_features='sqrt', loss='huber', random_state=21), 
                                [{"n_estimators": n, "max_depth" : d, "alpha" : alpha}], 
                                scoring="neg_mean_absolute_error", cv=4)
        searcher.fit(X_valid, y_valid)
        best_n = searcher.best_params_["n_estimators"]
        best_d = searcher.best_params_["max_depth"]
        best_a = searcher.best_params_["alpha"]
        print("Best n = %.8f" % best_n)
        print("Best d = %.8f" % best_d)
        print("Best a = %.8f" % best_a)

        model = GradientBoostingRegressor(warm_start=False, alpha=best_a, max_depth=best_d, n_estimators=best_n,
                                          init=Ridge(), max_features='sqrt', random_state=21)

        model.fit(X_train, y_train)
        models_sap.append(model)

        model = models_sap[cnt]
        cnt += 1
        y_test = y_test.values
        y_pred = model.predict(X_test)

        pred.append(y_pred)
        test.append(y_test)
        print('MAE:', mean_absolute_error(y_pred, y_test))
        print('R2:', r2_score(y_pred, y_test))
        MAE.append(mean_absolute_error(y_pred, y_test))
        R2.append(r2_score(y_pred, y_test))
#         plt.figure(figsize=(30, 6))
#         plt.title('Batch #' + str(i))
#         plt.plot(y_pred, label='pred')
#         plt.plot(y_test, label='test')
#         plt.xticks(X_test.index, X_scaled.iloc[X_test.index].index, rotation=90)
#         plt.legend()
#         plt.show()

    print('Mean MAE:', np.mean(MAE))
    print('Mean R2:', np.mean(R2))
    
    return models_sap, pred

In [22]:
models, pred = _model(X_scaled, y)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Best n = 100.00000000
Best d = 3.00000000
Best a = 0.50118723


 33%|███████████████████████████▋                                                       | 1/3 [04:22<08:45, 262.87s/it]

MAE: 543.8890759862571
R2: 0.9840584435685025
Best n = 100.00000000
Best d = 3.00000000
Best a = 0.59531935


 67%|███████████████████████████████████████████████████████▎                           | 2/3 [08:33<04:15, 255.68s/it]

MAE: 637.9835533586139
R2: 0.9903776394363103
Best n = 100.00000000
Best d = 3.00000000
Best a = 0.50118723


100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [12:44<00:00, 254.90s/it]

MAE: 457.4132802439795
R2: 0.9576546077788531
Mean MAE: 546.4286365296168
Mean R2: 0.9773635635945553



