In [73]:
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm

In [7]:
# Lookup table with "article" and "code" columns; used to turn an article
# title into the code stored in the "article" feature.
df_article_codes = pd.read_csv("article_codes.csv")

# Available data categories (presumably one sub-folder of ../Data each - TODO confirm).
CATEGORIES = ["Games", "Movies", "Music", "Literary"]

# Model inputs, in column order: calendar fields, running mean of views,
# the seven previous-day lags "t-1" ... "t-7", and the article code.
FEATURES = [
    "dayofweek", "month", "quarter", "dayofyear", "year",
    "avg",
    *(f"t-{lag}" for lag in range(1, 8)),
    "article",
]

# Column the models are trained to predict.
TARGET = "views"

## Metric: symmetric mean absolute percentage error (SMAPE)

In [49]:
def smape(target, forecast):
    """Symmetric mean absolute percentage error on a 0-2 scale.

    Computes ``2 * mean(|target - forecast| / (|target| + |forecast|))``.
    Positions where both values are zero contribute 0 instead of 0/0.

    Parameters
    ----------
    target, forecast : array-like
        Actual and predicted values (list, ndarray, Series or DataFrame).
        Values are compared by position, so pandas indexes are ignored.
        If the two inputs hold the same number of elements but have different
        shapes (e.g. a single-column DataFrame against a 1-D array) both are
        flattened so they are not broadcast against each other.

    Returns
    -------
    float
        The SMAPE score; ``nan`` for empty input (NumPy also emits a warning).
    """
    target = np.asarray(target)
    forecast = np.asarray(forecast)

    # (n, 1) against (n,) would otherwise broadcast to an (n, n) matrix and
    # silently score every target against every forecast.
    if target.shape != forecast.shape and target.size == forecast.size:
        target = target.ravel()
        forecast = forecast.ravel()

    denominator = np.abs(target) + np.abs(forecast)
    both_zero = denominator == 0.

    # Where both values are zero the numerator is zeroed and the denominator
    # bumped to 1, so that term evaluates to exactly 0.
    ratios = (np.abs(target - forecast) * (1 - both_zero)) / (denominator + both_zero)
    return 2 * np.mean(ratios)

def smape2(target, forecast):
    """Wrap `smape` as a ``(name, value, flag)`` metric tuple.

    Both inputs are converted to NumPy arrays before scoring. The tuple is
    ``("smape", score, False)``; this function is passed to LightGBM as
    ``eval_metric``, where the third element is the "higher is better" flag.
    """
    y_true = np.array(target)
    y_pred = np.array(forecast)
    score = smape(y_true, y_pred)
    return "smape", score, False

## Create Data

In [43]:
def get_fileNames(category, data_dir="../Data"):
    """Return the names (without ``.csv``) of the CSV files of one category.

    Parameters
    ----------
    category : str
        Sub-folder of `data_dir` to scan.
    data_dir : str, optional
        Root folder holding one sub-folder per category. Defaults to the
        notebook's ``../Data`` layout.

    Returns
    -------
    list of str
        File names with the trailing ``.csv`` removed, sorted alphabetically.
        ``os.listdir`` gives no ordering guarantee, so sorting keeps the order
        in which articles are later processed reproducible across runs.

    Raises
    ------
    OSError
        If the category folder does not exist or cannot be read.
    """
    suffix = ".csv"
    folder = os.path.join(data_dir, category)
    return sorted(
        entry[:-len(suffix)]
        for entry in os.listdir(folder)
        if entry.endswith(suffix)
    )

def make_data(category, article):
    """Load one article's CSV and return it indexed by timestamp.

    Reads ``../Data/<category>/<article>.csv``, replaces the textual article
    name by its code from the module-level ``df_article_codes`` table (the
    name of the first row, with underscores turned into spaces, is looked up
    in the table's "article" column), drops the metadata columns, converts
    the "timestamp" column into a datetime index and removes rows that
    contain missing values.

    Raises ``IndexError`` if the article has no entry in ``df_article_codes``.
    """
    raw = pd.read_csv(f"../Data/{category}/{article}.csv")

    # The code table stores titles with spaces, the CSV with underscores.
    title = raw["article"][0].replace("_", " ")
    matching_codes = df_article_codes.loc[df_article_codes["article"] == title, "code"]
    article_code = list(matching_codes)[0]

    frame = raw.drop(columns=["project", "granularity", "access", "agent", "article"])
    frame["article"] = article_code

    # NOTE(review): pd.to_datetime infers the format; if the CSV stores
    # timestamps as integers they would be read as epoch offsets - confirm.
    frame = frame.set_index("timestamp")
    frame.index = pd.to_datetime(frame.index)

    return frame.dropna()

In [10]:
def create_features(df):
    """Add calendar, running-mean and lag features to a views time series.

    Parameters
    ----------
    df : pandas.DataFrame or tuple
        Frame with a DatetimeIndex and a "views" column. If a tuple is
        passed, its first element is used.

    Returns
    -------
    pandas.DataFrame
        A copy of the input (the original is not modified) with the added
        columns "dayofweek", "month", "quarter", "dayofyear", "year", "avg"
        and "t-1" ... "t-7".
    """
    if isinstance(df, tuple):
        df = df[0]
    df = df.copy()

    stamps = df.index
    df["dayofweek"] = stamps.dayofweek
    df["month"] = stamps.month
    df["quarter"] = stamps.quarter
    df["dayofyear"] = stamps.dayofyear
    df["year"] = stamps.year

    # Running mean of all views up to and including the current row. The
    # expanding window replaces a Python loop that re-averaged the whole
    # prefix for every row (quadratic in the series length).
    # NOTE(review): the current row's own "views" value - the prediction
    # target - is part of its "avg"; confirm this is intended.
    df["avg"] = df["views"].expanding(min_periods=1).mean()

    # Lag features: views of the previous 1..7 rows. The first `lag` rows
    # have no predecessor and stay NaN (ffill cannot fill leading gaps).
    for lag in range(1, 8):
        df[f"t-{lag}"] = df["views"].shift(lag).ffill()
    return df

In [44]:
def pre_process(from_ts, to_ts, category="Music", eval_days=14, test_size=0.2):
    """Build train/test matrices and per-article hold-out frames.

    For every article of `category` the series is restricted to
    ``[from_ts, to_ts]``, features are added, the last `eval_days` rows are
    set aside for evaluation and the remainder is split chronologically
    (no shuffling) into train and test parts.

    Parameters
    ----------
    from_ts, to_ts
        Inclusive bounds compared against the datetime index.
    category : str, optional
        Category folder to load. Defaults to "Music".
    eval_days : int, optional
        Number of trailing rows per article kept as hold-out. Defaults to 14.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y, evals)`` where the first four
        are the concatenation over all articles and `evals` is a list with
        one hold-out DataFrame per article.

    Raises
    ------
    ValueError
        If an article has no rows left for training after removing the
        hold-out window, or if the category contains no articles.
    """
    evals = []
    trains = []
    tests = []
    for file in get_fileNames(category):
        df = make_data(category, file)
        df = df[(df.index >= from_ts) & (df.index <= to_ts)]

        # With too few rows the hold-out would swallow the whole series; a
        # negative head() count would even hand hold-out rows to training.
        if len(df) <= eval_days:
            raise ValueError(
                f"Article {file!r} has only {len(df)} rows between {from_ts} "
                f"and {to_ts}; more than {eval_days} are required."
            )

        df = create_features(df)
        # Copy so the hold-out frame is independent of the training slice.
        evals.append(df.tail(eval_days).copy())
        df = df.iloc[:len(df) - eval_days]
        train_tmp, test_tmp = train_test_split(df, test_size=test_size, shuffle=False)
        trains.append(train_tmp)
        tests.append(test_tmp)

    train = pd.concat(trains)
    test = pd.concat(tests)

    train_X = train[FEATURES]
    train_y = train[TARGET]

    test_X = test[FEATURES]
    test_y = test[TARGET]

    return train_X, train_y, test_X, test_y, evals

## XGBoost

In [54]:
%%time

def train_xg(train_X, train_y, test_X, test_y):
    """Fit an XGBoost regressor with fixed hyper-parameters and return it.

    Training and test data are both passed as evaluation sets, the custom
    `smape` function is the evaluation metric (the default metric is
    disabled) and early stopping is configured with 50 rounds. Progress is
    logged every 1000 rounds.
    """
    hyper_params = dict(
        n_estimators=10000,
        learning_rate=0.0002977087190650842,
        max_depth=21,
        min_child_weight=37,
        subsample=0.9317347974695944,
        colsample_bytree=0.9148495296506195,
        alpha=6.59915736568054,
        tree_method='gpu_hist',
        eval_metric=smape,
        disable_default_eval_metric=True,
        early_stopping_rounds=50,
    )
    # "lambda" is a Python keyword, so it cannot be written as a dict() kwarg.
    hyper_params['lambda'] = 8.039526726153133

    regressor = xgb.XGBRegressor(**hyper_params)
    watchlist = [(train_X, train_y), (test_X, test_y)]
    regressor.fit(train_X, train_y, eval_set=watchlist, verbose=1000)
    return regressor

CPU times: total: 0 ns
Wall time: 0 ns


In [67]:
def eval_xg(evals, xgmodel):
    """Score recursive multi-step forecasts of `xgmodel` on hold-out frames.

    For each frame the rows are predicted one at a time, in order. Before
    forecasting, every lag and running-mean value that refers to a row
    inside the frame is blanked out; after each prediction the value is
    written into the lag columns ("t-1" ... "t-7") and the running mean of
    the rows that follow, so later steps only see the model's own forecasts.

    Parameters
    ----------
    evals : list of pandas.DataFrame
        Non-empty hold-out frames containing "views", "avg", "t-1" ... "t-7"
        and every column listed in ``FEATURES``. The frames are not modified.
    xgmodel
        Fitted model exposing ``predict``.

    Returns
    -------
    list
        One SMAPE value per frame (forecasts against the "views" column).
    """
    smapes = []
    for held_out in evals:
        # Copying the list alone would still share the DataFrames with the
        # caller, so each frame is copied before it is overwritten.
        frame = held_out.copy()
        n_rows = len(frame)
        avg_col = frame.columns.get_loc("avg")
        lag_cols = {lag: frame.columns.get_loc(f"t-{lag}") for lag in range(1, 8)}

        # Positional .iloc writes: chained `frame[col][i:] = ...` assignment
        # may act on a temporary and does nothing under pandas Copy-on-Write.
        for lag, col in lag_cols.items():
            frame.iloc[lag:, col] = np.nan
        frame.iloc[1:, avg_col] = np.nan

        # The running mean is seeded with the first row's "avg" repeated
        # 5 * n_rows times, so the historical mean outweighs the forecasts.
        # NOTE(review): the factor 5 is a magic number - confirm its origin.
        avgs = [frame.iloc[0, avg_col]] * (n_rows * 5)

        predictions = []
        for step in range(n_rows):
            row = frame[FEATURES].iloc[[step]]
            pred = xgmodel.predict(row)[0]
            predictions.append(pred)
            avgs.append(pred)

            if step + 1 < n_rows:
                frame.iloc[step + 1, avg_col] = sum(avgs) / len(avgs)
            # This forecast is the "t-lag" value of the row `lag` steps ahead.
            for lag, col in lag_cols.items():
                if step + lag < n_rows:
                    frame.iloc[step + lag, col] = pred

        smapes.append(smape(frame["views"].to_numpy(), np.asarray(predictions)))
    return smapes

## LightGBM

In [56]:
%%time

def train_lgbm(train_X, train_y, test_X, test_y):
    """Fit a LightGBM regressor with fixed hyper-parameters and return it.

    The test data is the only evaluation set, `smape2` is supplied as the
    custom evaluation metric, early stopping is set to 50 rounds and progress
    is logged every 1000 rounds.
    """
    model_kwargs = dict(
        n_estimators=5000,
        learning_rate=0.007254936193270214,
        max_depth=193,
        num_leaves=539,
        min_child_samples=1,
        subsample=0.4211516816907636,
        colsample_bytree=0.9143917935753674,
        reg_alpha=6.961567092065252,
        reg_lambda=8.722242210925048,
        first_metric_only=True,
        metric="smape",
    )
    regressor = lightgbm.LGBMRegressor(**model_kwargs)

    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords
    # are only accepted by older LightGBM releases; newer ones expect
    # callbacks instead - confirm against the installed version.
    regressor.fit(
        train_X,
        train_y,
        eval_set=[(test_X, test_y)],
        eval_metric=smape2,
        early_stopping_rounds=50,
        verbose=1000,
    )
    return regressor

CPU times: total: 0 ns
Wall time: 0 ns


In [51]:
def eval_lgbm(evals, model):
    """Score recursive multi-step forecasts of `model` on hold-out frames.

    For each frame the rows are predicted one at a time, in order. Before
    forecasting, every lag and running-mean value that refers to a row
    inside the frame is blanked out; after each prediction the value is
    written into the lag columns ("t-1" ... "t-7") and the running mean of
    the rows that follow, so later steps only see the model's own forecasts.

    Parameters
    ----------
    evals : list of pandas.DataFrame
        Non-empty hold-out frames containing "views", "avg", "t-1" ... "t-7"
        and every column listed in ``FEATURES``. The frames are not modified.
    model
        Fitted model exposing ``predict``.

    Returns
    -------
    list
        One SMAPE value per frame (forecasts against the "views" column).
    """
    smapes = []
    for held_out in evals:
        # Copying the list alone would still share the DataFrames with the
        # caller, so each frame is copied before it is overwritten.
        frame = held_out.copy()
        n_rows = len(frame)
        avg_col = frame.columns.get_loc("avg")
        lag_cols = {lag: frame.columns.get_loc(f"t-{lag}") for lag in range(1, 8)}

        # Positional .iloc writes: chained `frame[col][i:] = ...` assignment
        # may act on a temporary and does nothing under pandas Copy-on-Write.
        # All lag columns are blanked so no true in-window value can be read.
        for lag, col in lag_cols.items():
            frame.iloc[lag:, col] = np.nan
        frame.iloc[1:, avg_col] = np.nan

        # The running mean is seeded with the first row's "avg" repeated
        # 5 * n_rows times, so the historical mean outweighs the forecasts.
        # NOTE(review): the factor 5 is a magic number - confirm its origin.
        avgs = [frame.iloc[0, avg_col]] * (n_rows * 5)

        predictions = []
        for step in range(n_rows):
            row = frame[FEATURES].iloc[[step]]
            pred = model.predict(row)[0]
            predictions.append(pred)
            avgs.append(pred)

            if step + 1 < n_rows:
                frame.iloc[step + 1, avg_col] = sum(avgs) / len(avgs)
            # This forecast is the "t-lag" value of the row `lag` steps ahead.
            for lag, col in lag_cols.items():
                if step + lag < n_rows:
                    frame.iloc[step + lag, col] = pred

        smapes.append(smape(frame["views"].to_numpy(), np.asarray(predictions)))
    return smapes