In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import datetime

In [None]:
# Load the change log; the path is resolved relative to the notebook's working directory.
df_pop = pd.read_csv(filepath_or_buffer="../../../data/popular_data.csv")
df_pop

In [None]:
# Keep only the columns used by the rest of the notebook.
df_pop = df_pop.loc[
    :,
    ['infobox_key', 'property_name', 'template', 'timestamp', 'days_diff'],
]
df_pop.head(3)

In [None]:
# Parse the timestamp column into datetime64[ns].
# pd.to_datetime replaces astype("datetime64"): the unit-less dtype string is rejected
# by pandas 2.x. assign() returns a new frame, so nothing is written into the
# column-sliced frame produced by the previous cell (no SettingWithCopyWarning).
df_pop = df_pop.assign(timestamp=pd.to_datetime(df_pop["timestamp"]))

In [None]:
# Show the dtype of every column to confirm the timestamp conversion took effect.
df_pop.dtypes

In [None]:
# Restrict the data to the "caps(goals)" property of football player infoboxes.
df_pop = df_pop[
    df_pop["template"].eq("football player infobox")
    & df_pop["property_name"].eq("caps(goals)")
]
df_pop

In [None]:
# Count recorded changes per infobox and collect the keys that have at least 10.
df_pop_many = (
    df_pop.groupby('infobox_key')['property_name']
    .count()
    .rename('count')
    .reset_index()
)
pop_keys = df_pop_many.loc[df_pop_many['count'] >= 10, 'infobox_key'].tolist()

In [None]:
# Keep only rows of the frequently changed infoboxes, then show per-key row counts.
df_pop = df_pop.loc[df_pop['infobox_key'].isin(pop_keys)]
df_pop.groupby("infobox_key").count()

In [None]:
# Derive calendar and change-history features for every recorded change.
# NOTE(review): the shift()-based features assume rows are already in chronological
# order within each (infobox_key, property_name) group - confirm against the CSV.
CHANGE_GROUP_COLS = ['infobox_key', 'property_name']
SECONDS_PER_DAY = 3600 * 24


def days_since_nth_previous_change(frame, n):
    """Return whole days between each change and the n-th change before it.

    Rows without an n-th predecessor in their group, and any negative gap,
    are encoded as the sentinel -1.
    """
    previous = frame.groupby(CHANGE_GROUP_COLS)['timestamp'].shift(n)
    elapsed_days = (
        (frame['timestamp'] - previous).dt.total_seconds().div(SECONDS_PER_DAY).round()
    )
    # NaN (no predecessor) compares False against 0, so it also becomes -1.
    return elapsed_days.where(elapsed_days >= 0, -1).astype('int64')


def expanding_mean_after_first(gaps):
    """Keep a group's first gap as-is and replace the rest with their running mean."""
    # pd.concat replaces Series.append, which no longer exists in pandas 2.x.
    return pd.concat([gaps.iloc[:1], gaps.iloc[1:].expanding().mean()])


# assign() returns a fresh frame, so the column writes below never touch the
# filtered slice produced by the earlier cells (no SettingWithCopyWarning).
change_time = df_pop['timestamp'].dt
df_pop = df_pop.assign(
    day_of_year=change_time.dayofyear,
    day_of_month=change_time.day,
    day_of_week=change_time.dayofweek,
    month_of_year=change_time.month,
    quarter_of_year=change_time.quarter,
    is_quarter_start=change_time.is_quarter_start,
    is_quarter_end=change_time.is_quarter_end,
    is_month_start=change_time.is_month_start,
    is_month_end=change_time.is_month_end,
)

# Gap in days to the previous 1, 2 and 3 changes of the same property.
for n_back, lag_column in [
    (1, 'days_since_last_change'),
    (2, 'days_since_last_2_changes'),
    (3, 'days_since_last_3_changes'),
]:
    df_pop[lag_column] = days_since_nth_previous_change(df_pop, n_back)

gaps_by_property = df_pop.groupby(CHANGE_GROUP_COLS)['days_since_last_change']

# The next row's gap is this row's time until the next change; -1 marks "no next change".
df_pop['days_until_next_change'] = pd.to_numeric(
    gaps_by_property.shift(-1).fillna(-1), downcast="integer"
)
# The previous row's gap; -1 marks "no earlier change".
df_pop['days_between_last_and_2nd_to_last_change'] = pd.to_numeric(
    gaps_by_property.shift(+1).fillna(-1), downcast="integer"
)
# transform() keeps the original row index, so the result aligns with df_pop
# regardless of how groupby.apply treats group keys in the installed pandas.
df_pop['mean_change_frequency'] = gaps_by_property.transform(expanding_mean_after_first)
df_pop

In [None]:
# Preview the first 20 rows, including the engineered feature columns.
df_pop.head(20)

In [None]:
# used to slice last 2 entries per group
def groupby_slice(_grp, start=0, stop=None, step=1):
    '''
    Applies a positional slice to every group of a GroupBy object.

    Parameters
    ----------
    _grp : pandas GroupBy object to slice (iterated as (key, frame) pairs).
    start, stop, step : slice bounds applied to each group with ``.iloc``;
        negative values count from the end of the group.

    Returns
    -------
    The sliced groups concatenated in group iteration order, with a fresh
    0..n-1 index. Grouping columns are kept in the result.
    '''
    # Iterating the groups (instead of GroupBy.apply) keeps the grouping columns
    # in the output on every pandas version and avoids the apply deprecation warning.
    pieces = [_df.iloc[start:stop:step] for _, _df in _grp]
    if not pieces:
        # No groups at all: return an empty frame with the original columns.
        return _grp.obj.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces).reset_index(drop=True)


# Original double-underscore name, kept so existing calls keep working.
__groupby_slice = groupby_slice

In [None]:
# Training rows: every change of an infobox except its last two.
train_set = __groupby_slice(df_pop.groupby('infobox_key'), start=0, stop=-2)
train_set.shape

In [None]:
# Held-out rows: the last two changes of every infobox.
test_set = __groupby_slice(df_pop.groupby('infobox_key'), start=-2)
test_set.shape

In [None]:
# Peek at the first three training rows.
train_set.head(3)

In [None]:
# Scratch checks on datetime handling.
# type(train_set.iloc[0]["timestamp"])
dt=datetime.datetime.strptime("2007-01-22 19:17:53","%Y-%m-%d %H:%M:%S")
dt.weekday()
# dt.day
# dt.month
# dt.month%3
# pd.to_datetime instead of astype("datetime64"): the unit-less dtype string is
# rejected by pandas 2.x.
pd.to_datetime(train_set["timestamp"])

In [None]:
from math import ceil

def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    Weeks start on Monday: the day of the month is offset by the weekday of
    the month's first day (Monday=0) and the result is rounded up to whole weeks.
    """
    leading_days = dt.replace(day=1).weekday()
    full_weeks, leftover = divmod(dt.day + leading_days, 7)
    # A partially filled week still counts as a week.
    return full_weeks + 1 if leftover else full_weeks

# Sanity check: 2021-08-01 is a Sunday, so Monday 2021-08-02 falls in week 2.
dt=datetime.datetime.strptime("2021-08-02 9:17:53","%Y-%m-%d %H:%M:%S")
week_of_month(dt)

## Training

In [None]:
# Build the feature matrices: one-hot encoded infobox key plus two numeric features.
# The engineered columns are called 'days_since_last_change' and
# 'days_until_next_change'; the previous lookups of 'time_from_prev_change' /
# 'time_til_next_change' raised KeyError because no such columns exist.
# The feature label 'time_from_prev_change' is kept unchanged.
X_pop_train = pd.get_dummies(train_set["infobox_key"])
X_pop_train["time_from_prev_change"] = train_set['days_since_last_change']
X_pop_train['day_of_year'] = train_set['day_of_year']

# Target: days until the property changes again (-1 where no later change is recorded).
y_pop_train = train_set['days_until_next_change'].copy()


X_pop_test = pd.get_dummies(test_set["infobox_key"])
X_pop_test["time_from_prev_change"] = test_set['days_since_last_change']
X_pop_test['day_of_year'] = test_set['day_of_year']

y_pop_test = test_set['days_until_next_change'].copy()

# The two one-hot encodings are computed independently, so verify they agree.
assert X_pop_train.columns.equals(X_pop_test.columns), "train/test feature columns differ"


In [None]:
# Display the training feature matrix.
X_pop_train

In [None]:
# Rebuild the feature matrices with the two numeric features only (no key dummies).
# The engineered columns are called 'days_since_last_change' and
# 'days_until_next_change'; the previous lookups of 'time_from_prev_change' /
# 'time_til_next_change' raised KeyError because no such columns exist.
# The feature label 'time_from_prev_change' is kept unchanged.
X_pop_train = train_set[['days_since_last_change', 'day_of_year']].rename(
    columns={'days_since_last_change': 'time_from_prev_change'}
)

# Target: days until the property changes again (-1 where no later change is recorded).
y_pop_train = train_set['days_until_next_change'].copy()


X_pop_test = test_set[['days_since_last_change', 'day_of_year']].rename(
    columns={'days_since_last_change': 'time_from_prev_change'}
)

y_pop_test = test_set['days_until_next_change'].copy()

In [None]:
# Fit a squared-error XGBoost regressor; the held-out set is the early-stopping monitor.
# NOTE(review): missing=0 presumably makes XGBoost treat 0-valued features as missing,
# which would include genuine 0-day gaps - confirm this is intended.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the constructor
# rather than on fit() - confirm against the installed version.
clf_xgb_pop = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=10,
    objective='reg:squarederror',
    missing=0,
    seed=42,
)
clf_xgb_pop.fit(
    X_pop_train,
    y_pop_train,
    eval_set=[(X_pop_test, y_pop_test)],
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Render one tree of the fitted model as a Graphviz graph.
# NOTE(review): num_trees=0 presumably selects the first boosted tree - confirm in the xgboost docs.
xgb.to_graphviz(clf_xgb_pop, num_trees=0, size='10,10')