# Prediction using XGBoost

In [1]:
import xgboost

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

In [2]:
# define model
model = xgboost.XGBRegressor()

In [25]:
data = {'time': [1,2,3,4,5],
       'measure': [3,6,9,12,15]}

In [3]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
	return data[:-n_test, :], data[-n_test:, :]

In [4]:
# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	#n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols = list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
	# put it all together
	agg = pd.concat(cols, axis=1)
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg.values

In [5]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
	predictions = list()
	# split dataset
	# seed history with training dataset
	history = [x for x in data]
	# step over each time-step in the test set
	for i in range(len(data)):
		# split test row into input and output columns
		testX, testy = data[i, :-1], data[i, -1]
		print(testX.shape, testy.shape)
		# fit model on history and make a prediction
		yhat = xgboost_forecast(history, testX)
		# store forecast in list of predictions
		predictions.append(yhat)
		# add actual observation to history for the next loop
		history.append(data[i])
		# summarize progress
		print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
	# estimate prediction error
	error = mean_absolute_error(test[:, -1], predictions)
	return error, test[:, 1], predictions

In [6]:
# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX):
	# transform list into array
	train = np.asarray(train)
	# split into input and output columns
	trainX, trainy = train[:, :-1], train[:, -1]
	#print(trainX, trainy)    
	# fit model
	model = xgboost.XGBRegressor(objective='reg:squarederror', n_estimators=1000)
	model.fit(trainX, trainy)
	# make a one-step prediction
	print('testX: ',testX)
	yhat = model.predict(np.array([testX]))
	return yhat[0]

In [7]:
df = pd.read_csv('../../data/popular_data.csv')

In [8]:
df['time_til_next_change'] = df.groupby(['infobox_key', 'property_name'])['days_diff'].shift(-1)

In [9]:
df['time_til_next_change'] = df['time_til_next_change'].fillna(0)

In [10]:
df['time_til_next_change'] = df['time_til_next_change'].apply(lambda x: pd.Timedelta(x).total_seconds()/(3600*24))

In [11]:
df['time_til_next_change'] = df['time_til_next_change'].apply(lambda x: round(x))

In [12]:
train = df[(df['page_title']=='David Beckham') & (df['property_name']=='caps(goals)')]['time_til_next_change']

In [13]:
train

1793288      1
1793289     13
1793290      1
1793291    135
1793292      6
          ... 
1793391     14
1793392      0
1793656     23
1793657      7
1793658      0
Name: time_til_next_change, Length: 108, dtype: int64

In [14]:
last_update = df[(df['page_title']=='David Beckham') & (df['property_name']=='caps(goals)')]['timestamp'].iloc[-1]

In [15]:
last_update = pd.to_datetime(last_update)

In [16]:
last_update

Timestamp('2009-04-27 21:42:25+0000', tz='UTC')

In [17]:
len(train)

108

In [18]:
supervised_train = series_to_supervised(train.values, n_in=1)

In [19]:
supervised_train

array([[  1.,  13.],
       [ 13.,   1.],
       [  1., 135.],
       [135.,   6.],
       [  6.,  62.],
       [ 62.,  47.],
       [ 47.,  13.],
       [ 13.,   2.],
       [  2.,  16.],
       [ 16.,  47.],
       [ 47.,  21.],
       [ 21.,  15.],
       [ 15.,   7.],
       [  7.,  23.],
       [ 23.,   3.],
       [  3.,   3.],
       [  3.,   9.],
       [  9.,   3.],
       [  3.,   5.],
       [  5.,   7.],
       [  7.,   4.],
       [  4.,   1.],
       [  1.,   7.],
       [  7.,  14.],
       [ 14.,   7.],
       [  7.,  31.],
       [ 31.,   0.],
       [  0.,  36.],
       [ 36.,   4.],
       [  4.,   6.],
       [  6.,   6.],
       [  6.,   2.],
       [  2.,   1.],
       [  1.,   1.],
       [  1.,   5.],
       [  5.,   2.],
       [  2.,   5.],
       [  5.,   1.],
       [  1.,   1.],
       [  1.,   1.],
       [  1.,   9.],
       [  9.,   1.],
       [  1.,   6.],
       [  6.,   5.],
       [  5.,   4.],
       [  4.,   1.],
       [  1.,   7.],
       [  7.,

In [20]:
mae, y, yhat = walk_forward_validation(supervised_train, 7)

(1,) ()


  "memory consumption")


testX:  [1.]
>expected=13.0, predicted=15.9
(1,) ()
testX:  [13.]
>expected=1.0, predicted=3.0
(1,) ()
testX:  [1.]
>expected=135.0, predicted=15.7
(1,) ()
testX:  [135.]
>expected=6.0, predicted=6.0
(1,) ()
testX:  [6.]
>expected=62.0, predicted=14.6
(1,) ()
testX:  [62.]
>expected=47.0, predicted=47.0
(1,) ()
testX:  [47.]
>expected=13.0, predicted=17.0
(1,) ()
testX:  [13.]
>expected=2.0, predicted=2.5
(1,) ()
testX:  [2.]
>expected=16.0, predicted=6.5
(1,) ()
testX:  [16.]
>expected=47.0, predicted=32.0
(1,) ()
testX:  [47.]
>expected=21.0, predicted=15.7
(1,) ()
testX:  [21.]
>expected=15.0, predicted=15.0
(1,) ()
testX:  [15.]
>expected=7.0, predicted=7.0
(1,) ()
testX:  [7.]
>expected=23.0, predicted=10.1
(1,) ()
testX:  [23.]
>expected=3.0, predicted=5.0
(1,) ()
testX:  [3.]
>expected=3.0, predicted=8.0
(1,) ()
testX:  [3.]
>expected=9.0, predicted=7.5
(1,) ()
testX:  [9.]
>expected=3.0, predicted=5.2
(1,) ()
testX:  [3.]
>expected=5.0, predicted=7.7
(1,) ()
testX:  [5.]
>expec

NameError: name 'test' is not defined