In [104]:
import pandas as pd
import os
import pickle
import numpy as np
import sparkbeyond as sb
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.model_selection import GroupShuffleSplit

# NOTE(review): presumably a switch for a backtesting mode; nothing in the
# visible cells reads it - confirm it is still needed.
backtest_on = False

In [105]:
# Load player database
# The data directory can be overridden with the FI_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location
# remains the default.
data_dir = os.environ.get(
    'FI_DATA_DIR',
    r'/Users/joshualowe/Documents/Misc/Football_Index/00_Data/00_global_player_database/',
)
# Later cells open and write files by bare name, so the working directory is
# switched to the data directory here.
os.chdir(data_dir)

def load_obj(file_name):
    """Unpickle and return the object stored in ``<file_name>.pkl``.

    Args:
        file_name: Path of the pickle file without its ``.pkl`` suffix; the
            suffix is appended before the file is opened.

    Returns:
        Whatever object the pickle file contains.

    Note:
        ``pickle.load`` can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    pickle_path = file_name + '.pkl'
    with open(pickle_path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# The loaded object is iterated with .items() further down, giving one price
# frame per name.
player_db_name = 'FI_global_player_price_data_full_prem_v2'
df = load_obj(player_db_name)

In [106]:
# create the target - percentage price move over the look-ahead horizon,
# net of commission
# NOTE(review): the columns are labelled "2wk" but the shift is 7 rows; if each
# row is one day this is a one-week look-ahead - confirm which is intended.
LOOKAHEAD_ROWS = 7
# presumably the commission in percent (the target column is named
# "..._perc_minus_com") - confirm
COMMISSION_PCT = 2

names_to_remove = []
player_frames = []
for name, data in df.items():

    # Items with more than one missing avgPrice are treated as having no data:
    # they are skipped here (previously they were flagged but still appended to
    # the modelling data) and dropped from the dict below.
    if data.avgPrice.isna().sum() > 1:
        names_to_remove.append(name)
        continue

    # future price, absolute change and net percentage change
    data['Target_2wk'] = data['avgPrice'].shift(-LOOKAHEAD_ROWS)
    data['Target_2wk_delta'] = data['Target_2wk'] - data['avgPrice']
    data['target_2wk_perc_minus_com'] = ((100/data['avgPrice']) * data['Target_2wk_delta']) - COMMISSION_PCT

    data['id'] = name
    # the last LOOKAHEAD_ROWS rows have no future price, so their target is NaN
    # and dropna removes them (along with any other incomplete row)
    player_frames.append(data.dropna(axis=0))

# remove items with no data
for n in names_to_remove:
    df.pop(n)

# one long frame, all kept items stacked, with a fresh 0..n-1 index
df_complete = pd.concat(player_frames).reset_index(drop=True)


In [107]:
# Snapshot the modelling dataset as CSV. The bare file name resolves against
# the current working directory, which an earlier cell moved to data_dir.
df_complete.to_csv(path_or_buf='naive_ds.csv')

In [108]:
#Credentials
# Read the server URL and API token from the environment so secrets are never
# stored in the notebook. The fallbacks are the empty strings that were
# hardcoded here before.
server_url_ = os.environ.get('SPARKBEYOND_SERVER_URL', '')
api_token = os.environ.get('SPARKBEYOND_API_KEY', '')
client = sb.SparkBeyondClient(server_url=server_url_, api_key=api_token)

Getting build info
Server version: 1.28.0


In [109]:
# Columns handed to the modelling run: raw price/volume fields, the target and
# the grouping key.
model_columns = ['DateTime', 'Volume', 'maxPrice', 'minPrice', 'avgPrice',
                 'target_2wk_perc_minus_com', 'id']
data_run = df_complete[model_columns]

# turn into a classification
# data_run['target_2wk_perc_minus_com'] = np.where(data_run['target_2wk_perc_minus_com'] > 0, 1, 0)

# lower-case every column name and label the column index 0
data_run.columns = [column.lower() for column in model_columns]
data_run.columns.name = 0

# train test split on id groups: all rows of one id land on the same side.
# Only the first of the two configured splits is consumed.
group_splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state=7)
split_iterator = group_splitter.split(data_run, groups=data_run['id'])
train_inds, test_inds = next(split_iterator)
train = data_run.iloc[train_inds]
test = data_run.iloc[test_inds]

In [1]:
# settings
# time window: 7 'Days' on the 'datetime' column, keyed by 'id'
TS = sb.TimeWindowDefinition(
    date_col='datetime',
    window=7,
    unit='Days',
    key_col='id',
)

#add a context dataset with a time window hook
# Built from the full data_run frame (train and test rows), without the target.
context_columns = ['datetime', 'volume', 'maxprice', 'minprice', 'avgprice', 'id']
timeseries_context = sb.Contexts.TimeSeriesMap(
    data=data_run[context_columns],
    time_column='datetime',
    name='7_day_window',
    key_column='id',  #for keyed ts need to use TimeSeriesMap
)

contexts_list = [timeseries_context]

feature_generation_object = sb.FeatureGeneration(
    missing_values_allowed=0,
    auto_column_sub_sets='NUMERIC_PAIRS',
    max_depth=3.5,
    time_series_interaction=True,
)
problem_definition_object = sb.ProblemDefinition(time_windows_definition=[TS])
# only the XGBoost algorithm is white-listed for model building
model_building_object = sb.ModelBuilding(
    algorithms_white_list=[sb.Algorithms.scikit.xgboost()],
)

#Learn node
# run_blocking=False: the call is asked not to block on the learning job.
learn_settings = dict(
    project_name='FI_Josh_Lowe',
    train_data=train,
    target='target_2wk_perc_minus_com',
    test_data=test,
    context_datasets=contexts_list,
    feature_generation=feature_generation_object,
    problem_definition=problem_definition_object,
    model_building=model_building_object,
    run_blocking=False,
    revision_description='Naive',
)
model = client.learn(**learn_settings)
    


NameError: name 'sb' is not defined

In [111]:
# Score the held-out rows; the target column is left out of what is sent.
feature_columns = ['datetime', 'volume', 'maxprice', 'minprice', 'avgprice', 'id']
prediction = model.predict(test[feature_columns])


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  hash_md5.update(batch.to_msgpack())


Saving Dataset-addb5ff1b79d12316a274deb0c5e4458.tsv.gz to /var/folders/q3/rt0yr3fd3zn5hk_spfsmql1r0000gn/T/tmp737bdy72/Dataset-addb5ff1b79d12316a274deb0c5e4458.tsv.gz before upload
File /var/folders/q3/rt0yr3fd3zn5hk_spfsmql1r0000gn/T/tmp737bdy72/Dataset-addb5ff1b79d12316a274deb0c5e4458.tsv.gz size 61704 bytes. Executed in 0 seconds
Uploading Dataset-addb5ff1b79d12316a274deb0c5e4458.tsv.gz
100%|██████████| 61.7k/61.7k [00:00<00:00, 62.1MB/s]
Blocking until prediction result is available.
Prediction job is running. So far processed 7200 out of 12030 rows

Prediction has finished successfully. Processed 12030 out of 12030  rows. Started: 2019-08-20 15:45:50. Ended: 2019-08-20 15:45:57
22.0B [00:00, 203B/s]


In [118]:
# Put the held-out rows and the predictions side by side. Both indexes are
# reset so the concat lines rows up purely by position.
# NOTE(review): assumes prediction.data comes back in the same row order as
# `test` - confirm.
actual_rows = test.reset_index(drop=True)
predicted_rows = prediction.data.reset_index(drop=True)
prediction_result = pd.concat([actual_rows, predicted_rows], axis=1)
prediction_result.head(4)

Unnamed: 0,datetime,volume,maxprice,minprice,avgprice,target_2wk_perc_minus_com,id,target_2wk_perc_minus_com_predicted
0,2019-07-15,0.0,0.25,0.25,0.25,-2.0,neto,-1.130188
1,2019-07-16,0.0,0.25,0.25,0.25,-2.0,neto,0.469192
2,2019-07-17,0.0,0.25,0.25,0.25,-2.0,neto,0.495186
3,2019-07-18,0.0,0.25,0.25,0.25,-2.0,neto,0.495186


In [159]:
# Simulate putting a notional 100 on every row at that row's average price and
# compare the predicted outcome with the realised one.
LONG_THRESHOLD_PCT = 3  # take a long position only above this predicted net % move

predicted_pct = prediction_result['target_2wk_perc_minus_com_predicted']
actual_pct = prediction_result['target_2wk_perc_minus_com']

# whole number of lots that 100 buys at avgprice (fraction truncated)
prediction_result['position_lots'] = (100/prediction_result['avgprice']).astype(int)
# money actually committed once lots are rounded down
stake = prediction_result['position_lots'] * prediction_result['avgprice']

# NOTE(review): abs() turns a net move below -100% into a positive value -
# confirm this is intended.
prediction_result['predicted_value'] = stake * abs((1 + (predicted_pct/100)))
prediction_result['actual_value'] = stake * abs((1 + (actual_pct/100)))
prediction_result['revenue'] = prediction_result['actual_value'] - stake
# 'spend' is summed below but was never created in this cell, so a fresh kernel
# raised KeyError; it is the committed stake per row.
prediction_result['spend'] = stake

# rows where the model predicts a net move above the threshold
long_pos = prediction_result[predicted_pct > LONG_THRESHOLD_PCT]
display(long_pos['revenue'].sum())
display(long_pos['spend'].sum())
                                                                            

194.52579999999858

34979.21

In [160]:
# Preview only the first rows instead of dumping the whole selection.
long_pos.head(10)

Unnamed: 0,datetime,volume,maxprice,minprice,avgprice,target_2wk_perc_minus_com,id,target_2wk_perc_minus_com_predicted,position_lots,predicted_value,actual_value,revenue,spend
282,2019-07-27,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,9.228635,526,109.163098,97.9412,-1.9988,99.94
283,2019-07-28,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,9.248026,526,109.182477,97.9412,-1.9988,99.94
284,2019-07-29,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,9.248026,526,109.182477,97.9412,-1.9988,99.94
285,2019-07-30,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,9.248026,526,109.182477,97.9412,-1.9988,99.94
286,2019-07-31,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,9.248026,526,109.182477,97.9412,-1.9988,99.94
287,2019-08-01,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,8.989566,526,108.924172,97.9412,-1.9988,99.94
288,2019-08-02,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,8.989566,526,108.924172,97.9412,-1.9988,99.94
289,2019-08-03,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,8.989566,526,108.924172,97.9412,-1.9988,99.94
290,2019-08-04,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,8.989566,526,108.924172,97.9412,-1.9988,99.94
291,2019-08-05,0.0,0.19,0.19,0.19,-2.000000,marcus-bettinelli,8.989566,526,108.924172,97.9412,-1.9988,99.94
