In [108]:
# Parameters
top_folder = "../data/top_collections"


In [109]:
%load_ext autoreload
%autoreload 2


import os
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from xai.utils.data import make_dir
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, explained_variance_score


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [110]:
cuts = [f'2022-0{i}-01' for i in range(1,4)]
validation_days = 30
training_days = 30*6

periods = [
    {
        'training': {
            'start': datetime.datetime.fromisoformat(cut) - datetime.timedelta(days=training_days),
            'end': datetime.datetime.fromisoformat(cut)
        },
        'validation': {
            'start': datetime.datetime.fromisoformat(cut),
            'end': datetime.datetime.fromisoformat(cut) + datetime.timedelta(days=validation_days)
        },
        'period': cut
    }
    for cut in cuts
]

def select(df, start, end):
    w = (
        (df['timestamp'] < end) &
        (df['timestamp'] >= start)
    )
    return df[w]

In [111]:
df_t = pd.read_parquet(os.path.join(top_folder, 'asset_traits.parquet'), engine='pyarrow')
df_a = pd.read_parquet(os.path.join(top_folder, 'assets.parquet'), engine='pyarrow')

df_s = pd.read_parquet(os.path.join(top_folder, 'sales_with_trend.parquet'), engine='pyarrow')


df_s['outlier'] = ((df_s['price_eth'] < 0.8 * df_s['price_eth_floor_500']) | 
(df_s['price_eth'] > 4 * df_s['price_eth_ceiling_500']))

df_s = df_s[~df_s['outlier']]
df_s = df_s.dropna(subset=["timestamp", 'price_eth'])

In [None]:
df_t['frequency'] = df_t['trait_count'] / 10000
ranks = df_t['trait_type'].unique()
ranks = sorted(ranks)
ranks = {
    i: r for i,r in enumerate(ranks)
}
df_t['rank'] = df_t['trait_type'].map(ranks)
columns = ['trait_type', 'trait_value', 'trait_count', 'rank', 'frequency']
df_tg = df_t.groupby('asset_id')[columns].apply(lambda df: df.to_dict(orient='records'))
df_tg.name = 'traits'

df_as = df_a.set_index('asset_id')[['token_id', 'collection',
       'image_url', 'image_preview_url', 'image_thumbnail_url']]

df_aa = df_as.join(df_tg)

In [112]:
df_t_enc = df_t.pivot_table(
    index='asset_id', columns='trait_id', values='trait_value', aggfunc='count') \
    .fillna(0).astype(int)
feature_names = df_t_enc.columns.tolist()
df_t_enc = df_t_enc.reset_index()
df_p_enr = df_t_enc.merge(df_s)

In [113]:
df_p_enr['log_price_eth_scaled_50'] = np.log(df_p_enr['price_eth_scaled_50'])

In [114]:
target = 'price_eth_scaled_50'

df = df_p_enr[feature_names + [target, 'asset_id', 'timestamp']]

for p in periods:
    df_training = select(df, **p['training']).groupby('asset_id').mean()
    df_validation = select(df, **p['validation']).groupby('asset_id').mean()
    model = LinearRegression()
    model.fit(df_training[feature_names], df_training[target])

    train_pred = model.predict(df_training[feature_names])
    val_pred = model.predict(df_validation[feature_names])

    r2_train = r2_score(df_training[target], train_pred)
    r2_val = r2_score(df_validation[target], val_pred)
    expvar_train = explained_variance_score(df_training[target], train_pred)
    expvar_val = explained_variance_score(df_validation[target], val_pred)

    print(f'Validation starting with {p["period"]}')
    print(f'In-Sample R2 {r2_train}')
    print(f'Out-Sample R2 {r2_val}')
    print(f'In-Sample exp var {expvar_train}')
    print(f'Out-Sample exp var {expvar_val}')


Validation starting with 2022-01-01
In-Sample R2 0.7888990194451074
Out-Sample R2 -683081574248268.9
In-Sample exp var 0.7888999040302298
Out-Sample exp var -682215818471132.2
Validation starting with 2022-02-01
In-Sample R2 0.7582558369593333
Out-Sample R2 0.5683933383248235
In-Sample exp var 0.7582564101258242
Out-Sample exp var 0.6027452184438364
Validation starting with 2022-03-01
In-Sample R2 0.7476465213105351
Out-Sample R2 0.38092143172722814
In-Sample exp var 0.7476488313878689
Out-Sample exp var 0.3828784975152839
