In [213]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

## Token metadata and text embeddings

In [4]:
# Load the per-token trait table and turn each token's traits into a single
# space-separated sentence that the sentence-embedding model can encode.
token_meta = pd.read_csv('token_metadata.csv')

# Columns whose values are concatenated into the `text` sentence.
trait_columns = ['Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear',
       'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear',
       'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
       'rarest_property_name']

# Fill gaps only in the trait columns, so a missing `rarity_score` or
# `token_index` stays a numeric NaN instead of becoming the string 'unknown'.
# The cast to str keeps ' '.join from raising TypeError on non-string cells.
trait_text = token_meta[trait_columns].fillna('unknown').astype(str)
token_meta['text'] = trait_text.apply(' '.join, axis=1)
token_meta = token_meta[['token_index','rarity_score', 'text']]

# NOTE: the name `model` is rebound to an XGBoost regressor in later cells;
# after that point this encoder is no longer reachable under this name.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(token_meta['text'].tolist())

In [187]:
# One column per embedding dimension, plus the merge key and the rarity score
# (both attached from token_meta by row-index alignment).
embedding_dim = embeddings.shape[1]
col_names = ["emb_" + str(dim) for dim in range(embedding_dim)]
df_emb = pd.DataFrame(embeddings, columns=col_names).assign(
    token_index=token_meta['token_index'],
    rarity_score=token_meta['rarity_score'],
)

## Sales

In [297]:
# Raw sales log; later cells read its token_index, timestamp, eth and usd columns.
token_sale = pd.read_csv('token_sales.csv')

In [298]:
# Order sales newest-first so the first row met for a token is its latest sale.
df_sorted = token_sale.sort_values('timestamp', ascending=False)
# One row per token: the key plus the first non-null `eth` in that ordering.
grouped = df_sorted.groupby('token_index', as_index=False)['eth'].first()
# Inner join: only tokens present in both the sales and the embedding tables.
result = grouped.merge(df_emb, how='inner', on='token_index')
# Regression target: price (eth) of each token's latest sale.
y = result['eth']

In [232]:
# Keep only model inputs: neither the merge key nor the target may be a feature.
result = result.drop(columns=['token_index', 'eth'])

# Baseline model (embeddings + rarity score)

In [222]:
# Define the MdAPE metric as a custom scoring function
def mdape(y_true, y_pred):
    """Return the median absolute percentage error (MdAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``median(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Both inputs are converted to plain float arrays first, so they are always
    compared position by position. Without that, two pandas Series carrying
    different indexes would be aligned by label and yield NaN.
    An entry of ``y_true`` equal to zero gives an infinite (or NaN) percentage
    error for that sample, which may propagate into the median.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Silence NumPy's divide/invalid warnings for zero targets; the resulting
    # inf/nan values are still passed through to the median unchanged.
    with np.errstate(divide='ignore', invalid='ignore'):
        abs_pct_error = np.abs((actual - predicted) / actual)
    return np.median(abs_pct_error) * 100

In [223]:
# Create the XGBoost regression model
# No arguments are passed, so the library's default hyperparameters apply.
# This rebinds `model`, which earlier held the SentenceTransformer encoder.
model = xgb.XGBRegressor()

In [224]:
# Define the scoring function using MdAPE
# greater_is_better=False declares MdAPE a loss, so the scorer reports its
# negative; the cross-validation cell negates the scores back to positive.
scoring = make_scorer(mdape, greater_is_better=False)

In [225]:
# Perform cross-validation and calculate MdAPE scores
# 5 folds; the scorer yields negated MdAPE, so the sign is flipped afterwards.
neg_mdape_per_fold = cross_val_score(estimator=model, X=result, y=y, scoring=scoring, cv=5)
mdape_scores = -neg_mdape_per_fold

In [226]:
# Average the MdAPE scores to get the final evaluation result
mean_mdape = mdape_scores.mean()

In [227]:
# Show the mean MdAPE over the CV folds (in percent) for the baseline features.
mean_mdape

62.92366432684823

# New features: statistics of each token's earlier sales

In [307]:
# Rebuild the target/embedding frame; the previous `result` had its key and
# target columns dropped, and both are needed again for the merges that follow.
df_sorted = token_sale.sort_values('timestamp', ascending=False)
# One row per token: the key plus the first non-null `eth` in newest-first order.
grouped = df_sorted.groupby('token_index', as_index=False)['eth'].first()
result = grouped.merge(df_emb, how='inner', on='token_index')

In [308]:
# Every sale except the first row of each token group. df_sorted is ordered
# newest-first, so this removes each token's most recent sale (the one used as
# the target) and keeps only the older ones.
# An explicit cumcount mask replaces `tail(-1)`: a negative `n` is only
# honoured by newer pandas releases, while the mask selects the same rows in
# the same order on any version.
sale_rank = df_sorted.groupby('token_index').cumcount()
last_elements_ = df_sorted[sale_rank > 0]

In [309]:
# Per-token mean / median / standard deviation of eth and usd over the rows of
# last_elements_. One groupby is shared by all three aggregations, and
# add_suffix yields the same column names as an explicit rename.
prior_prices = last_elements_.groupby('token_index')[['eth', 'usd']]
mean_l = prior_prices.mean().add_suffix('_mean').reset_index()
median_l = prior_prices.median().add_suffix('_median').reset_index()
# NOTE: despite its name, var_l holds the standard deviation, not the variance.
var_l = prior_prices.std().add_suffix('_std').reset_index()

In [318]:
# Attach each statistics table to the target/embedding frame in turn. Left
# joins keep tokens that have no row in a statistics table; their new columns
# are NaN.
result_new = result
for stats_table in (mean_l, median_l, var_l):
    result_new = result_new.merge(stats_table, on='token_index', how='left')
# Regression target, taken before the column is dropped from the features.
y = result_new['eth']

In [319]:
# Keep only model inputs: neither the merge key nor the target may be a feature.
result_new = result_new.drop(columns=['token_index', 'eth'])

In [315]:
# New regressor instance (no arguments, so library defaults) for the run on the extended features.
model = xgb.XGBRegressor()

In [320]:
# Perform cross-validation and calculate MdAPE scores
# 5 folds on the extended feature set; the scorer yields negated MdAPE, so the
# sign is flipped afterwards.
neg_mdape_per_fold = cross_val_score(estimator=model, X=result_new, y=y, scoring=scoring, cv=5)
mdape_scores = -neg_mdape_per_fold

In [321]:
# Average the per-fold MdAPE scores of the extended-feature run.
mean_mdape = mdape_scores.mean()

In [322]:
# Show the mean MdAPE over the CV folds (in percent) with the sales statistics added.
mean_mdape

54.53775678282814