# Model 3 is robustness check, it shows adding more features from offer does not improve inferences or accuracy

In [5]:
df = pd.read_csv('Panel_for_Model2.csv')
df.columns

Index(['token_id', 'time_of_sale', 'sale_time', 'price', 'buyer', 'seller',
       'event_count', 'buyer_tscount', 'buyer_act_period', 'buyer_total_value',
       'buyer_total_gasUsed', 'buyer_avg_gasPrice', 'buyer_avg_gasLimit',
       'buyer_rolling_avg_value_last10', 'buyer_rolling_std_value_last10',
       'seller_tscount', 'seller_act_period', 'seller_total_value',
       'seller_total_gasUsed', 'seller_avg_gasPrice', 'seller_avg_gasLimit',
       'seller_rolling_avg_value_last10', 'seller_rolling_std_value_last10',
       'total_offers', 'unique_makers_count', 'mean_offer_price',
       'std_offer_price', 'median_offer_price', 'highest_offer',
       'lowest_offer', 'duration_offer_days', 'rarity.rank', 'Background',
       'Clothes', 'Earring', 'Eyes', 'Fur', 'Hat', 'Mouth'],
      dtype='object')

In [15]:


import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the dataset
df = pd.read_csv('Panel_for_Model2.csv')
# Handle potential invalid date values with errors='coerce'
# This will convert invalid dates to NaT (Not a Time)
df.drop(columns=['buyer','seller','time_of_sale','event_count','buyer_rolling_avg_value_last10',
                 'seller_rolling_avg_value_last10','buyer_rolling_std_value_last10','seller_rolling_std_value_last10',
                 'median_offer_price','highest_offer','rarity.rank'], inplace=True, axis=1)

# 4. Winsorize and log‐transform the continuous variables
num_cols = [
    'price', 'buyer_tscount', 'buyer_act_period', 'buyer_total_value',
    'buyer_total_gasUsed', 'buyer_avg_gasPrice', 'buyer_avg_gasLimit',
    'seller_tscount', 'seller_act_period', 'seller_total_value',
    'seller_total_gasUsed', 'seller_avg_gasPrice', 'seller_avg_gasLimit',
    'total_offers', 'unique_makers_count', 'mean_offer_price',
    'std_offer_price','lowest_offer', 'duration_offer_days'
]

# winsorzize the 'price' column to 1% and 99%
lower_bound = df['price'].quantile(0.01)
upper_bound = df['price'].quantile(0.99)
df['price'] = np.clip(df['price'], lower_bound, upper_bound)

# log1p transform the 'price' column and rename it to 'log_price'
df['log_price'] = np.log1p(df['price'])
# drop the original 'price' column
df.drop(columns=['price'], inplace=True)
# convert token_id to string
df['token_id'] = df['token_id'].astype(str)


for col in num_cols:
    # Check if column exists before processing
    if col in df.columns:
        # winsorize at 1%–99%
        df[col] = winsorize(df[col], limits=[0.01, 0.01])
        # log1p
        df[f'log_{col}'] = np.log1p(df[col])
        df = df.drop(columns=[col])
        

# drop the numeric columns that do not have a 'log_' prefix
for col in df.select_dtypes(include=[np.number]).columns:
    if not col.startswith('log_'):
        df.drop(columns=col, inplace=True)

df_panel = df.set_index(['token_id', 'sale_time'])

formula = (
    'log_price ~ '
    # Buyer characteristics
    'log_buyer_tscount + log_buyer_act_period + log_buyer_total_value + '
    'log_buyer_total_gasUsed + log_buyer_avg_gasPrice + log_buyer_avg_gasLimit + '
    
    # Seller characteristics
    'log_seller_tscount + log_seller_act_period + log_seller_total_value + '
    'log_seller_total_gasUsed + log_seller_avg_gasPrice + log_seller_avg_gasLimit + '
    
    # Market offer characteristics
    'log_total_offers + log_unique_makers_count + log_mean_offer_price + '
    'log_std_offer_price  + '
    'log_lowest_offer + log_duration_offer_days + '
    
    # NFT categorical traits
    'C(Background) + C(Clothes) + C(Earring) + C(Eyes) + C(Fur) + C(Hat) + C(Mouth) + C(sale_time)'
)


from linearmodels.panel import PanelOLS
import patsy

mdl = smf.ols(formula, data=df).fit()
print(mdl.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                     1697.
Date:                Mon, 28 Apr 2025   Prob (F-statistic):               0.00
Time:                        05:52:27   Log-Likelihood:                -5526.1
No. Observations:               18508   AIC:                         1.151e+04
Df Residuals:                   18278   BIC:                         1.331e+04
Df Model:                         229                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

## Coeffients 

In [22]:
import pandas as pd

# assemble a DataFrame of coefficients and p-values
results = pd.DataFrame({
    'coef': mdl.params,
    'pvalue': mdl.pvalues
})

# drop the intercept
results = results.drop('const', errors='ignore')

# keep only those with p-value < 0.1
sig = results[results['pvalue'] < 0.1]

# exclude any sale time dummies
sig = sig[~sig.index.str.startswith('C(sale_time)')]

# 1) Top 10 largest coefficients (most positive)
top10 = sig.sort_values('coef', ascending=False).head(10)
print("Top 10 largest (coef, p-value):")
print(top10)

# 2) Top 10 smallest coefficients (most negative)
bottom10 = sig.sort_values('coef', ascending=True).head(10)
print("\nTop 10 smallest (coef, p-value):")
print(bottom10)


Top 10 largest (coef, p-value):
                                       coef         pvalue
C(Fur)[T.Solid Gold]               1.171454  6.389025e-191
C(Fur)[T.Trippy]                   0.874782  9.588642e-183
C(Clothes)[T.Black Suit]           0.816887   1.927941e-86
C(Eyes)[T.Blue Beams]              0.765705   1.909493e-85
C(Mouth)[T.Bored Unshaven Pizza]   0.654076   2.201743e-35
C(Mouth)[T.Bored Unshaven Dagger]  0.546143   2.555624e-26
C(Eyes)[T.Laser Eyes]              0.523184   7.031683e-58
C(Hat)[T.King's Crown]             0.469467   1.218565e-58
C(Hat)[T.Trippy Captain's Hat]     0.357175   5.231168e-30
C(Mouth)[T.Grin Diamond Grill]     0.353548   7.687988e-34

Top 10 smallest (coef, p-value):
                                    coef        pvalue
C(Clothes)[T.Service]          -0.106740  6.356016e-07
C(Hat)[T.Fez]                  -0.086860  1.553076e-10
C(Hat)[T.Ww2 Pilot Helm]       -0.085656  3.340829e-04
C(Clothes)[T.Bone Tee]         -0.084961  4.686366e-07
C(Clothes)

In [16]:
import pandas as pd

# assemble a DataFrame of coefficients and p-values
results = pd.DataFrame({
    'coef': mdl.params,
    'pvalue': mdl.pvalues
})

# drop the intercept
results = results.drop('const', errors='ignore')

# keep only those with p-value < 0.1
sig = results[results['pvalue'] < 0.1]

# 1) Top 10 largest coefficients (most positive)
top10 = sig.sort_values('coef', ascending=False).head(10)
print("Top 10 largest (coef, p-value):")
print(top10)

# 2) Top 10 smallest coefficients (most negative)
bottom10 = sig.sort_values('coef', ascending=True).head(10)
print("\nTop 10 smallest (coef, p-value):")
print(bottom10)

Top 10 largest (coef, p-value):
                             coef        pvalue
C(sale_time)[T.2022-04]  4.044620  3.658389e-82
C(sale_time)[T.2022-05]  3.879032  3.885068e-76
C(sale_time)[T.2022-02]  3.826707  7.975290e-73
C(sale_time)[T.2022-03]  3.792496  1.657966e-72
C(sale_time)[T.2022-07]  3.763332  7.470782e-73
C(sale_time)[T.2022-01]  3.685217  7.983281e-68
C(sale_time)[T.2022-06]  3.659550  1.416522e-68
C(sale_time)[T.2022-09]  3.620473  8.283856e-68
C(sale_time)[T.2022-08]  3.538571  8.051974e-65
C(sale_time)[T.2022-12]  3.469875  1.399640e-62

Top 10 smallest (coef, p-value):
                                    coef        pvalue
C(Clothes)[T.Service]          -0.106740  6.356016e-07
C(Hat)[T.Fez]                  -0.086860  1.553076e-10
C(Hat)[T.Ww2 Pilot Helm]       -0.085656  3.340829e-04
C(Clothes)[T.Bone Tee]         -0.084961  4.686366e-07
C(Clothes)[T.Sleeveless T]     -0.080597  6.357412e-07
C(Clothes)[T.Navy Striped Tee] -0.080581  3.043083e-08
C(Clothes)[T.Prom Dre

# THis is the Random Effect but does not make sense

In [21]:

# 1. Load the data
df = pd.read_csv('Panel_for_Model2.csv')

# 2. Drop unneeded identifier columns
df = df.drop(columns=[
    'buyer', 'seller', 'time_of_sale', 'event_count'
])

# 3. Convert sale_time to a month indicator
df['sale_time'] = pd.to_datetime(df['sale_time'], errors='coerce')
df['sale_month'] = df['sale_time'].dt.strftime('%Y-%m')
df = df.drop(columns=['sale_time'])
# Convert token_id early
df['token_id'] = df['token_id'].astype(str)


# 4. Winsorize and log‐transform the continuous variables
num_cols = [
    'price', 'buyer_tscount', 'buyer_act_period', 'buyer_total_value',
    'buyer_total_gasUsed', 'buyer_avg_gasPrice', 'buyer_avg_gasLimit',
    'buyer_rolling_avg_value_last10', 'buyer_rolling_std_value_last10',
    'seller_tscount', 'seller_act_period', 'seller_total_value',
    'seller_total_gasUsed', 'seller_avg_gasPrice', 'seller_avg_gasLimit',
    'seller_rolling_avg_value_last10', 'seller_rolling_std_value_last10',
    'total_offers', 'unique_makers_count', 'mean_offer_price',
    'std_offer_price', 'median_offer_price', 'highest_offer',
    'lowest_offer', 'duration_offer_days', 'rarity.rank'
]

for col in num_cols:
    # Check if column exists before processing
    if col in df.columns:
        # winsorize at 1%–99%
        df[col] = winsorize(df[col], limits=[0.01, 0.01])
        # log1p
        df[f'log_{col}'] = np.log1p(df[col])
        df = df.drop(columns=[col])

# 5. One‐hot encode categorical features (drop first level)
# Include sale_month here
cat_cols = ['sale_month', 'Background', 'Clothes', 'Earring', 'Eyes', 'Fur', 'Hat', 'Mouth']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 6. Set panel index (entity = token_id, time = sale_month_dummy)
# Since sale_month is now dummified, you cannot use it directly as the time index.
# PanelOLS requires a proper time index. You might need to rethink using RandomEffects
# if you intended to use the original 'sale_month' as the time dimension.
# If you want month fixed effects, the dummies are correct, but PanelOLS needs a time index.
# Perhaps you intended to use the original 'sale_time' converted to a period/timestamp?
# For now, let's assume you want to proceed without a time index for RandomEffects (which is unusual).
# If you need a time index for PanelOLS/RandomEffects, you must keep 'sale_month' or a time-based column.

# Let's revert to keeping sale_month for the index and dummifying only traits:
# --- Corrected Revised Order ---

# 1. Load the data
df = pd.read_csv('Panel_for_Model2.csv')

# 2. Drop unneeded identifier columns
df = df.drop(columns=[
    'buyer', 'seller', 'time_of_sale', 'event_count'
])

# 3. Convert sale_time to a month indicator
df['sale_time'] = pd.to_datetime(df['sale_time'], errors='coerce')
# Convert sale_time to month start timestamp for panel time index
df['sale_month'] = df['sale_time'].dt.to_period('M').dt.to_timestamp()
df = df.drop(columns=['sale_time'])  # Drop original datetime
# Convert token_id early
df['token_id'] = df['token_id'].astype(str)


# 4. Winsorize and log‐transform the continuous variables
num_cols = [
    'price', 'buyer_tscount', 'buyer_act_period', 'buyer_total_value',
    'buyer_total_gasUsed', 'buyer_avg_gasPrice', 'buyer_avg_gasLimit',
    'buyer_rolling_avg_value_last10', 'buyer_rolling_std_value_last10',
    'seller_tscount', 'seller_act_period', 'seller_total_value',
    'seller_total_gasUsed', 'seller_avg_gasPrice', 'seller_avg_gasLimit',
    'seller_rolling_avg_value_last10', 'seller_rolling_std_value_last10',
    'total_offers', 'unique_makers_count', 'mean_offer_price',
    'std_offer_price', 'median_offer_price', 'highest_offer',
    'lowest_offer', 'duration_offer_days', 'rarity.rank'
]

for col in num_cols:
     if col in df.columns:
        # Handle potential NaNs introduced by coercion before winsorize/log
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df = df.dropna(subset=[col]) # Or impute NaNs
        # winsorize at 1%–99%
        df[col] = winsorize(df[col], limits=[0.01, 0.01])
        # log1p
        df[f'log_{col}'] = np.log1p(df[col])
        df = df.drop(columns=[col])

# Drop rows where log_price might be NaN after transformations
df = df.dropna(subset=['log_price'])


# 5. One‐hot encode only TRAIT categorical features (drop first level)
trait_cols = ['Background', 'Clothes', 'Earring', 'Eyes', 'Fur', 'Hat', 'Mouth']
df = pd.get_dummies(df, columns=trait_cols, drop_first=True, dummy_na=False) # Avoid NA dummies unless intended

# 6. Set panel index (entity = token_id, time = sale_month_str)
# 6. Set panel index (entity = token_id, time = sale_month)
# Ensure sale_month exists and handle potential NaNs from date conversion
df = df.dropna(subset=['sale_month'])
df = df.set_index(['token_id', 'sale_month'])

# Ensure unique index by aggregating duplicates (take mean for numeric, first for bool)
if not df.index.is_unique:
    df = df.groupby(level=['token_id', 'sale_month']).agg(
        lambda x: x.mean() if x.dtype.kind in 'fi' else x.iloc[0]
    )

# 7. Prepare exogenous regressors & dependent variable
y = df['log_price']
exog_cols = [col for col in df.columns if col != 'log_price'] # Select all other columns
exog = df[exog_cols]
# Add constant AFTER selecting columns
exog = sm.add_constant(exog, has_constant='add') # Add constant safely

# Drop rows with any NaNs in exog before fitting
exog = exog.dropna()
y = y.loc[exog.index] # Align y with cleaned exog


# 8. Fit Random Effects model
# Ensure no NaN/inf in y or exog
if y.isnull().any() or np.isinf(y).any():
    print("NaN/inf found in y")
if exog.isnull().any().any() or np.isinf(exog).any().any():
    print("NaN/inf found in exog")
    # print(exog.isnull().sum()) # Find columns with NaNs

# Ensure y and exog are aligned and finite
# Take intersection of indices and drop any rows with NaN or inf in either y or exog
common_index = y.index.intersection(exog.index)
y_aligned = y.loc[common_index]
exog_aligned = exog.loc[common_index]

# Build mask using the aligned index only
mask_y = y_aligned.notnull() & np.isfinite(y_aligned)
mask_exog = exog_aligned.notnull().all(axis=1) & np.isfinite(exog_aligned).all(axis=1)
mask = mask_y & mask_exog

# Ensure mask is aligned to the index of y_aligned and exog_aligned
mask = mask.reindex(y_aligned.index, fill_value=False)

# Apply mask only to the aligned objects
y = y_aligned[mask]
exog = exog_aligned[mask]


re_mod = RandomEffects(y, exog)
re_res = re_mod.fit(cov_type='robust')

# 9. Print summary
print(re_res.summary)


                        RandomEffects Estimation Summary                        
Dep. Variable:              log_price   R-squared:                        0.5112
Estimator:              RandomEffects   R-squared (Between):              0.5667
No. Observations:               13797   R-squared (Within):               0.3977
Date:                Mon, Apr 28 2025   R-squared (Overall):              0.5187
Time:                        02:22:03   Log-likelihood                -1.892e+04
Cov. Estimator:                Robust                                           
                                        F-statistic:                      75.281
Entities:                        9254   P-value                           0.0000
Avg Obs:                       1.4909   Distribution:               F(189,13607)
Min Obs:                       1.0000                                           
Max Obs:                       2.0000   F-statistic (robust):             107.32
                            

# RdigeCV

In [49]:
# correlation matrix between the features without sale_time and token_id
df_features = df.drop(columns=['sale_time', 'token_id'])
corr_matrix = df_features.corr()
print(corr_matrix)


                         log_buyer_tscount  log_buyer_act_period  \
log_buyer_tscount                 1.000000              0.999957   
log_buyer_act_period              0.999957              1.000000   
log_buyer_total_value             0.992841              0.992634   
log_buyer_total_gasUsed           0.999964              0.999994   
log_buyer_avg_gasPrice            0.999964              0.999994   
...                                    ...                   ...   
Mouth_Phoneme Wah                -0.004962             -0.004967   
Mouth_Rage                        0.011904              0.011898   
Mouth_Small Grin                 -0.000195             -0.000201   
Mouth_Tongue Out                 -0.008932             -0.008937   
log_price                         0.771680              0.771712   

                         log_buyer_total_value  log_buyer_total_gasUsed  \
log_buyer_tscount                     0.992841                 0.999964   
log_buyer_act_period             

In [50]:
# print the correlations that are greater than 0.5 or less than -0.5
high_corr = corr_matrix[(corr_matrix > 0.5) | (corr_matrix < -0.5)]
print(high_corr)

                         log_buyer_tscount  log_buyer_act_period  \
log_buyer_tscount                 1.000000              0.999957   
log_buyer_act_period              0.999957              1.000000   
log_buyer_total_value             0.992841              0.992634   
log_buyer_total_gasUsed           0.999964              0.999994   
log_buyer_avg_gasPrice            0.999964              0.999994   
...                                    ...                   ...   
Mouth_Phoneme Wah                      NaN                   NaN   
Mouth_Rage                             NaN                   NaN   
Mouth_Small Grin                       NaN                   NaN   
Mouth_Tongue Out                       NaN                   NaN   
log_price                         0.771680              0.771712   

                         log_buyer_total_value  log_buyer_total_gasUsed  \
log_buyer_tscount                     0.992841                 0.999964   
log_buyer_act_period             

In [72]:
df

Unnamed: 0,token_id,sale_time,log_buyer_tscount,log_buyer_act_period,log_buyer_total_value,log_buyer_total_gasUsed,log_buyer_avg_gasPrice,log_buyer_avg_gasLimit,log_seller_tscount,log_seller_act_period,...,log_lowest_offer,log_duration_offer_days,Background,Clothes,Earring,Eyes,Fur,Hat,Mouth,log_price
0,1457,2023-12-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Yellow,Vietnam Jacket,0,Bored,White,S&m Hat,Phoneme Wah,1.000000
1,1810,2024-12-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.905000e-11,1.0,Aquamarine,Black Holes T,0,Eyepatch,Red,Fisherman's Hat,Jovial,1.000000
2,3978,2023-09-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Army Green,Sleeveless Logo T,0,Bored,Dark Brown,Bayc Hat Black,Bored,1.000000
3,4504,2024-02-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,9.950331e-03,0.0,Aquamarine,Lab Coat,Silver Hoop,Bloodshot,Brown,Irish Boho,Bored,1.000000
4,4423,2021-05-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Orange,0,Silver Hoop,Sleepy,Cream,Baby's Bonnet,Bored Cigarette,0.405465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,4562,2024-11-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.987542e-03,1.0,Orange,Hawaiian,0,Blindfold,Robot,Cowboy Hat,Bored Pipe,1.000000
18504,662,2021-08-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Army Green,Sailor Shirt,Silver Hoop,Sunglasses,Tan,0,Bored Unshaven,1.000000
18505,6616,2023-02-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Yellow,Black Holes T,0,3d,Brown,Bayc Flipped Brim,Small Grin,1.000000
18506,5566,2021-05-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.000000e+00,0.0,Purple,0,Gold Stud,Laser Eyes,Robot,Faux Hawk,Bored,1.000000
