<a href="https://colab.research.google.com/github/GriPet12/memcoin-graduation/blob/main/memcoin_graduation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [6]:
!pip install catboost lightgbm xgboost

Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (318.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.5 xgboost-3.0.1


In [7]:
import pandas as pd
import numpy as np
import glob
import os
import gc
from tqdm.auto import tqdm

# Modeling & Evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
from sklearn.metrics import log_loss

# Configuration

In [8]:
# Root folder holding the competition files. The default is the Google Drive
# location this notebook was written against; set the MEMCOIN_DATA_PATH
# environment variable to run the notebook against a different folder.
DATA_PATH = os.environ.get(
    'MEMCOIN_DATA_PATH',
    '/content/drive/MyDrive/memcoin-graduation/pump-fun-graduation-february-2025',
)

# Input files, all resolved relative to DATA_PATH.
CHUNK_PATTERN = os.path.join(DATA_PATH, 'chunk*.csv')  # glob pattern for the chunk CSVs
TRAIN_FILE = os.path.join(DATA_PATH, 'train.csv')
TEST_FILE = os.path.join(DATA_PATH, 'test_unlabeled.csv')
DUNE_INFO_FILE = os.path.join(DATA_PATH, 'dune_token_info.csv')
ONCHAIN_INFO_FILE = os.path.join(DATA_PATH, 'token_info_onchain_divers.csv')

# Output file name (relative to the current working directory).
SUBMISSION_FILE = 'submission.csv'

TARGET = 'has_graduated'  # name of the target column
MINT_ID = 'mint'  # key column used to join the per-token tables
BLOCK_LIMIT = 100 # Only use data from first 100 blocks post-mint
N_SPLITS = 5 # Number of folds for cross-validation
RANDOM_SEED = 42

# Load Data

In [9]:
print("Loading data...")
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
dune_info_df = pd.read_csv(DUNE_INFO_FILE)
# The recorded run shows a warning raised from this call (presumably pandas'
# mixed-dtype warning from chunked type inference). low_memory=False parses the
# file in a single pass so each column gets one consistently inferred dtype.
onchain_info_df = pd.read_csv(ONCHAIN_INFO_FILE, low_memory=False)

# Stack train and test into one frame; is_train lets them be separated again.
train_df['is_train'] = 1
test_df['is_train'] = 0
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# glob returns paths in filesystem-dependent order; sort them so the row order
# of transactions_df is the same on every run.
all_chunk_files = sorted(glob.glob(CHUNK_PATTERN))
print(f"Found {len(all_chunk_files)} chunk files.")

chunk_list = []
failed_chunk_files = []
for chunk_file in tqdm(all_chunk_files, desc="Loading chunks"):
    try:
        chunk_list.append(pd.read_csv(chunk_file))
    except Exception as e:
        # Best effort: keep loading the remaining chunks, but remember the failure.
        failed_chunk_files.append(chunk_file)
        print(f"Error loading {chunk_file}: {e}")
if not chunk_list:
    raise ValueError("No chunk files loaded. Check CHUNK_PATTERN and file existence.")
if failed_chunk_files:
    # Make a partial load impossible to miss among the per-file messages.
    print(
        f"WARNING: {len(failed_chunk_files)} of {len(all_chunk_files)} chunk files "
        "failed to load; transactions_df is incomplete."
    )

transactions_df = pd.concat(chunk_list, ignore_index=True)

Loading data...


  onchain_info_df = pd.read_csv(ONCHAIN_INFO_FILE)


Found 41 chunk files.


Loading chunks:   0%|          | 0/41 [00:00<?, ?it/s]

In [10]:
# Preview the first rows of the concatenated chunk data to check its columns.
transactions_df.head()

Unnamed: 0,block_time,slot,tx_idx,signing_wallet,direction,base_coin,base_coin_amount,quote_coin_amount,virtual_token_balance_after,virtual_sol_balance_after,signature,provided_gas_fee,provided_gas_limit,fee,consumed_gas
0,2025-02-01 16:00:00,317876496,1897,EXoaGQc1taATjsXVPXhnVYddW4KiM1uQRhrYDdfi1x7b,buy,Ab2voNJxp9xM2sdoF6JRJV8dtZ6hGm8yMSt3xAMpump,5208861189189,150740503,1052069532604495,30596837025,3Td5mZpy63TNuyHncgpJFvPudbU3fKcRaeTbqTRsdSLKnN...,3982833,200862,805000,161488
1,2025-02-01 16:00:00,317876496,1794,9Ypu1cMva6dE6k9Zk4aSSmSgJvMmJLTWXuGQhTYqt8mx,buy,BmTDA5HqcemLkEgpyK25sDhbvk652CTXjdWEa8fLpump,1785357737104,50000000,1071214642262896,30050000000,3HHvJsNKWg6epToaZUouqDJdkysiJGKeBkWnhg7sPyWpFY...,9475209,194994,1857609,194994
2,2025-02-01 16:00:00,317876496,1880,3njxeVx5TjDYD27C1YsZW2JQzgmoYeATambcbw7Xn1ft,sell,FZ8wX1RAwV72gniwc9quiZSXHnrECQwoCxAXCWRipump,721068391933,22610289,1013561711601751,31759289915,2rfHemwRWv9t2xuY2umAq2aQrx8Gn73g6tAEDBTE2yrzcf...,4624039,162000,754095,78540
3,2025-02-01 16:00:01,317876499,1350,DjZ1Cpxp6uKvYHU678QkjFj8XKfUwAmCtxUMG5QuBdJT,buy,BmTDA5HqcemLkEgpyK25sDhbvk652CTXjdWEa8fLpump,115855160976852,3644123135,955359481286044,33694123135,Ky5DURUWgB7N3NfNAx5jf7a23pecQAwxbcwuwaWE2MPk7R...,170888000,80000,13676040,62135
4,2025-02-01 16:00:01,317876500,2389,6WgXuHPo9xWu1Mzt8hULYWFwpyh1WwhByPYjsF23h41A,buy,4FJwryCAMMePNeWw9LTBXXfXABdkKcAxkTT6h9pdpump,67062499999999,2000000000,1005937500000001,32000000000,4rSq4SK7a6zb2hNV6x89DfwZ9XFAyy4sp1g9YQfuG8A3Ep...,100000,500000,60000,201798


In [11]:
# Parse the transaction timestamps; values that cannot be parsed become NaT.
transactions_df['block_time'] = transactions_df['block_time'].pipe(
    pd.to_datetime, errors='coerce'
)

# Force both slot columns to numeric dtypes; values that cannot be parsed
# become NaN instead of raising.
transactions_df['slot'] = transactions_df['slot'].pipe(pd.to_numeric, errors='coerce')
combined_df['slot_min'] = combined_df['slot_min'].pipe(pd.to_numeric, errors='coerce')

# Data Merging and Preprocessing

In [12]:
print("Merging data...")

# Make the cell safe to re-run: if a previous run already attached the lookup
# columns, merging again would yield mint_x/mint_y and slot_min_x/slot_min_y and
# the filter below would fail with a KeyError. On a first run nothing is dropped.
stale_columns = [col for col in (MINT_ID, 'slot_min') if col in transactions_df.columns]
if stale_columns:
    transactions_df = transactions_df.drop(columns=stale_columns)

# Attach each token's slot_min to its transactions (base_coin holds the mint).
# NOTE(review): assumes every mint appears once in combined_df; duplicated mints
# would duplicate the matching transaction rows - confirm.
transactions_df = pd.merge(
    transactions_df,
    combined_df[[MINT_ID, 'slot_min']],
    left_on='base_coin',
    right_on=MINT_ID,
    how='left'
)

# Keep only transactions up to BLOCK_LIMIT slots after slot_min. Rows whose
# base_coin has no match in combined_df carry a NaN slot_min, so the comparison
# is False and they are dropped as well.
transactions_df = transactions_df[
    transactions_df['slot'] <= transactions_df['slot_min'] + BLOCK_LIMIT
    ]
transactions_df.columns

Merging data...


Index(['block_time', 'slot', 'tx_idx', 'signing_wallet', 'direction',
       'base_coin', 'base_coin_amount', 'quote_coin_amount',
       'virtual_token_balance_after', 'virtual_sol_balance_after', 'signature',
       'provided_gas_fee', 'provided_gas_limit', 'fee', 'consumed_gas', 'mint',
       'slot_min'],
      dtype='object')

In [13]:
# Dune metadata: rename the key column to MINT_ID, keep only the listed fields,
# keep the first row per mint, and parse created_at (unparseable values -> NaT).
dune_info_df = (
    dune_info_df
    .rename(columns={'token_mint_address': MINT_ID})
    .loc[:, [MINT_ID, 'decimals', 'name', 'symbol', 'token_uri', 'created_at', 'init_tx']]
    .drop_duplicates(subset=[MINT_ID], keep='first')
    .assign(created_at=lambda info: pd.to_datetime(info['created_at'], errors='coerce'))
)

# On-chain metadata: same key handling and de-duplication; bundle_size and
# gas_used are coerced to numbers, with missing bundle_size filled with 0.
onchain_info_df = (
    onchain_info_df
    .rename(columns={'mint': MINT_ID})
    .loc[:, [MINT_ID, 'creator', 'bundle_size', 'gas_used']]
    .drop_duplicates(subset=[MINT_ID], keep='first')
    .assign(
        bundle_size=lambda info: pd.to_numeric(info['bundle_size'], errors='coerce').fillna(0),
        gas_used=lambda info: pd.to_numeric(info['gas_used'], errors='coerce'),
    )
)

dune_info_df.columns, onchain_info_df.columns

(Index(['mint', 'decimals', 'name', 'symbol', 'token_uri', 'created_at',
        'init_tx'],
       dtype='object'),
 Index(['mint', 'creator', 'bundle_size', 'gas_used'], dtype='object'))

In [14]:
# Left-join the per-token metadata onto the combined train/test frame.
for info_df in (dune_info_df, onchain_info_df):
    # Make the cell safe to re-run: columns attached by a previous run would
    # otherwise come back with _x/_y suffixes. On a first run nothing is dropped.
    stale_columns = [
        col for col in info_df.columns
        if col != MINT_ID and col in combined_df.columns
    ]
    if stale_columns:
        combined_df = combined_df.drop(columns=stale_columns)
    # validate='many_to_one' raises if the metadata has more than one row per
    # mint, which would otherwise silently duplicate rows of combined_df.
    combined_df = pd.merge(
        combined_df, info_df, on=MINT_ID, how='left', validate='many_to_one'
    )
combined_df.columns

Index(['Unnamed: 0', 'mint', 'slot_min', 'slot_graduated', 'has_graduated',
       'is_valid', 'is_train', 'decimals', 'name', 'symbol', 'token_uri',
       'created_at', 'init_tx', 'creator', 'bundle_size', 'gas_used'],
      dtype='object')