# Import

In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [2]:
import pandas as pd
import numpy as np
import glob
import os
import gc
from tqdm.auto import tqdm

# Modeling & Evaluation
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import log_loss

# Configuration

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
DATA_PATH = '/content/drive/MyDrive/memcoin-graduation/pump-fun-graduation-february-2025'
CHUNK_PATTERN = os.path.join(DATA_PATH, 'chunk*.csv')
TRAIN_FILE = os.path.join(DATA_PATH, 'train.csv')
TEST_FILE = os.path.join(DATA_PATH, 'test_unlabeled.csv')
DUNE_INFO_FILE = os.path.join(DATA_PATH, 'dune_token_info.csv')
ONCHAIN_INFO_FILE = os.path.join(DATA_PATH, 'token_info_onchain_divers.csv')
SUBMISSION_FILE = 'submission.csv'

TARGET = 'has_graduated'
MINT_ID = 'mint'
BLOCK_LIMIT = 50
N_SPLITS = 3
RANDOM_SEED = 42

# Load Data

In [5]:
print("Loading data...")
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
dune_info_df = pd.read_csv(DUNE_INFO_FILE)
onchain_info_df = pd.read_csv(ONCHAIN_INFO_FILE)

train_df['is_train'] = 1
test_df['is_train'] = 0
combined_df = pd.concat([train_df, test_df], ignore_index=True)

all_chunk_files = glob.glob(CHUNK_PATTERN)
print(f"Found {len(all_chunk_files)} chunk files.")

chunk_list = []
for f in tqdm(all_chunk_files[:5], desc="Loading first 5 chunks"):
    try:
        chunk_list.append(pd.read_csv(f))
    except Exception as e:
        print(f"Error loading {f}: {e}")

transactions_df = pd.concat(chunk_list, ignore_index=True)

Loading data...


  onchain_info_df = pd.read_csv(ONCHAIN_INFO_FILE)


Found 41 chunk files.


Loading first 5 chunks:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
transactions_df.head()

Unnamed: 0,block_time,slot,tx_idx,signing_wallet,direction,base_coin,base_coin_amount,quote_coin_amount,virtual_token_balance_after,virtual_sol_balance_after,signature,provided_gas_fee,provided_gas_limit,fee,consumed_gas
0,2025-02-01 16:00:00,317876496,1897,EXoaGQc1taATjsXVPXhnVYddW4KiM1uQRhrYDdfi1x7b,buy,Ab2voNJxp9xM2sdoF6JRJV8dtZ6hGm8yMSt3xAMpump,5208861189189,150740503,1052069532604495,30596837025,3Td5mZpy63TNuyHncgpJFvPudbU3fKcRaeTbqTRsdSLKnN...,3982833,200862,805000,161488
1,2025-02-01 16:00:00,317876496,1794,9Ypu1cMva6dE6k9Zk4aSSmSgJvMmJLTWXuGQhTYqt8mx,buy,BmTDA5HqcemLkEgpyK25sDhbvk652CTXjdWEa8fLpump,1785357737104,50000000,1071214642262896,30050000000,3HHvJsNKWg6epToaZUouqDJdkysiJGKeBkWnhg7sPyWpFY...,9475209,194994,1857609,194994
2,2025-02-01 16:00:00,317876496,1880,3njxeVx5TjDYD27C1YsZW2JQzgmoYeATambcbw7Xn1ft,sell,FZ8wX1RAwV72gniwc9quiZSXHnrECQwoCxAXCWRipump,721068391933,22610289,1013561711601751,31759289915,2rfHemwRWv9t2xuY2umAq2aQrx8Gn73g6tAEDBTE2yrzcf...,4624039,162000,754095,78540
3,2025-02-01 16:00:01,317876499,1350,DjZ1Cpxp6uKvYHU678QkjFj8XKfUwAmCtxUMG5QuBdJT,buy,BmTDA5HqcemLkEgpyK25sDhbvk652CTXjdWEa8fLpump,115855160976852,3644123135,955359481286044,33694123135,Ky5DURUWgB7N3NfNAx5jf7a23pecQAwxbcwuwaWE2MPk7R...,170888000,80000,13676040,62135
4,2025-02-01 16:00:01,317876500,2389,6WgXuHPo9xWu1Mzt8hULYWFwpyh1WwhByPYjsF23h41A,buy,4FJwryCAMMePNeWw9LTBXXfXABdkKcAxkTT6h9pdpump,67062499999999,2000000000,1005937500000001,32000000000,4rSq4SK7a6zb2hNV6x89DfwZ9XFAyy4sp1g9YQfuG8A3Ep...,100000,500000,60000,201798


In [7]:
transactions_df['block_time'] = pd.to_datetime(transactions_df['block_time'], errors='coerce')
transactions_df['slot'] = pd.to_numeric(transactions_df['slot'], errors='coerce')
combined_df['slot_min'] = pd.to_numeric(combined_df['slot_min'], errors='coerce')

# Data Merging and Preprocessing

In [8]:
print("Merging data...")
transactions_df = pd.merge(
    transactions_df,
    combined_df[[MINT_ID, 'slot_min']],
    left_on='base_coin',
    right_on=MINT_ID,
    how='left'
)

transactions_df = transactions_df[
    transactions_df['slot'] <= transactions_df['slot_min'] + BLOCK_LIMIT
]

Merging data...


In [9]:
dune_info_df = dune_info_df.rename(columns={'token_mint_address': MINT_ID})
dune_info_df = dune_info_df[[MINT_ID, 'decimals']].drop_duplicates(subset=[MINT_ID], keep='first')

onchain_info_df = onchain_info_df.rename(columns={'mint': MINT_ID})
onchain_info_df = onchain_info_df[[MINT_ID, 'bundle_size']].drop_duplicates(subset=[MINT_ID], keep='first')
onchain_info_df['bundle_size'] = pd.to_numeric(onchain_info_df['bundle_size'], errors='coerce').fillna(0)

In [10]:
combined_df = pd.merge(combined_df, dune_info_df, on=MINT_ID, how='left')
combined_df = pd.merge(combined_df, onchain_info_df, on=MINT_ID, how='left')
combined_df.columns

Index(['Unnamed: 0', 'mint', 'slot_min', 'slot_graduated', 'has_graduated',
       'is_valid', 'is_train', 'decimals', 'bundle_size'],
      dtype='object')

# Exploratory Data Analysis

In [11]:
print("Basic EDA:")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Transactions shape (first {BLOCK_LIMIT} blocks): {transactions_df.shape}")
print(f"Combined shape before features: {combined_df.shape}")

print("\nMissing values in combined metadata:")
print(combined_df.isnull().sum() / len(combined_df))

Basic EDA:
Train shape: (639557, 7)
Test shape: (478832, 5)
Transactions shape (first 50 blocks): (1279348, 17)
Combined shape before features: (1118389, 9)

Missing values in combined metadata:
Unnamed: 0        0.000000
mint              0.000000
slot_min          0.000000
slot_graduated    0.993380
has_graduated     0.428144
is_valid          0.000000
is_train          0.000000
decimals          0.208081
bundle_size       0.557353
dtype: float64


In [12]:
print("\nTarget Distribution:")
print(combined_df[TARGET].value_counts(normalize=True))


Target Distribution:
has_graduated
False    0.988423
True     0.011577
Name: proportion, dtype: float64


In [13]:
print("\nTransaction Data Info:")
print(f"Unique tokens: {transactions_df['base_coin'].nunique()}")
print(f"Date range: {transactions_df['block_time'].min()} to {transactions_df['block_time'].max()}")


Transaction Data Info:
Unique tokens: 141770
Date range: 2025-02-01 16:00:00 to 2025-02-09 20:09:24


# Feature Engineering

In [14]:
print("Starting Feature Engineering...")

grouped_tx = transactions_df.groupby('base_coin')

basic_agg = grouped_tx.agg({
    'tx_idx': 'count',
    'signing_wallet': 'nunique',
    'quote_coin_amount': ['sum', 'mean'],
    'base_coin_amount': ['sum', 'mean'],
}).reset_index()

basic_agg.columns = [MINT_ID] + ['_'.join(col).strip() if col[1] else col[0] for col in basic_agg.columns[1:]]

Starting Feature Engineering...


In [15]:
buy_tx = transactions_df[transactions_df['direction'] == 'buy']
sell_tx = transactions_df[transactions_df['direction'] == 'sell']

buy_counts = buy_tx.groupby('base_coin')['tx_idx'].count().reset_index()
buy_counts.columns = [MINT_ID, 'buy_count']

sell_counts = sell_tx.groupby('base_coin')['tx_idx'].count().reset_index()
sell_counts.columns = [MINT_ID, 'sell_count']

In [16]:
print("Merging aggregated features...")
combined_df = pd.merge(combined_df, basic_agg, on=MINT_ID, how='left')
combined_df = pd.merge(combined_df, buy_counts, on=MINT_ID, how='left')
combined_df = pd.merge(combined_df, sell_counts, on=MINT_ID, how='left')

Merging aggregated features...


In [17]:
combined_df['buy_sell_ratio'] = combined_df['buy_count'] / (combined_df['sell_count'] + 1e-6)

# Final Feature Selection

In [18]:
print("Selecting features...")
features_to_drop = [
    MINT_ID, TARGET, 'slot_graduated', 'is_train', 'slot_min'
]

features = [col for col in combined_df.columns if col not in features_to_drop and combined_df[col].dtype in ['int64', 'float64']]
features = [f for f in features if not combined_df[f].isnull().all()]

def clean_feature_names(df):
    df = df.copy()
    df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '_', regex=True)
    df.columns = df.columns.str.replace('__+', '_', regex=True)
    df.columns = df.columns.str.strip('_')
    return df

print(f"Using {len(features)} features")

train_processed = combined_df[combined_df['is_train'] == 1].reset_index(drop=True)
test_processed = combined_df[combined_df['is_train'] == 0].reset_index(drop=True)

X = train_processed[features].fillna(0)
y = train_processed[TARGET].astype(int)
X_test = test_processed[features].fillna(0)

X = clean_feature_names(X)
X_test = clean_feature_names(X_test)

Selecting features...
Using 12 features


In [19]:
print("Training LightGBM model...")
lgb_oof_preds = np.zeros(len(X))
lgb_test_preds = np.zeros(len(X_test))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.05,
        random_state=RANDOM_SEED + fold,
        verbose=-1
    )

    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(50, verbose=False)])

    val_preds = model.predict_proba(X_val)[:, 1]
    lgb_oof_preds[val_idx] = val_preds
    lgb_test_preds += model.predict_proba(X_test)[:, 1] / N_SPLITS

    print(f"Fold {fold+1} OOF LogLoss: {log_loss(y_val, val_preds)}")

overall_oof_logloss = log_loss(y, lgb_oof_preds)
print(f"\nOverall OOF LogLoss: {overall_oof_logloss}")


Training LightGBM model...
--- Fold 1/3 ---
Fold 1 OOF LogLoss: 0.05954657314903961
--- Fold 2/3 ---
Fold 2 OOF LogLoss: 0.059220084967338354
--- Fold 3/3 ---
Fold 3 OOF LogLoss: 0.059505805465997146

Overall OOF LogLoss: 0.05942415439979043


In [20]:
submission_df = pd.DataFrame({
    MINT_ID: test_processed[MINT_ID],
    TARGET: np.clip(lgb_test_preds, 0.0001, 0.9999)
})

submission_df.to_csv(SUBMISSION_FILE, index=False)
print(f"Submission saved to: {SUBMISSION_FILE}")
print("Script finished!")

Submission saved to: submission.csv
Script finished!
