In [6]:
pip install tabpfn

Collecting torch>=1.9.0
  Using cached torch-2.2.2-cp38-none-macosx_10_9_x86_64.whl (150.6 MB)
Installing collected packages: torch
Successfully installed torch-2.2.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [7]:
import torch
from tabpfn import TabPFNClassifier

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load the model
clf = TabPFNClassifier(device=device)

Using device: cpu


In [8]:
df=pd.read_csv('star_classification.csv')
#feature selection
df = df.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','spec_obj_ID','plate','MJD','fiber_ID'], axis='columns')

#cleaning the data
df = df[(df['u'] >= 0)]
#df = df[(df['g'] >= 0)]
#df = df[(df['z'] >= 0)]

print(df.describe())
print(df.head(10))

                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean      22.080679     20.631583     19.645777     19.084865     18.768988   
std        2.251068      2.037384      1.854763      1.757900      1.765982   
min       10.996230     10.498200      9.822070      9.469903      9.612333   
25%       20.352410     18.965240     18.135795     17.732280     17.460830   
50%       22.179140     21.099930     20.125310     19.405150     19.004600   
75%       23.687480     22.123775     21.044790     20.396510     19.921120   
max       32.781390     31.602240     29.571860     32.141470     29.383740   

           redshift  
count  99999.000000  
mean       0.576667  
std        0.730709  
min       -0.009971  
25%        0.054522  
50%        0.424176  
75%        0.704172  
max        7.011245  
          u         g         r         i         z   class  redshift
0  23.87882  22.2753

In [9]:
#turning the Class values into categorical data
df=df.rename(columns = {'class':'Class'})
df.Class = df.Class.astype('category')
cat_columns = df.select_dtypes(['category']).columns
cat_columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)


In [10]:
# raw data GradientBoostingClassifier
## train test split
df_sampled = df.sample(n=1000, random_state=42)

X = df_sampled.drop('Class',axis='columns')
y = np.array(df_sampled['Class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_test[:10])

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

# Train the TabPFN model 
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[2 0 1 0 0 1 0 2 2 0]




0.98
[[110   1   0]
 [  1  43   0]
 [  2   0  43]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       111
           1       0.98      0.98      0.98        44
           2       1.00      0.96      0.98        45

    accuracy                           0.98       200
   macro avg       0.98      0.97      0.98       200
weighted avg       0.98      0.98      0.98       200



In [18]:
print(torch.cuda.is_available())  # Should return True if GPU is available
print(torch.version.cuda)         # Check installed CUDA version
print(torch.__version__)          # Check PyTorch version

False
None
2.2.2


  and should_run_async(code)


In [27]:
df_sampled = df.sample(n=10000, random_state=42).reset_index(drop=True)

X_sampled = df_sampled.drop('Class',axis='columns')
y_sampled = np.array(df_sampled['Class'])

batch_size = 1000
num_batches = len(X_sampled) // batch_size

# Initialize lists to store results
all_y_true = []
all_y_pred = []

# Loop through batches
for i in range(num_batches):
    print(f"Processing batch {i+1}/{num_batches}...")

    # Get batch indices
    start_idx = i * batch_size
    end_idx = start_idx + batch_size

    # Extract batch
    X_batch = X_sampled[start_idx:end_idx]
    y_batch = y_sampled[start_idx:end_idx]

    # Train-test split for this batch
    X_train, X_test, y_train, y_test = train_test_split(X_batch, y_batch, test_size=0.2, random_state=42)

    # Train TabPFN
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)

    # Store results
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    
    
print(accuracy_score(all_y_true, all_y_pred))
print(metrics.confusion_matrix(all_y_pred, all_y_true))
print(classification_report(all_y_pred, all_y_true))

  and should_run_async(code)


Processing batch 1/10...
Processing batch 2/10...




Processing batch 3/10...




Processing batch 4/10...




Processing batch 5/10...




Processing batch 6/10...




Processing batch 7/10...




Processing batch 8/10...




Processing batch 9/10...




Processing batch 10/10...




0.961
[[1128   37    5]
 [  15  376    0]
 [  19    2  418]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1170
           1       0.91      0.96      0.93       391
           2       0.99      0.95      0.97       439

    accuracy                           0.96      2000
   macro avg       0.95      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [11]:
#normalizing

X = df.drop('Class', axis=1)
y = df['Class']

#normalize features
scaler = preprocessing.MinMaxScaler()  #default range [0, 1]
X_normalized = scaler.fit_transform(X)

#put everything together in a pandas dataframe, list
X_normalized = pd.DataFrame(X_normalized, columns=X.columns)
df_normalized = pd.concat([X_normalized, y.reset_index(drop=True)], axis=1)

print(df_normalized.head(10))
print(df_normalized.describe())

          u         g         r         i         z  redshift  Class
0  0.591347  0.558050  0.535344  0.427665  0.464377  0.091831      0
1  0.632603  0.584423  0.646203  0.515986  0.607035  0.112389      0
2  0.654888  0.576463  0.546218  0.435729  0.472194  0.093170      0
3  0.511384  0.629186  0.596946  0.486717  0.487460  0.134210      0
4  0.387463  0.335579  0.337999  0.287021  0.300043  0.017959      0
5  0.573420  0.608393  0.582279  0.475761  0.502398  0.204328      1
6  0.480763  0.505971  0.562346  0.491292  0.546921  0.084946      1
7  0.516570  0.546034  0.532623  0.441877  0.467223  0.069358      0
8  0.615402  0.561906  0.546246  0.440860  0.472712  0.095423      0
9  0.493476  0.451891  0.473598  0.412337  0.457321  0.001419      2
                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean       0.508807      0.480163      0.497408      0.424098      0.463126   
std 

  and should_run_async(code)
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they. (Deprecated NumPy 1.22)
  return np.percentile(values, q, axis=axis, interpolation=interpolation)
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they. (Deprecated NumPy 1.22)
  return np.percentile(values, q, axis=axis, interpolation=interpolation)


In [12]:
df_sampled = df_normalized.sample(n=10000, random_state=42).reset_index(drop=True)

X_sampled = df_sampled.drop('Class',axis='columns')
y_sampled = np.array(df_sampled['Class'])

batch_size = 1000
num_batches = len(X_sampled) // batch_size

# Initialize lists to store results
all_y_true = []
all_y_pred = []

# Loop through batches
for i in range(num_batches):
    print(f"Processing batch {i+1}/{num_batches}...")

    # Get batch indices
    start_idx = i * batch_size
    end_idx = start_idx + batch_size

    # Extract batch
    X_batch = X_sampled[start_idx:end_idx]
    y_batch = y_sampled[start_idx:end_idx]

    # Train-test split for this batch
    X_train, X_test, y_train, y_test = train_test_split(X_batch, y_batch, test_size=0.2, random_state=42)

    # Train TabPFN
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)

    # Store results
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    
    
print(accuracy_score(all_y_true, all_y_pred))
print(metrics.confusion_matrix(all_y_pred, all_y_true))
print(classification_report(all_y_pred, all_y_true))

  and should_run_async(code)


Processing batch 1/10...
Processing batch 2/10...




Processing batch 3/10...




Processing batch 4/10...




Processing batch 5/10...




Processing batch 6/10...




Processing batch 7/10...




Processing batch 8/10...




Processing batch 9/10...




Processing batch 10/10...




0.961
[[1128   37    5]
 [  15  376    0]
 [  19    2  418]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1170
           1       0.91      0.96      0.93       391
           2       0.99      0.95      0.97       439

    accuracy                           0.96      2000
   macro avg       0.95      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

