In [1]:
import pandas as pd
import os

data_dir = "/home/janani/Documents/pivony/multilabel-BERT/notebooks"
chunk_size = 10000  # Adjust this based on memory availability

# Create an empty list to store chunks
chunks = []

# Read the TSV file in chunks
for chunk in pd.read_csv(os.path.join(data_dir, "test.tsv"), sep='\t', chunksize=chunk_size):
    chunks.append(chunk)

# Concatenate all the chunks into a single DataFrame
df_data = pd.concat(chunks, ignore_index=True)



In [2]:
df_data

Unnamed: 0,astro-ph,cond-mat,cs,hep-ph,hep-th,math,physics,quant-ph,combined
0,0,0,1,0,0,0,0,0,HPIM-DM: a fast and reliable dense-mode multic...
1,0,1,0,0,0,0,0,0,Optical $N$-invariant of graphene's viscous Ha...
2,0,0,0,0,0,1,0,0,Splitting theorems for Poisson and related str...
3,1,0,0,0,0,0,0,0,Overview on spectral line source finding and v...
4,0,0,0,1,0,0,0,0,On the lepton-nucleon neutral and charged curr...
...,...,...,...,...,...,...,...,...,...
1495,0,0,0,0,0,1,0,0,A Flexible Procedure for Mixture Proportion Es...
1496,1,0,0,0,0,0,0,0,Reverberation by a Relativistic Accretion Disk...
1497,0,0,0,0,0,1,0,0,A New Preconditioner for the GeneRank Problem....
1498,0,0,0,0,0,0,1,0,Threadlike bundle of tubules running inside bl...


In [14]:
# Separate features and labels
X = df_data['combined']  
y = df_data.drop(columns=['combined'])

In [15]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the text data
def encode_texts(texts, tokenizer, model, max_length=128, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', max_length=max_length, truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings)

# Encode the text data
X_encoded = encode_texts(X.tolist(), tokenizer, model)
X_encoded = X_encoded.numpy()  # Convert to numpy array





In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [18]:
from tpot import TPOTClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize TPOT with a reduced configuration dictionary and parallelization
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
    n_jobs=-1,  # Use all available CPU cores
    early_stop=5,  # Stop if no improvement after 5 generations
    config_dict={
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 50],
            'max_features': [0.5, 0.7],
            'max_depth': [None, 10]
        },
        'sklearn.linear_model.LogisticRegression': {
            'C': [0.1, 1.0],
            'solver': ['liblinear']
        }
    }
)


In [20]:
from sklearn.metrics import accuracy_score, hamming_loss
# Wrap TPOT in a MultiOutputClassifier
multi_target_tpot = MultiOutputClassifier(tpot)

# Fit the model on the subset
multi_target_tpot.fit(X_train, y_train)

# Predict on the test set
y_pred = multi_target_tpot.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Calculate Hamming loss
hamming = hamming_loss(y_test, y_pred)
print(f'Hamming Loss: {hamming}')


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9658333333333333

Generation 2 - Current best internal CV score: 0.9658333333333333

Generation 3 - Current best internal CV score: 0.9658333333333333

Generation 4 - Current best internal CV score: 0.9674999999999999

Generation 5 - Current best internal CV score: 0.9674999999999999

Best pipeline: LogisticRegression(CombineDFs(CombineDFs(input_matrix, input_matrix), input_matrix), C=1.0, solver=liblinear)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9216666666666666

Generation 2 - Current best internal CV score: 0.9216666666666666

Generation 3 - Current best internal CV score: 0.9233333333333335

Generation 4 - Current best internal CV score: 0.925

Generation 5 - Current best internal CV score: 0.925

Best pipeline: RandomForestClassifier(LogisticRegression(input_matrix, C=1.0, solver=liblinear), max_depth=None, max_features=0.5, n_estimators=10)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9383333333333332

Generation 2 - Current best internal CV score: 0.9383333333333332

Generation 3 - Current best internal CV score: 0.9383333333333332

Generation 4 - Current best internal CV score: 0.9383333333333332

Generation 5 - Current best internal CV score: 0.9383333333333332

Best pipeline: LogisticRegression(input_matrix, C=0.1, solver=liblinear)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.95

Generation 2 - Current best internal CV score: 0.95

Generation 3 - Current best internal CV score: 0.95

Generation 4 - Current best internal CV score: 0.95

Generation 5 - Current best internal CV score: 0.95

Best pipeline: LogisticRegression(CombineDFs(input_matrix, input_matrix), C=1.0, solver=liblinear)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9358333333333334

Generation 2 - Current best internal CV score: 0.9358333333333334

Generation 3 - Current best internal CV score: 0.9358333333333334

Generation 4 - Current best internal CV score: 0.9358333333333334

Generation 5 - Current best internal CV score: 0.9358333333333334

Best pipeline: LogisticRegression(CombineDFs(input_matrix, input_matrix), C=0.1, solver=liblinear)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9024999999999999

Generation 2 - Current best internal CV score: 0.9033333333333333

Generation 3 - Current best internal CV score: 0.9033333333333333

Generation 4 - Current best internal CV score: 0.9033333333333333

Generation 5 - Current best internal CV score: 0.9033333333333333

Best pipeline: RandomForestClassifier(LogisticRegression(input_matrix, C=0.1, solver=liblinear), max_depth=None, max_features=0.7, n_estimators=50)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9025000000000001

Generation 2 - Current best internal CV score: 0.9025000000000001

Generation 3 - Current best internal CV score: 0.9025000000000001

Generation 4 - Current best internal CV score: 0.9025000000000001

Generation 5 - Current best internal CV score: 0.9025000000000001

Best pipeline: LogisticRegression(CombineDFs(input_matrix, input_matrix), C=1.0, solver=liblinear)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9683333333333334

Generation 2 - Current best internal CV score: 0.9683333333333334

Generation 3 - Current best internal CV score: 0.9683333333333334

Generation 4 - Current best internal CV score: 0.9683333333333334

Generation 5 - Current best internal CV score: 0.9683333333333334

Best pipeline: RandomForestClassifier(LogisticRegression(input_matrix, C=0.1, solver=liblinear), max_depth=10, max_features=0.7, n_estimators=50)
Accuracy: 0.6166666666666667
Hamming Loss: 0.06791666666666667
