In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'numpy.exceptions'

In [None]:
# Load the dataset.
# Forward slashes resolve on Windows, macOS and Linux alike, whereas a
# backslash-separated path only works on Windows.
df = pd.read_csv('resources/archive_for_multi_label/train.csv')

# Display basic information
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())

# Check label distribution: each label column is summed for the paper count and
# averaged for the share of papers carrying that label.
label_columns = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
print("\n\nLabel distribution:")
for col in label_columns:
    print(f"{col}: {df[col].sum()} papers ({df[col].mean()*100:.1f}%)")

In [None]:
# Prepare the data
# Combine TITLE and ABSTRACT for better context.
# Missing fields are replaced with empty strings first: string + NaN yields NaN,
# and a NaN entry is a float rather than text, so it could neither be sliced for
# the preview below nor tokenized afterwards.
df['text'] = df['TITLE'].fillna('') + ' ' + df['ABSTRACT'].fillna('')

# Extract features (X) and labels (y)
X = df['text'].values            # 1-D array, one combined string per row
y = df[label_columns].values     # 2-D array, one column per entry of label_columns

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("\nExample text (first 200 chars):")
print(X[0][:200])
print(f"\nCorresponding labels: {y[0]}")

In [None]:
# Hold out 20% of the samples for testing (only train.csv is used, as instructed).
# The fixed random_state makes the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, split_texts in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} samples: {len(split_texts)}")

In [None]:
# Text preprocessing with Tokenizer
max_words = 10000  # vocabulary cap handed to Tokenizer(num_words=...)
max_len = 200      # every sequence is padded/truncated to this many tokens

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer id sequences -> fixed-length matrix
# (shorter sequences are padded at the end, see padding='post').
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')

# Test split: the same two steps, reusing the tokenizer fitted above.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

print("Vocabulary size:", len(tokenizer.word_index))
print("X_train_pad shape:", X_train_pad.shape)
print("X_test_pad shape:", X_test_pad.shape)

In [None]:
# Build the multi-label classification model
num_classes = len(label_columns)
embedding_dim = 128

# Fix TensorFlow's global seed so weight initialisation and dropout masks are
# repeatable from run to run (same seed value as the train/test split).
tf.random.set_seed(42)

model = models.Sequential([
    # Declare the input shape with an explicit Input layer rather than
    # Embedding's `input_length` argument (deprecated in Keras 3). The model is
    # then built up front, so model.summary() below can report output shapes.
    layers.Input(shape=(max_len,)),
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim),
    layers.Dropout(0.3),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dense(num_classes, activation='sigmoid')  # Sigmoid for multi-label classification
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        # The bare string 'accuracy' is resolved by Keras from the output shape;
        # with more than one output unit it can be mapped to categorical
        # (argmax) accuracy, which is meaningless for independent labels.
        # BinaryAccuracy scores every label separately at a 0.5 threshold, and
        # naming it 'accuracy' keeps the history keys the plotting cell reads.
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        # Explicit names keep the history keys stable when this cell is re-run;
        # Keras otherwise auto-suffixes repeated instances (e.g. 'precision_1').
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
)

# Display model summary
model.summary()

In [None]:
# Train the model
epochs = 20
batch_size = 32

# All fit() settings gathered in one mapping; validation_split asks Keras to
# hold back 20% of the training arrays as validation data.
fit_settings = {
    'epochs': epochs,
    'batch_size': batch_size,
    'validation_split': 0.2,
    'verbose': 1,
}
history = model.fit(X_train_pad, y_train, **fit_settings)

print("\nTraining completed!")

In [None]:
# Plot training history: one panel per tracked metric, training vs. validation
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (history key, display name) pairs, listed in row-major panel order
panels = [
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
]

for ax, (metric_key, metric_name) in zip(axes.flat, panels):
    # The validation curve lives under the same key prefixed with 'val_'
    ax.plot(history.history[metric_key], label=f'Training {metric_name}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    ax.set_title(f'Model {metric_name}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_name)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Run the trained model on the padded test sequences
y_pred_prob = model.predict(X_test_pad)

# Binarise the probabilities: strictly greater than the threshold counts as 1
threshold = 0.5
y_pred = np.greater(y_pred_prob, threshold).astype(int)

print(f"Predictions shape: {y_pred.shape}")
print("\nExample prediction probabilities:")
print(y_pred_prob[0])
print(f"\nExample binary prediction: {y_pred[0]}")
print(f"Actual labels: {y_test[0]}")

In [None]:
# Calculate accuracy metrics (same quantities as the provided loop pattern,
# computed with whole-array NumPy operations instead of a Python double loop)
assert y_test.shape == y_pred.shape, "y_test and y_pred must have identical shapes"
num_rows_y_test, num_columns_y_test = y_test.shape

print(f"Number of test samples: {num_rows_y_test}")
print(f"Number of classes: {num_columns_y_test}")
print()

# Boolean matrix: True wherever the predicted label equals the true label
matches = (y_test == y_pred)

# Total number of matching (sample, class) cells, as a plain Python int
correct = int(matches.sum())
# Matching cells per class (column-wise count); kept as a float array
correct_classes = matches.sum(axis=0).astype(float)

print(f"Total correct predictions: {correct}")
print(f"Overall accuracy: {correct/(num_rows_y_test*num_columns_y_test):.4f}")
print()

# Per-class accuracy
print("Per-class accuracy:")
for label, class_hits in zip(label_columns, correct_classes):
    class_accuracy = class_hits / num_rows_y_test
    print(f"  {label}: {class_accuracy:.4f} ({int(class_hits)}/{num_rows_y_test})")

print()
# argmax returns the first index on ties, so the earliest class wins a draw
best_class_idx = correct_classes.argmax()
print(f"Best accuracy is for class: '{label_columns[best_class_idx]}'")
print(f"Accuracy: {correct_classes[best_class_idx]/num_rows_y_test:.4f}")

In [None]:
# Additional evaluation metrics
from sklearn.metrics import classification_report, hamming_loss, jaccard_score

print("="*60)
print("DETAILED CLASSIFICATION REPORT")
print("="*60)
print(classification_report(y_test, y_pred, target_names=label_columns, zero_division=0))

print("\n" + "="*60)
print("ADDITIONAL METRICS")
print("="*60)
print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.4f}")
# zero_division=0 mirrors the classification report above: a sample or class whose
# true and predicted label sets are both empty is scored 0 without a warning.
# 0 is also the value scikit-learn falls back to by default, so the reported
# numbers are unchanged.
print(f"Jaccard Score (samples): {jaccard_score(y_test, y_pred, average='samples', zero_division=0):.4f}")
print(f"Jaccard Score (macro): {jaccard_score(y_test, y_pred, average='macro', zero_division=0):.4f}")

In [None]:
# Test the model with a new abstract
def predict_topics(text, threshold=0.5):
    """
    Predict topics for a given abstract text and print a readable summary.

    The text is converted with the notebook's fitted ``tokenizer``, padded to
    ``max_len`` and scored by ``model``; results are reported in the order of
    ``label_columns``.

    Args:
        text: Raw text to classify.
        threshold: A topic counts as predicted when its probability is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        Tuple ``(pred_prob, pred_binary)``: the per-topic probabilities
        returned by the model and the same values thresholded to 0/1.
    """
    # Preprocess the text the same way as the training data
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    # The model receives a batch of one; [0] selects that single row
    pred_prob = model.predict(padded, verbose=0)[0]
    pred_binary = (pred_prob > threshold).astype(int)

    # Only show an ellipsis when the preview really is shorter than the text
    preview = text[:150]
    suffix = "..." if len(text) > 150 else ""
    print(f"Text: {preview}{suffix}\n")

    print("Predicted topics:")
    hits = [i for i, flag in enumerate(pred_binary) if flag == 1]
    for i in hits:
        print(f"  ✓ {label_columns[i]} (confidence: {pred_prob[i]:.3f})")
    if not hits:
        # Make an empty result explicit instead of printing a bare heading
        print(f"  (none above threshold {threshold})")

    print("\nAll probabilities:")
    for label, prob in zip(label_columns, pred_prob):
        # Text bar chart: 50 blocks correspond to a probability of 1.0
        bar = '█' * int(prob * 50)
        print(f"  {label:25s}: {bar} {prob:.3f}")

    return pred_prob, pred_binary

# Example: run the helper on the first held-out test sample
banner = "=" * 60
print(f"{banner}\nEXAMPLE PREDICTION\n{banner}")
example_text = X_test[0]
predict_topics(example_text)