In [1]:
import pandas as pd
from keras.models import load_model

In [6]:
df = pd.read_csv("/kaggle/input/deeplearing-project/top_10_tags_encoded.csv")


# Cross Validation

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Parameters
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300

# Tokenize
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Pad
X_tokenized_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels (assuming 'clean_text' is column 0 and the next 10 columns are one-hot encoded labels)
y = df.iloc[:, 1:].values

In [9]:
 !pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [10]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
import numpy as np

In [11]:
# Define parameters
n_splits = 5
batch_size = 128
epochs = 3  # for quick CV; you can increase

mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold = 1
f1_scores = []

for train_idx, val_idx in mskf.split(X_tokenized_padded, y):
    print(f"\n🔁 Fold {fold}")
    X_train, X_val = X_tokenized_padded[train_idx], X_tokenized_padded[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Model
    model = Sequential([
        Embedding(input_dim=MAX_NUM_WORDS, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),
        LSTM(64),
        Dense(10, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(X_val, y_val))

    # Predict and threshold at 0.5
    y_pred_probs = model.predict(X_val)
    y_pred_binary = (y_pred_probs >= 0.5).astype(int)

    f1 = f1_score(y_val, y_pred_binary, average='macro')
    print(f"🟢 F1 Score for Fold {fold}: {f1:.4f}")
    f1_scores.append(f1)
    fold += 1

print(f"\n✅ Average F1 Score: {np.mean(f1_scores):.4f}")


🔁 Fold 1




Epoch 1/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 17ms/step - accuracy: 0.3654 - loss: 0.2794 - val_accuracy: 0.8014 - val_loss: 0.1044
Epoch 2/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8170 - loss: 0.0944 - val_accuracy: 0.8267 - val_loss: 0.0867
Epoch 3/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8365 - loss: 0.0810 - val_accuracy: 0.8268 - val_loss: 0.0829
[1m4417/4417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step
🟢 F1 Score for Fold 1: 0.8437

🔁 Fold 2




Epoch 1/3
[1m4413/4413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 17ms/step - accuracy: 0.3290 - loss: 0.2924 - val_accuracy: 0.7738 - val_loss: 0.1199
Epoch 2/3
[1m4413/4413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8141 - loss: 0.0984 - val_accuracy: 0.8221 - val_loss: 0.0891
Epoch 3/3
[1m4413/4413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8356 - loss: 0.0816 - val_accuracy: 0.8311 - val_loss: 0.0816
[1m4406/4406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step
🟢 F1 Score for Fold 2: 0.8510

🔁 Fold 3




Epoch 1/3
[1m4411/4411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 17ms/step - accuracy: 0.3358 - loss: 0.2904 - val_accuracy: 0.7980 - val_loss: 0.1056
Epoch 2/3
[1m4411/4411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8118 - loss: 0.0973 - val_accuracy: 0.8275 - val_loss: 0.0846
Epoch 3/3
[1m4411/4411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8340 - loss: 0.0815 - val_accuracy: 0.8312 - val_loss: 0.0809
[1m4411/4411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step
🟢 F1 Score for Fold 3: 0.8569

🔁 Fold 4




Epoch 1/3
[1m4412/4412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 17ms/step - accuracy: 0.3742 - loss: 0.2747 - val_accuracy: 0.8051 - val_loss: 0.1040
Epoch 2/3
[1m4412/4412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8172 - loss: 0.0937 - val_accuracy: 0.8299 - val_loss: 0.0843
Epoch 3/3
[1m4412/4412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8359 - loss: 0.0805 - val_accuracy: 0.8354 - val_loss: 0.0801
[1m4408/4408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step
🟢 F1 Score for Fold 4: 0.8568

🔁 Fold 5




Epoch 1/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 17ms/step - accuracy: 0.2926 - loss: 0.3027 - val_accuracy: 0.8004 - val_loss: 0.1074
Epoch 2/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8094 - loss: 0.0987 - val_accuracy: 0.8301 - val_loss: 0.0859
Epoch 3/3
[1m4410/4410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.8342 - loss: 0.0815 - val_accuracy: 0.8344 - val_loss: 0.0818
[1m4414/4414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step
🟢 F1 Score for Fold 5: 0.8564

✅ Average F1 Score: 0.8530
