In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
pip install tf-models-official

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from official.nlp import optimization  # to create AdamW optimizer
import matplotlib.pyplot as plt

# Locations of the labelled CSV files on Kaggle.
test_path = '/kaggle/input/testset/testerset.csv'
train_path = '/kaggle/input/trainset/trainerset.csv'

# Load each split and discard rows that have no 'Text' value.
test_df = pd.read_csv(test_path)
test_df = test_df.dropna(subset=['Text'])
train_df = pd.read_csv(train_path)
train_df = train_df.dropna(subset=['Text'])

# Keep only rows whose text contains at least 35 whitespace-separated words.
test_is_long_enough = test_df['Text'].apply(lambda text: len(text.split()) >= 35)
test_df = test_df[test_is_long_enough]
train_is_long_enough = train_df['Text'].apply(lambda text: len(text.split()) >= 35)
train_df = train_df[train_is_long_enough]

# Pull the rows labelled 'Manifesto' out of each filtered split.
test_is_manifesto = test_df['Label'] == 'Manifesto'
manifesto_test = test_df[test_is_manifesto]
train_is_manifesto = train_df['Label'] == 'Manifesto'
manifesto_train = train_df[train_is_manifesto]

In [None]:
# Build a sample for each split that keeps every Manifesto row and tops it up with randomly
# drawn non-Manifesto rows, aiming for `n_sample` rows in total.
n_sample = 920
sample_seed = 32  # fixed so that Restart & Run All reproduces the same sample


def _sample_with_all_manifesto(df, manifesto_rows, n_total, seed):
    """Return `manifesto_rows` followed by randomly drawn non-Manifesto rows of `df`.

    Parameters:
    - df: DataFrame with a 'Label' column to draw the filler rows from.
    - manifesto_rows: The rows of `df` labelled 'Manifesto'; all of them are kept.
    - n_total: Target size of the result.
    - seed: Random state passed to `DataFrame.sample`.

    The filler rows are drawn only from rows whose Label is not 'Manifesto'. Drawing from
    the whole frame could re-select Manifesto rows that are already included, duplicating
    them in the sample (and, after splitting, on both sides of the train/validation split).
    """
    other_rows = df[df['Label'] != 'Manifesto']
    # Clamp the request: a negative count (more Manifesto rows than n_total) or a count
    # larger than the available pool would make sample() raise.
    n_fill = min(max(n_total - len(manifesto_rows), 0), len(other_rows))
    return pd.concat([manifesto_rows, other_rows.sample(n=n_fill, random_state=seed)])


test_sample = _sample_with_all_manifesto(test_df, manifesto_test, n_sample, sample_seed)
train_sample = _sample_with_all_manifesto(train_df, manifesto_train, n_sample, sample_seed)

# Prepare labels and text for the model: 1 for 'Manifesto', 0 for everything else.
x_train = train_sample['Text'].values
y_train = (train_sample['Label'] == 'Manifesto').astype(int).values
x_test = test_sample['Text'].values
y_test = (test_sample['Label'] == 'Manifesto').astype(int).values

# Hold out 30% of the training sample for validation, preserving the class ratio.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.3, random_state=32, stratify=y_train)

In [None]:
def display_class_distribution(df, column_name='Label'):
    """
    Print how many rows of a DataFrame belong to each class.

    One line is printed per distinct value of the chosen column, in the order
    produced by `value_counts()`, followed by a dashed separator line.

    Parameters:
    - df: DataFrame containing the data.
    - column_name: Name of the column containing class labels.
    """
    tallies = df[column_name].value_counts()

    for class_label, n_rows in zip(tallies.index, tallies.tolist()):
        print(f"Total rows of '{class_label}': {n_rows}")

    print("------------------------")

# Report the class balance of the training data first, then of the test data.
for split_df in (train_df, test_df):
    display_class_distribution(split_df)

In [None]:
def build_classifier_model():
    """Assemble the text classifier: raw string -> BERT preprocessing -> BERT -> one logit.

    The model takes a batch of raw strings, runs them through the TF Hub preprocessing
    layer and a trainable small-BERT encoder, applies dropout to the encoder's
    'pooled_output', and finishes with a single-unit Dense layer that has no activation
    and an L2 kernel regularizer.

    Returns:
    - An uncompiled `tf.keras.Model` mapping the 'text' input to the classifier output.
    """
    preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
    encoder_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenized = hub.KerasLayer(preprocess_url, name='preprocessing')(raw_text)
    # trainable=True so the encoder weights are updated during fine-tuning.
    bert_outputs = hub.KerasLayer(encoder_url, trainable=True, name='BERT_encoder')(tokenized)

    pooled = bert_outputs['pooled_output']
    regularized = tf.keras.layers.Dropout(0.3)(pooled)
    logit = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        name='classifier',
    )(regularized)
    return tf.keras.Model(raw_text, logit)

# Instantiate the network.
classifier_model = build_classifier_model()

# AdamW optimizer from the Model Garden helper, with a warm-up phase.
# NOTE(review): the step counts are hard-coded rather than derived from the dataset size,
# batch size and epoch count used for training — confirm they match the intended schedule.
optimizer = optimization.create_optimizer(
    init_lr=3e-5,
    num_train_steps=1000,
    num_warmup_steps=200,
    optimizer_type='adamw',
)

# Loss and metric both operate on raw model outputs: the loss is told the outputs are
# logits, and the accuracy metric thresholds them at 0.
bce_on_logits = tf.keras.losses.BinaryCrossentropy(from_logits=True)
accuracy_on_logits = tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0)
classifier_model.compile(
    optimizer=optimizer,
    loss=bce_on_logits,
    metrics=[accuracy_on_logits],
)

# Stop once validation accuracy has not improved for 2 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=2,
    mode='max',
    restore_best_weights=True,
)

In [None]:
# Fine-tune for at most 30 epochs; the early-stopping callback may end training sooner.
history = classifier_model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    batch_size=52,
    epochs=30,
    callbacks=[early_stopping],
)

In [None]:
# Per-epoch metrics recorded by `fit`.
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# One x-axis tick per completed epoch, starting at 1.
epochs = range(1, len(acc) + 1)

# Two stacked panels: loss on top, accuracy below.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# 'r' draws a solid red line, 'b' a solid blue line.
loss_ax.plot(epochs, loss, 'r', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epochs, acc, 'r', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout must run after the axes, titles and labels exist; calling it on the empty
# figure (as was done before) cannot account for them.
fig.tight_layout()
plt.show()

In [None]:
# Score the fine-tuned model on the test sample and report loss and accuracy.
test_inputs = tf.constant(x_test)
eval_results = classifier_model.evaluate(test_inputs, y_test, return_dict=True)
for title, metric_key in (("Test Loss", 'loss'), ("Test Accuracy", 'binary_accuracy')):
    print(f"{title}: {eval_results[metric_key]}")

In [None]:
# Predict on a new input
def predict_on_input(model, input_text, threshold=0.8):
    """
    Classify a single text as 'Manifesto' or 'Nil'.

    Parameters:
    - model: Object whose `predict` method returns one raw logit per input text
      (the classifier built in this notebook ends in a Dense layer without activation).
    - input_text: The text to classify.
    - threshold: Probability that must be exceeded to answer 'Manifesto'.

    Returns:
    - 'Manifesto' if the sigmoid of the predicted logit is above `threshold`, else 'Nil'.
    """
    # Object dtype mirrors the arrays used for training (`DataFrame[...].values`).
    batch = np.array([input_text], dtype=object)
    # predict() returns an array with one row per input; take the single scalar out of it
    # instead of truth-testing an array.
    logit = float(np.ravel(model.predict(batch))[0])
    # The model outputs a logit, not a probability, so squash it before thresholding.
    # This tanh form of the sigmoid cannot overflow for large |logit|.
    probability = 0.5 * (1.0 + np.tanh(0.5 * logit))
    return 'Manifesto' if probability > threshold else 'Nil'

# Ask the user for a text, classify it, and show the resulting label.
user_input = input("Enter the text for prediction: ")
user_input_label = predict_on_input(classifier_model, user_input)
print(user_input_label)

# For CSV prediction
def predict_on_csv(model, csv_path,
                   result_path="/kaggle/working/predicted_results.csv",
                   threshold=0.8):
    """
    Classify every row of a CSV file and save the labelled rows to a new CSV file.

    Parameters:
    - model: Object whose `predict` method returns one raw logit per input text.
    - csv_path: Path of the CSV file to read, e.g.
      '/kaggle/input/testset/testingnew (2).csv'. It must contain a 'Text' column.
    - result_path: Where the result is written; defaults to the /kaggle/working/ directory.
    - threshold: Probability that must be exceeded to label a row 'Manifesto'.

    Returns:
    - The DataFrame that was written, including the new 'Predicted_Label' column.
    """
    # Rows without text cannot be scored, so drop them (as is done for the training data).
    df = pd.read_csv(csv_path).dropna(subset=['Text'])

    if len(df) > 0:
        logits = np.asarray(model.predict(df['Text'].to_numpy()), dtype=float).ravel()
    else:
        logits = np.empty(0, dtype=float)

    # The model outputs logits, not probabilities, so squash them before thresholding.
    # This tanh form of the sigmoid cannot overflow for large |logit|.
    probabilities = 0.5 * (1.0 + np.tanh(0.5 * logits))
    labels = np.where(probabilities > threshold, 'Manifesto', 'Nil')

    df = df.assign(Predicted_Label=labels)
    df.to_csv(result_path, index=False)
    return df

# `csv_path` was never assigned anywhere, so the call below raised NameError.
# NOTE(review): this is the file name that appears in this notebook's CSV-prediction code,
# written as an absolute Kaggle path — confirm it is the intended input.
csv_path = '/kaggle/input/testset/testingnew (2).csv'
predict_on_csv(classifier_model, csv_path)

In [None]:

# Generate predictions for the test set. `predict` feeds the data through the model in
# mini-batches, so the whole test sample does not go through BERT in one forward pass.
# The model outputs logits; the sigmoid turns them into probabilities.
test_logits = classifier_model.predict(tf.constant(x_test))
predictions = tf.sigmoid(test_logits).numpy()

# Convert probabilities to binary labels: 1 ('Manifesto') when above the threshold.
# The value lives in one named variable so the comment and the code cannot drift apart.
threshold = 0.8
predicted_labels = (predictions > threshold).astype(int).flatten()

label_mapping = {1: 'Manifesto', 0: 'Nil'}
predicted_labels_text = [label_mapping[label] for label in predicted_labels]

# One row per test text: the true label (0/1), the predicted label and its probability.
result_df = pd.DataFrame({
    'Original Text': x_test,
    'Original Label': y_test,
    'Predicted Label': predicted_labels_text,
    'Confidence Score': predictions.flatten()
})

# Save the DataFrame to a CSV file
result_df.to_csv('/kaggle/working/predictions.csv', index=False)

In [None]:
# Generate predictions for the test set. `predict` feeds the data through the model in
# mini-batches, so the whole test sample does not go through BERT in one forward pass.
# The model outputs logits; the sigmoid turns them into probabilities.
test_logits = classifier_model.predict(tf.constant(x_test))
predictions = tf.sigmoid(test_logits).numpy()

# Convert probabilities to binary labels: 1 ('Manifesto') when above the threshold.
# The value lives in one named variable so the comment and the code cannot drift apart.
threshold = 0.7
predicted_labels = (predictions > threshold).astype(int).flatten()

label_mapping = {1: 'Manifesto', 0: 'Nil'}
predicted_labels_text = [label_mapping[label] for label in predicted_labels]

# One row per test text, with both the true and the predicted label spelled out.
result_df = pd.DataFrame({
    'Original Text': x_test,
    'Original Label': [label_mapping[label] for label in y_test],
    'Predicted Label': predicted_labels_text,
    'Confidence Score': predictions.flatten()
})

# Identify incorrect predictions: positions where the predicted 0/1 label differs from the truth.
incorrect_predictions_indices = np.flatnonzero(predicted_labels != y_test)
incorrect_df = result_df.iloc[incorrect_predictions_indices]

# Save the incorrect predictions DataFrame to a CSV file
incorrect_df.to_csv('/kaggle/working/incorrect_predictions.csv', index=False)

# Save the full result DataFrame to a CSV file
result_df.to_csv('/kaggle/working/predictions.csv', index=False)

In [None]:
# Export the fine-tuned model (without optimizer state) to a directory named after the
# dataset; any '/' in the name is replaced so it cannot introduce extra path components.
dataset_name = 'Manifesto'
safe_dataset_name = dataset_name.replace('/', '_')
saved_model_path = f'./{safe_dataset_name}_bert'

classifier_model.save(saved_model_path, include_optimizer=False)