In [5]:
import numpy as np
import warnings
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd

# Suppress warnings: an "ignore" filter with no category or module argument,
# so it applies to every warning issued through Python's warnings module
# for the rest of the session.
warnings.filterwarnings("ignore")


In [6]:
# Cell 2: Load and inspect the dataset
column_names = ['label', 'title', 'review']

# The CSVs are read without a header row, so columns are named afterwards.
train_df = pd.read_csv('processed_train_df.csv', header=None)
train_df.columns = list(column_names)

test_df = pd.read_csv('processed_test_df.csv', header=None)
test_df.columns = list(column_names)

# Show the first rows of each split as a quick sanity check.
for heading, frame in (
    ("Training Data Sample:", train_df),
    ("\nTesting Data Sample:", test_df),
):
    print(heading)
    print(frame.head())


Training Data Sample:
   label                            title  \
0      2           stuning even non gamer   
1      2    best soundtrack ever anything   
2      2                          amazing   
3      2             excellent soundtrack   
4      2  remember pull jaw floor hearing   

                                              review  
0  sound track beautiful paint senery mind well w...  
1  reading lot review saying best game soundtrack...  
2  soundtrack favorite music time hand intense sa...  
3  truly like soundtrack enjoy video game music p...  
4  played game know divine music every single son...  

Testing Data Sample:
   label                                            title  \
0      2                                         great cd   
1      2  one best game music soundtrack game really play   
2      1                         battery died within year   
3      2                     work fine maha energy better   
4      2                             great non aud

In [7]:
# Cell 3: Preprocess the data
def preprocess_data(train_df, test_df):
    """
    Drop rows without a review and convert the raw 1/2 labels to binary 0/1.

    Parameters:
    - train_df (pd.DataFrame): Training data with 'label' and 'review' columns.
    - test_df (pd.DataFrame): Testing data with 'label' and 'review' columns.

    Returns:
    - tuple: (train_df, test_df) as new DataFrames with integer labels
      (raw 2 -> 1, raw 1 -> 0). The frames passed in are left unmodified.

    Raises:
    - ValueError: If a training row (after dropping missing reviews) has a
      label other than 1 or 2. This also triggers when the function is run a
      second time on frames whose labels were already mapped.
    """
    label_map = {2: 1, 1: 0}

    # Drop rows with missing reviews
    train_kept = train_df.dropna(subset=['review'])
    test_kept = test_df.dropna(subset=['review'])

    # Map training labels and fail loudly on anything outside the mapping,
    # instead of letting astype(int) trip over NaN with an opaque message.
    train_labels = train_kept['label'].map(label_map)
    unmapped = train_labels.isna()
    if unmapped.any():
        offending = train_kept.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            f"Unexpected training labels {offending}; expected only {sorted(label_map)}"
        )

    # NOTE: unmapped test labels are deliberately kept and treated as class 0,
    # which counts them as negatives in any later evaluation.
    test_labels = test_kept['label'].map(label_map).fillna(0)

    # assign() builds new frames, so no column is ever written onto the
    # result of dropna() (the chained-assignment pattern pandas warns about).
    train_out = train_kept.assign(label=train_labels.astype(int))
    test_out = test_kept.assign(label=test_labels.astype(int))

    return train_out, test_out

# Clean both splits. This rebinds train_df/test_df to the processed frames,
# so the cell is meant to run once per freshly loaded dataset.
train_df, test_df = preprocess_data(train_df, test_df)

# Extract features and labels
X_train = train_df['review']
y_train = train_df['label']
X_test = test_df['review']
y_test = test_df['label']

# Verify preprocessing
print("Unique labels in training set:", y_train.unique())
print("Unique labels in testing set:", y_test.unique())


Unique labels in training set: [1 0]
Unique labels in testing set: [1 0]


In [8]:
# Cell 4: Tokenize and pad sequences
import pickle
def preprocess_text(X_train, X_test, vocab_size, max_length):
    """
    Turn raw review text into fixed-length integer sequences.

    Parameters:
    - X_train: Training reviews; must support .fillna() and .astype().
    - X_test: Testing reviews; same requirements as X_train.
    - vocab_size (int): Passed to the tokenizer as num_words.
    - max_length (int): Passed to pad_sequences as maxlen.

    Returns:
    - tuple: (X_train_pad, X_test_pad, tokenizer). The tokenizer is fitted on
      the training texts only and then applied to both splits.
    """
    # Missing entries become empty strings and everything is coerced to str.
    train_texts = X_train.fillna("").astype(str)
    test_texts = X_test.fillna("").astype(str)

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(train_texts)

    def _encode(texts):
        # Texts -> integer sequences -> post-padded array of width max_length.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_length, padding='post')

    train_padded = _encode(train_texts)
    test_padded = _encode(test_texts)

    return train_padded, test_padded, tokenizer

# Define parameters
VOCAB_SIZE = 20000  # forwarded to preprocess_text as vocab_size
MAX_LENGTH = 194  # forwarded to preprocess_text as max_length

# Preprocess text data
X_train_pad, X_test_pad, tokenizer = preprocess_text(
    X_train,
    X_test,
    vocab_size=VOCAB_SIZE,
    max_length=MAX_LENGTH,
)

# Verify padded sequences
print("Shape of X_train_pad:", X_train_pad.shape)
print("Shape of X_test_pad:", X_test_pad.shape)

# Write the fitted tokenizer to disk.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

Shape of X_train_pad: (3599904, 194)
Shape of X_test_pad: (399989, 194)


In [9]:
# Cell 5: Define the Keras model generator class
class KerasModelGenerator:
    """
    Holds the hyperparameters of the review classifier and builds it on demand.
    """
    def __init__(self, vocab_size, embedding_dim, max_length):
        """
        Store the settings used later by build().

        Parameters:
        - vocab_size (int): Size of the vocabulary for the embedding layer.
        - embedding_dim (int): Number of dimensions for word embeddings.
        - max_length (int): Maximum sequence length.
        """
        self.model = None  # set by build()
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

    def build(self):
        """
        Create, compile and return a fresh Keras Sequential model.

        The network is an embedding layer flattened into two ReLU dense layers
        (128 and 64 units, each followed by dropout) and a single sigmoid
        output unit. It is compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric. The model is also stored
        on self.model.
        """
        layer_stack = [
            Embedding(
                input_dim=self.vocab_size,
                output_dim=self.embedding_dim,
                input_length=self.max_length,
            ),
            Flatten(),
            Dense(128, activation='relu'),
            Dropout(0.5),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='sigmoid'),
        ]
        self.model = Sequential(layer_stack)
        self.model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        return self.model


In [10]:
# Cell 6: Train and evaluate with k-fold cross-validation
def train_and_evaluate_with_kfolds(X, y, vocab_size, embedding_dim, max_length, n_splits=5, epochs=5, batch_size=128):
    """
    Train a fresh model on each K-fold split and report validation accuracy.

    For every fold a new model is built with KerasModelGenerator, trained on
    the fold's training part, saved to 'model_fold_<fold>.h5', and scored on
    the fold's validation part.

    Parameters:
    - X: Padded input sequences, one row per sample (array-like).
    - y: Binary labels aligned with X (array-like; a pandas Series is accepted).
    - vocab_size (int): Vocabulary size for the embedding layer.
    - embedding_dim (int): Embedding dimensionality.
    - max_length (int): Sequence length expected by the model.
    - n_splits (int): Number of folds.
    - epochs (int): Training epochs per fold.
    - batch_size (int): Training batch size.

    Returns:
    - dict: {"accuracy": mean validation accuracy across folds}.
    """
    # Imported once here rather than on every fold iteration.
    import gc
    from tensorflow.keras import backend as K

    # KFold yields positional indices. Coercing to NumPy arrays guarantees
    # X[idx] / y[idx] select by position too, even if a pandas Series whose
    # index has gaps (e.g. after dropna) is passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
        print(f"\nStarting Fold {fold}...")

        # Split data into training and validation for this fold
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Build the model
        keras_generator = KerasModelGenerator(vocab_size, embedding_dim, max_length)
        model = keras_generator.build()

        # Train the model
        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            verbose=1
        )
        model.save(f'model_fold_{fold}.h5')

        # Evaluate on validation data. predict() returns one column per output
        # unit, so flatten to 1-D to match the shape of y_val.
        y_val_pred_prob = model.predict(X_val)
        y_val_pred = (y_val_pred_prob > 0.5).astype(int).ravel()

        # Collect metrics
        fold_accuracy = accuracy_score(y_val, y_val_pred)
        fold_metrics.append(fold_accuracy)

        print(f"Fold {fold} Accuracy: {fold_accuracy:.4f}")
        print(f"Classification Report for Fold {fold}:\n", classification_report(y_val, y_val_pred, target_names=['Negative', 'Positive']))

        # Drop our references to the network first; otherwise gc.collect()
        # cannot reclaim it until the next fold rebinds these names.
        del model, keras_generator
        K.clear_session()
        gc.collect()

    # Compute average metrics
    avg_metrics = {
        "accuracy": np.mean(fold_metrics)
    }
    print("\nAverage Metrics Across Folds:")
    print(avg_metrics)
    return avg_metrics



In [11]:
# Cell 7: Call k-fold training and evaluation
y_train = np.array(y_train)  # Ensure y_train is a NumPy array

# Settings for this cross-validation run, gathered in one place.
cv_settings = {
    "vocab_size": VOCAB_SIZE,
    "embedding_dim": 128,
    "max_length": MAX_LENGTH,
    "n_splits": 2,  # Number of folds
    "epochs": 3,  # Number of epochs per fold
    "batch_size": 128,  # Batch size
}

# Train with k-fold cross-validation
avg_metrics = train_and_evaluate_with_kfolds(X_train_pad, y_train, **cv_settings)

print("\nFinal Average Metrics Across All Folds:")
print(avg_metrics)



Starting Fold 1...


2024-12-05 18:24:39.871317: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 18:24:39.890825: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 18:24:39.890857: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 18:24:39.891758: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

Epoch 1/3
   36/14063 [..............................] - ETA: 1:04 - loss: 0.7186 - accuracy: 0.5015

2024-12-05 18:24:42.047175: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




2024-12-05 18:25:37.290706: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1396762752 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


2024-12-05 18:28:16.950987: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1396762752 exceeds 10% of free system memory.


Fold 1 Accuracy: 0.8757
Classification Report for Fold 1:
               precision    recall  f1-score   support

    Negative       0.88      0.87      0.88    899404
    Positive       0.88      0.88      0.88    900548

    accuracy                           0.88   1799952
   macro avg       0.88      0.88      0.88   1799952
weighted avg       0.88      0.88      0.88   1799952


Starting Fold 2...


2024-12-05 18:28:49.802602: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1396762752 exceeds 10% of free system memory.


Epoch 1/3

2024-12-05 18:29:48.773127: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1396762752 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3
Fold 2 Accuracy: 0.8751
Classification Report for Fold 2:
               precision    recall  f1-score   support

    Negative       0.86      0.89      0.88    900547
    Positive       0.89      0.86      0.87    899405

    accuracy                           0.88   1799952
   macro avg       0.88      0.88      0.88   1799952
weighted avg       0.88      0.88      0.88   1799952


Average Metrics Across Folds:
{'accuracy': 0.8753925104669458}

Final Average Metrics Across All Folds:
{'accuracy': 0.8753925104669458}
