In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline


In [2]:
import os

In [4]:
# Define a function to load labels and text from the dataset

def get_labels_and_texts(file):
def get_labels_and_texts(file):
    labels = []
    texts = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts

# Load data from the extracted .txt files
train_labels, train_texts = get_labels_and_texts('train.ft.txt')
test_labels, test_texts = get_labels_and_texts('test.ft.txt')

# Check the first few labels and texts
print(train_labels[:5])
print(train_texts[:5])


[1 1 1 1 1]
['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', "The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.", 'Amazing!: This soundtrack is my favorite mu

In [5]:
# Text normalization: cleaning the text data by removing non-alphanumeric characters

import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')

def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts

# Apply text normalization to the training and testing data
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [6]:
# Split the training data into training and validation sets

from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=57643892, test_size=0.2)

In [7]:
# Tokenization: Convert text data into numeric sequences for model input
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [8]:
# Padding: Ensure that all sequences are of the same length

MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [10]:
from tensorflow.keras import models, layers

# Building a CNN model for text classification
def build_model():
    # Input layer for the sequences (word indexes)
    sequences = layers.Input(shape=(MAX_LENGTH,))
    # Embedding layer to map word indexes into dense vectors
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    
    # First convolution layer followed by batch normalization and max-pooling
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    
    # Second convolution layer with larger kernel size
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    
    # Final convolution layer followed by global pooling
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    
    # Flattening the pooled output and adding dense layers for final classification
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation='relu')(x)
    
    # Output layer for binary classification with sigmoid activation
    predictions = layers.Dense(1, activation='sigmoid')(x)
    
    # Compile the model with optimizer, loss function, and evaluation metric
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model

# Build and compile the CNN model
model = build_model()

In [11]:
# Train the model on the training data and validate on validation data
model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=2,
    validation_data=(val_texts, val_labels), )

Epoch 1/2
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1148s[0m 51ms/step - binary_accuracy: 0.9215 - loss: 0.1968 - val_binary_accuracy: 0.9457 - val_loss: 0.1470
Epoch 2/2
[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1245s[0m 55ms/step - binary_accuracy: 0.9485 - loss: 0.1403 - val_binary_accuracy: 0.9492 - val_loss: 0.1391


<keras.src.callbacks.history.History at 0x3dfcbb150>

In [12]:
# Evaluate the model on the test data using accuracy, F1 score, and ROC AUC
preds = model.predict(test_texts)
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 4ms/step
Accuracy score: 0.9488
F1 score: 0.9487
ROC AUC score: 0.9874
