In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import bz2
import re

# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Read the training file directly from its bz2 archive.
# Each line is presumably "__label__<digit> <text>" (fastText format -- confirm
# against the data): the label digit is read at index 9 and the review text
# starts at index 11.
y = []
x = []

# The context manager closes the archive handle even if decoding/parsing fails.
with bz2.BZ2File('Data/train.ft.txt.bz2') as archive:
    for raw_line in archive:
        text = raw_line.decode('utf-8')
        y.append(int(text[9]) - 1)    # shift the label digit down by one
        x.append(text[11:].strip())   # drop the label prefix and the trailing newline

df = pd.DataFrame({'Label': y, 'Review': x})

print(f'{df.shape = }')
display(df.head())

df.shape = (3600000, 2)


Unnamed: 0,Label,Review
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [3]:
# For performance issues: keep only the first 500,000 reviews.
# .copy() makes the subset an independent frame, so the later in-place column
# assignment (df['Review'] = ...) does not write into a slice of the full frame,
# which pandas flags with SettingWithCopyWarning.
df = df.iloc[:500000].copy()

In [4]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [5]:
# Text cleaning:
# - change all text to lowercase
# - replace every character that is not a lowercase ASCII letter or whitespace
#   (punctuation, numbers...) with a space
# The pattern is compiled once here rather than on every row inside the lambda.
non_letter_pattern = re.compile(r"[^a-z\s]")
df['Review'] = df['Review'].apply(lambda review: non_letter_pattern.sub(" ", review.lower()))

In [6]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Review'],
    df['Label'],
    test_size=0.2,
    random_state=1,
)

print(f'{X_train.shape = }, {X_test.shape = }')

X_train.shape = (400000,), X_test.shape = (100000,)


In [7]:
# Vocabulary size used by the vectorizer and by the embedding layer.
max_vocab = 1000

# Longest training review, measured in whitespace-separated tokens.
# split() with no argument splits on runs of whitespace, matching the
# vectorizer's split='whitespace'. split(' ') would count an empty "token" for
# every extra space left behind by the cleaning step and inflate the padded
# sequence length.
max_length = max(len(review.split()) for review in X_train)

In [8]:
# Maps raw review strings to integer token-id sequences of length max_length,
# using a vocabulary capped at max_vocab entries.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    max_tokens=max_vocab,
    output_mode='int',
    output_sequence_length=max_length,
)

# Learn the vocabulary from the training reviews only.
vectorize_layer.adapt(X_train)

In [9]:
def build_model():
    """Build and compile the binary review classifier.

    Layer stack: string input -> ``vectorize_layer`` (token ids) -> 128-dim
    embedding -> Conv1D (64 filters, kernel size 3, ReLU) -> batch
    normalization -> global max pooling -> Dense(20, ReLU) -> Dense(1, sigmoid).

    Relies on the notebook-level ``vectorize_layer`` and ``max_vocab``.

    Returns:
        A compiled ``tf.keras`` Sequential model using the RMSprop optimizer,
        binary cross-entropy loss and the binary accuracy metric.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype='string'))
    model.add(vectorize_layer)
    # `input_length` is deprecated on Embedding; the sequence length is already
    # fixed by the vectorizer's output, so it is not passed here.
    model.add(tf.keras.layers.Embedding(input_dim=max_vocab, output_dim=128, embeddings_initializer='uniform'))
    model.add(tf.keras.layers.Conv1D(64, 3, activation='relu'))
    model.add(tf.keras.layers.BatchNormalization())
    # Global max pooling already yields a 2-D (batch, filters) tensor, so no
    # Flatten layer is needed before the dense head.
    model.add(tf.keras.layers.GlobalMaxPool1D())
    model.add(tf.keras.layers.Dense(20, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])

    return model

In [10]:
# Train for two epochs, scoring the held-out split at the end of each one.
model = build_model()
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x1ce43ce29b0>