# In [None]:
import bz2
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the training data: each line of the bz2-compressed file is
# "<label token> <review text>".
labels = []
text = []

# bz2.open in text mode decompresses on the fly; a plain open() would hand
# back the raw compressed bytes instead of review lines.
with bz2.open('/notebooks/train.ft.txt.bz2', 'rt', encoding='utf-8') as f:
    for line in f:
        # Split once at the first space: label token on the left, text on the right
        label_token, _, review = line.partition(' ')
        if not review:
            # Blank or malformed line (nothing after the label): skip it
            continue

        # The class number is whatever follows the last underscore of the
        # label token, so both '_label_2' and '__label__2' parse to 2
        labels.append(int(label_token.rpartition('_')[2]))
        text.append(review.rstrip())

# Create a DataFrame from the labels and text
df = pd.DataFrame({'label': labels, 'text': text})

# Fetch the NLTK resources this script relies on
nltk_resources = ['punkt', 'wordnet', 'stopwords']
nltk.download(nltk_resources)

# Single lemmatizer instance reused for every review
lemmatizer = WordNetLemmatizer()

# English stopword list, held as a set for fast membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    The input is lower-cased, stripped of every character in
    ``string.punctuation``, and tokenized with ``word_tokenize``. Tokens found
    in the module-level ``stop_words`` are dropped; each remaining token is
    passed through the module-level ``lemmatizer``.

    Args:
        text: The raw review string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Lower-case first, then delete punctuation in a single translate pass
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept_lemmas = []
    for token in word_tokenize(cleaned):
        # The stopword check happens on the raw token, before lemmatization
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

# Run every review in the 'text' column through preprocess_text
df['text'] = df['text'].map(preprocess_text)

# Shift the labels down by one so they become 0 (negative) and 1 (positive)
df['label'] -= 1

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Initialize a tokenizer with the vocabulary capped via num_words; words
# outside it are mapped to the '<OOV>' token
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
# Build the vocabulary from the training split only
tokenizer.fit_on_texts(X_train)

# Convert each review into a sequence of integer word indices
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad the sequences: the training matrix sets the width, and the held-out
# split is padded/truncated to that same width
X_train_padded = pad_sequences(X_train_sequences, padding='post')
X_test_padded = pad_sequences(X_test_sequences, padding='post', maxlen=X_train_padded.shape[1])

# Define the model: embedding -> 1D convolution -> global max pooling -> dense head
model = Sequential([
    # Vocabulary size is read from the tokenizer so the two cannot drift apart
    Embedding(tokenizer.num_words, 16, input_length=X_train_padded.shape[1]),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    # NOTE(review): a 2-unit hidden layer is a very narrow bottleneck — confirm this is intended
    Dense(2, activation='relu'),
    # Single sigmoid unit to match the 0/1 labels and binary cross-entropy loss
    Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, reporting metrics on the held-out split after each epoch
history = model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10)

# Load the test data: each line of the bz2-compressed file is
# "<label token> <review text>".
labels_test = []
text_test = []

# bz2.open in text mode decompresses on the fly; a plain open() would hand
# back the raw compressed bytes instead of review lines.
with bz2.open('/notebooks/test.ft.txt.bz2', 'rt', encoding='utf-8') as f:
    for line in f:
        # Split once at the first space: label token on the left, text on the right
        label_token, _, review = line.partition(' ')
        if not review:
            # Blank or malformed line (nothing after the label): skip it
            continue

        # The class number is whatever follows the last underscore of the
        # label token, so both '_label_2' and '__label__2' parse to 2
        labels_test.append(int(label_token.rpartition('_')[2]))
        text_test.append(review.rstrip())

# Create a DataFrame from the labels and text
df_test = pd.DataFrame({'label': labels_test, 'text': text_test})

# Preprocess the text
df_test['text'] = df_test['text'].apply(preprocess_text)

# Subtract 1 from the labels to make them 0 (for negative) and 1 (for positive)
df_test['label'] = df_test['label'] - 1

# Tokenize with the already-fitted tokenizer and pad to the training width.
# Note: this rebinds X_test_sequences / X_test_padded, which previously held
# the validation split.
X_test_sequences = tokenizer.texts_to_sequences(df_test['text'])
X_test_padded = pad_sequences(X_test_sequences, padding='post', maxlen=X_train_padded.shape[1])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_padded, df_test['label'])

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

# Notebook cell output: SyntaxError: ignored