In [4]:
# pip install pandas nltk scikit-learn tensorflow vaderSentiment tensorflow_datasets textblob

In [1]:
import pandas as pd
import tensorflow_datasets as tfds

In [2]:
# Load the dataset
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [3]:
# Convert to DataFrame
train_data, test_data = dataset['train'], dataset['test']
train_df = tfds.as_dataframe(train_data, info)
test_df = tfds.as_dataframe(test_data, info)

# Decode bytes to string
train_df['text'] = train_df['text'].str.decode('utf-8')
test_df['text'] = test_df['text'].str.decode('utf-8')

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Sample application on a single review
sample_review = test_df['text'].iloc[0]
print(sample_review)
scores = analyzer.polarity_scores(sample_review)
print("VADER Polarity Scores:", scores)

There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.
VADER Polarity Scores: {'neg': 0.072, 'neu': 0.783, 'pos': 0

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Prepare data
X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['label'], test_size=0.2, random_state=42)

# Vectorize text
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Evaluate the model
y_pred = clf.predict(X_test_counts)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))

Naive Bayes Accuracy: 0.8516


In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the maximum number of words to keep, based on word frequency
max_words = 10000
# Max number of words in each complaint.
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['text'])

sequences = tokenizer.texts_to_sequences(train_df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 1329473 (5.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_val, y_train, y_val = train_test_split(padded_sequences,
                                                  train_df['label'],
                                                  test_size=0.2,
                                                  random_state=42)

# Train the model
history = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Preprocess the test data
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
test_padded = pad_sequences(test_sequences, maxlen=max_len)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(test_padded, test_df['label'], verbose=2)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

782/782 - 12s - loss: 0.5234 - accuracy: 0.8476 - 12s/epoch - 16ms/step
Test Loss: 0.523405909538269
Test Accuracy: 0.8476399779319763


Notes
- **Preprocessing**: The maximum number of words (max_words) and the maximum sequence length (max_len) are parameters that you can tune. Adjusting these can affect both the performance of your model and the training time.

- **Model Architecture**: The LSTM layer parameters (number of units, dropout) and the embedding dimension are also tunable. Experimenting with different values can help improve your model's accuracy.

- **Training**: The number of epochs and batch size are important parameters that influence how well your model learns. Too few epochs might underfit, while too many might overfit, especially without proper regularization or dropout.