In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Import necessary libraries
# Import necessary libraries
# ...

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load a smaller portion of the dataset (e.g., first 1000 rows)
df = pd.read_csv('../input/amazon-fine-food-reviews/Reviews.csv', nrows=1000)

# Handling missing values
df.dropna(inplace=True)

# Data Transformation
df['Text_Length'] = df['Text'].apply(len)

# Text classification using Naive Bayes
text_data = df['Text']
labels = df['Score'].apply(lambda score: 1 if score > 3 else 0)

# Tokenize and pad the sequences
max_words = 500
max_len_values = [30, 50, 70]  # Adjust these values based on your experimentation

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
padded_sequences = pad_sequences(sequences, maxlen=max_len_values[-1], padding='post', truncating='post')

# Train-test split for Naive Bayes
train_texts, test_texts, train_labels, test_labels = train_test_split(text_data, labels, test_size=0.2, random_state=101)

# Naive Bayes using CountVectorizer
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_texts)
test_features = vectorizer.transform(test_texts)

classifier = MultinomialNB()
classifier.fit(train_features, train_labels)
predictions_nb = classifier.predict(test_features)

# Evaluate Naive Bayes
accuracy_nb = accuracy_score(test_labels, predictions_nb)
print("Naive Bayes Accuracy:", accuracy_nb)
print("Classification Report (Naive Bayes):\n", classification_report(test_labels, predictions_nb))

# Define early stopping and model checkpoint callbacks for the LSTM model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_lstm_model.h5', save_best_only=True)

# Define the LSTM model with callbacks
def create_lstm_model(embedding_dim=50, lstm_units=32):
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len_values[-1]),
        LSTM(lstm_units),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the LSTM model with callbacks
# Train the LSTM model with callbacks
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
lstm_model = create_lstm_model()
lstm_model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stopping, model_checkpoint])


# Evaluate the LSTM model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test)
print(f"LSTM Model Accuracy: {lstm_accuracy}")

# Load the best model from the checkpoint
best_lstm_model = create_lstm_model()
best_lstm_model.load_weights('best_lstm_model.h5')



/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv
/kaggle/input/amazon-fine-food-reviews/database.sqlite
Naive Bayes Accuracy: 0.86
Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           0       0.64      0.23      0.33        31
           1       0.87      0.98      0.92       169

    accuracy                           0.86       200
   macro avg       0.75      0.60      0.63       200
weighted avg       0.84      0.86      0.83       200

Epoch 1/20
Epoch 2/20
 5/25 [=====>........................] - ETA: 0s - loss: 0.5791 - accuracy: 0.7563

  saving_api.save_model(


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
LSTM Model Accuracy: 0.8199999928474426


In [2]:
# Use best_lstm_model for predictions
sample_texts = ["This product is amazing!", "The Food was great."]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_padded_sequences = pad_sequences(sample_sequences, maxlen=max_len_values[-1], padding='post', truncating='post')

sample_predictions = best_lstm_model.predict(sample_padded_sequences)
for i, text in enumerate(sample_texts):
    sentiment = "Positive" if sample_predictions[i] > 0.5 else "Negative"
    print(f"Review: {text}\nPredicted Sentiment: {sentiment}\n")


Review: This product is amazing!
Predicted Sentiment: Positive

Review: The Food was great.
Predicted Sentiment: Positive

