In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the IMDB reviews CSV into a DataFrame
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

# Pull out the review text (features) and the sentiment label (target)
X, y = df['review'], df['sentiment']

# Show the distinct label values present in the target column
print(y.unique())


In [None]:
# Text-encoding settings
max_words = 10000          # Vocabulary size limit handed to the Tokenizer
max_sequence_length = 100  # Target length handed to pad_sequences for every review

# Fit the vocabulary on the reviews, then turn each review into a list of word ids.
# NOTE(review): the vocabulary is fitted on all of X, i.e. before the train/test
# split performed in a later cell — confirm this is acceptable for the evaluation.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Bring every encoded review to the same length
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

# Convert the sentiment labels to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [None]:
# Hyperparameters of the LSTM classifier
embedding_dim = 100  # Length of each word-embedding vector
lstm_units = 128     # Number of units in the LSTM layer
num_classes = 1      # One sigmoid output unit (binary classification)

# Assemble the network one layer at a time: embedding -> LSTM -> sigmoid output.
# NOTE(review): newer Keras releases may warn that `input_length` is deprecated —
# confirm against the installed version.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=lstm_units))
model.add(Dense(units=num_classes, activation='sigmoid'))

In [None]:
# Hold out 20% of the samples for the final evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Configure optimizer, loss and the metric reported during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training split; a tenth of it is used for validation
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.1,
)

# Score the held-out test split and report its accuracy
loss, accuracy = model.evaluate(X_test, y_test)
print("Test accuracy:", accuracy)


In [None]:
# Function to preprocess user input
def preprocess_input(text, tokenizer, max_sequence_length):
    """Encode one raw string as a padded batch of token ids.

    Args:
        text: The raw input string.
        tokenizer: Object exposing ``texts_to_sequences``; it receives the
            text wrapped in a single-element list.
        max_sequence_length: Value forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        Whatever ``pad_sequences`` returns for the one-element batch.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    return pad_sequences(token_ids, maxlen=max_sequence_length)

# Function to predict sentiment
def predict_sentiment(text, model, tokenizer, max_sequence_length):
    """Run the model on a single piece of text and return its raw prediction.

    Args:
        text: The raw input string.
        model: Object exposing ``predict``; it is called on the preprocessed text.
        tokenizer: Passed through to ``preprocess_input``.
        max_sequence_length: Passed through to ``preprocess_input``.

    Returns:
        The unmodified result of ``model.predict``.
    """
    model_input = preprocess_input(text, tokenizer, max_sequence_length)
    return model.predict(model_input)

In [None]:
# Probability cut-off above which a text is reported as positive.
# 0.5 is the threshold Keras' 'accuracy' metric applies to a sigmoid output, so the
# label printed here is consistent with the test accuracy reported after training.
# (The previous cut-off of 0.7 labelled probabilities in (0.5, 0.7] as negative.)
POSITIVE_THRESHOLD = 0.5

user_input = input("Enter your text: ")

if not user_input.strip():
    # An empty text encodes to an all-padding sequence; any prediction for it
    # would be meaningless, so report it instead of classifying.
    print("No text entered; nothing to classify.")
else:
    # Preprocess and predict sentiment
    sentiment = predict_sentiment(user_input, model, tokenizer, max_sequence_length)

    # The prediction for one text through a single-unit output layer is expected to
    # be a one-element array; reduce it to a plain float so the comparison does not
    # depend on array truthiness.
    positive_probability = float(np.ravel(sentiment)[0])

    # Output the sentiment prediction
    if positive_probability > POSITIVE_THRESHOLD:
        print("Positive sentiment")
    else:
        print("Negative sentiment")