<a href="https://colab.research.google.com/github/Geethasri0719/AI/blob/main/Task-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scikit-learn tensorflow keras




In [None]:
import pandas as pd

# Location of the IMDB reviews CSV. Kept in one constant so the notebook can
# be pointed at another location without hunting through the code.
DATA_PATH = '/content/IMDB Dataset.csv'

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Fail fast with a readable message if the file is not the expected dataset:
# the following cells read the 'review' and 'sentiment' columns.
missing_columns = {'review', 'sentiment'} - set(data.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

# Check the first few rows (a bare last expression renders as a rich table)
data.head()


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode sentiment labels
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])  # positive=1, negative=0

# The raw reviews contain literal HTML line breaks ("<br /><br />"). The
# tokenizer would strip the punctuation and keep "br" as a very frequent,
# meaningless token, so replace the markup with a space before tokenizing.
reviews_clean = data['review'].str.replace(r'<br\s*/?>', ' ', case=False, regex=True)

# Split into training and testing sets (fixed random_state keeps the split
# identical between runs)
X_train, X_test, y_train, y_test = train_test_split(
    reviews_clean, data['sentiment'], test_size=0.2, random_state=42
)

# Tokenize and pad sequences
max_vocab_size = 10000      # keep only the most frequent words
max_sequence_length = 200   # every review is padded/truncated to this many tokens

# Fit the vocabulary on the training reviews only so that no information
# from the test reviews leaks into the word index.
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# padding='post' appends zeros after the real tokens of short reviews
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable between runs.
set_random_seed(42)

# Define the model
model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # of Embedding and lets `model.summary()` report real shapes and
    # parameter counts.
    Input(shape=(max_sequence_length,)),
    # The sequences are zero-padded at the end (padding='post'), so index 0 is
    # masked: without the mask the LSTM would keep stepping over the padding
    # after the last real word and short reviews would be largely forgotten.
    Embedding(input_dim=max_vocab_size, output_dim=128, mask_zero=True),
    LSTM(64, return_sequences=False),  # only the final state feeds the classifier
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()




In [None]:
import numpy as np

# Train the model. Validation metrics are computed on the last 10% of the
# training arrays (held out before shuffling) instead of on the test set, so
# the test set stays untouched until the final evaluation.
history = model.fit(
    X_train_padded, np.asarray(y_train),  # plain array so the hold-out slice is positional
    epochs=5,
    batch_size=64,
    validation_split=0.1,
    verbose=2  # one summary line per epoch
)


Epoch 1/5
625/625 - 146s - 233ms/step - accuracy: 0.5902 - loss: 0.6400 - val_accuracy: 0.6476 - val_loss: 0.5597
Epoch 2/5
625/625 - 202s - 323ms/step - accuracy: 0.7718 - loss: 0.4739 - val_accuracy: 0.5869 - val_loss: 0.6261
Epoch 3/5
625/625 - 203s - 325ms/step - accuracy: 0.8102 - loss: 0.4557 - val_accuracy: 0.8440 - val_loss: 0.4194
Epoch 4/5
625/625 - 143s - 229ms/step - accuracy: 0.8493 - loss: 0.3843 - val_accuracy: 0.8588 - val_loss: 0.3429
Epoch 5/5
625/625 - 203s - 324ms/step - accuracy: 0.8902 - loss: 0.2845 - val_accuracy: 0.8642 - val_loss: 0.3480


In [None]:
from sklearn.metrics import classification_report

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Generate predictions. The model returns one sigmoid probability per review
# as an (n, 1) column; flatten it so the labels are a 1-D vector, which is
# the shape scikit-learn's metrics expect.
positive_probabilities = model.predict(X_test_padded).ravel()
predictions = (positive_probabilities > 0.5).astype("int32")

# Labels were encoded as negative=0, positive=1; name them in the report.
print(classification_report(y_test, predictions, target_names=['negative', 'positive']))


Test Accuracy: 0.86
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 36ms/step
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      4961
           1       0.89      0.83      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000



In [None]:
# Persist the trained network
model.save('kaggle_sentiment_model.h5')

# The network only understands the integer ids produced by the fitted
# tokenizer, so store the vocabulary next to the model; otherwise the saved
# model cannot be used once this session (and `tokenizer`) is gone.
with open('kaggle_sentiment_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())




In [None]:
from tensorflow.keras.models import load_model

# Load the model
model = load_model('kaggle_sentiment_model.h5')

# Predict: the review must go through the same tokenizer and padding as the
# training data before it is fed to the network.
sample_review = ["The plot was engaging and characters were well-developed."]
sample_sequence = tokenizer.texts_to_sequences(sample_review)
sample_padded = pad_sequences(sample_sequence, maxlen=max_sequence_length, padding='post')
prediction = model.predict(sample_padded)

# `prediction` holds one row per review with a single sigmoid output; pull out
# the scalar instead of relying on the truth value of a 1-element array.
positive_probability = float(prediction[0][0])
print("Positive" if positive_probability > 0.5 else "Negative")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
Negative
