## Reading the CSV file

In [11]:
import pandas as pd
import numpy as np

# Read the movie-review dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="Movie_reviews.csv")

# Preview the top five rows to check that the columns loaded as expected
df.head(n=5)


Unnamed: 0,review,sentiment
0,This is the kind of film you want to see with ...,positive
1,"First, this was a BRAVE film. I've seen Irreve...",positive
2,I'm not sure why the producers needed to trade...,negative
3,I don't know any idiotic rock'n'roll cliché no...,negative
4,Four stories about the drug trade in Europe be...,positive


## Preprocessing

In [4]:
# Assuming the columns are named 'review' and 'sentiment'.
# Take an explicit copy so the label encoding below writes to an independent
# frame instead of a column-slice of the loaded one (avoids SettingWithCopyWarning).
df = df[['review', 'sentiment']].copy()

# Encode the sentiment labels only if they are not numerical already.
# Mapping unconditionally turns already-encoded 0/1 labels into NaN, and the
# dropna below would then discard every row when this cell is run a second time.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Drop any rows with a missing review or a label outside the mapping, then
# store the labels as integers (map() yields floats whenever it produced NaN).
df = df.dropna().astype({'sentiment': int})

# Split the data into features (X) and labels (y)
X = df['review'].values
y = df['sentiment'].values


## Sentence Transformer

In [6]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)


In [7]:
from sentence_transformers import SentenceTransformer

# Instantiate the pretrained "all-MiniLM-L6-v2" Sentence Transformer checkpoint
sbert_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Turn every review text in X into an embedding vector
embeddings = sbert_model.encode(sentences=X)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Training a neural network classifier

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import set_random_seed, to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed the Python, NumPy and TensorFlow generators so weight initialisation,
# dropout masks and batch shuffling are repeatable across runs.
set_random_seed(42)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

# Convert labels to categorical (one-hot encoding).
# num_classes is fixed so both splits always get two columns, even if one
# split happens to contain only class 0 (to_categorical would otherwise infer
# the width from the largest label it sees).
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

# Define the model.
# The input width is taken from the embeddings instead of hard-coding 384, so
# the network still matches if a different embedding model is used upstream.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))  # 2 classes (positive and negative)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# Validation data is carved out of the training split (last 10%) so the test
# set stays untouched until the final evaluation. Keeping the History object
# in a variable also stops its repr from being echoed as the cell output.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c2f63b674c0>

In [9]:
# Score the trained network on the held-out split; evaluate() returns the loss
# followed by the accuracy metric the model was compiled with
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")


Test Accuracy: 0.81


## Trying the model on a new review

In [16]:
def predict_sentiment(review):
    """Classify a single review text as 'positive' or 'negative'.

    The review is embedded with the notebook-level ``sbert_model`` and the
    embedding is scored by the notebook-level ``model``.

    Args:
        review: The review text to classify.

    Returns:
        'positive' when the highest-scoring class index is 1, otherwise
        'negative'.
    """
    # The encoder is given a one-element batch, so the result has a single row
    vector = sbert_model.encode([review])
    class_scores = model.predict(vector)
    # Index of the best-scoring class for the first (only) row
    predicted_class = np.argmax(class_scores, axis=1)[0]
    if predicted_class == 1:
        return 'positive'
    return 'negative'

# Test the function with a new review typed in by the user.
# Surrounding whitespace is stripped so a blank entry can be detected.
new_review = input("Please enter a review ").strip()
if new_review:
    predicted_sentiment = predict_sentiment(new_review)
    print(f"Sentiment of the review is: {predicted_sentiment}")
else:
    # An empty string would still be embedded and classified, yielding a
    # label that means nothing; report the missing input instead.
    predicted_sentiment = None
    print("No review entered; skipping prediction.")

Please enter a review pathetic to say the least
Sentiment of the review is: negative
