**IMPORT RELEVANT LIBRARIES**

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

**IMPORT DATA FOR TRAINING**

In [3]:
(training, testing), info= tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True, with_info=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.FFWWIP_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.FFWWIP_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.FFWWIP_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [4]:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)


True

**PREPROCESS THE DATASET**

In [34]:
def process_data(dataset):
  texts = []
  labels = []
  for text, label in dataset:
        texts.append(text.numpy().decode('utf-8'))
        labels.append(label.numpy())
  return pd.DataFrame({'text': texts, 'sentiment': labels})

In [35]:
train_df = process_data(training)
test_df = process_data(testing)

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)

Training data shape: (25000, 2)
Testing data shape: (25000, 2)


In [36]:
def preprocess_data(text):

  text= text.lower()
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  tokens= word_tokenize(text)
  stop_words= set(stopwords.words('english'))
  filtered_tokens= [token for token in tokens if token not in stop_words]
  Lemmatizer= WordNetLemmatizer()
  Lemmatized_tokens= [Lemmatizer.lemmatize(token) for token in filtered_tokens]
  return ' '.join(Lemmatized_tokens)

**PROCESS TRAINING DATA**

In [37]:
train_df['processed_text']=train_df['text'].apply(preprocess_data)
test_df['processed_text']=test_df['text'].apply(preprocess_data)

In [38]:
print("Extracting Features from Data")
vectorizer= TfidfVectorizer(max_features=5000)
train_X= vectorizer.fit_transform(train_df['processed_text'])
test_X= vectorizer.transform(test_df['processed_text'])

y_train = train_df['sentiment']
y_test = test_df['sentiment']



Extracting Features from Data


**MODEL SELECTION**

In [56]:
from sklearn import svm
classi= svm.SVC(kernel='linear')
classi.fit(train_X, y_train)

In [57]:
prediction= classi.predict(test_X)
accuracy= accuracy_score(y_test, prediction)
print("Accuracy:", accuracy)

Accuracy: 0.87036


**TESTING THE MODEL ON SAMPLE DATA**

In [58]:
def predict_sentiment(text):
    preprocessed_text = preprocess_data(text)
    vectorized_text = vectorizer.transform([preprocessed_text])
    prediction = classi.predict(vectorized_text)
    return "Positive" if prediction[0] == 1 else "Negative"


In [59]:
test_case = "I wasn't sure what to expect from From. I've read everything from it's like Lost to it's a generic horror version of Lost. Lost is one of my favorite shows ever and the reviews for this have been mostly positive so I figured I'd give it a try and I'm glad I did. I was hooked from the first episode to the last. I'm so glad they already renewed this because there are so many unanswered questions still. That's probably the only negative thing I have say about his show, that it didn't offer enough answers in the first season. It takes a little time to really get going but once it does you'll be addicted and want to binge it as quickly as possible."
sentiment = predict_sentiment(test_case)
print(f"\nText: {test_case}")
print(f"Predicted Sentiment: {sentiment}")



Text: I wasn't sure what to expect from From. I've read everything from it's like Lost to it's a generic horror version of Lost. Lost is one of my favorite shows ever and the reviews for this have been mostly positive so I figured I'd give it a try and I'm glad I did. I was hooked from the first episode to the last. I'm so glad they already renewed this because there are so many unanswered questions still. That's probably the only negative thing I have say about his show, that it didn't offer enough answers in the first season. It takes a little time to really get going but once it does you'll be addicted and want to binge it as quickly as possible.
Predicted Sentiment: Positive




> Hafsa Rafique

