<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/IMDB_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the IMDB movie-review CSV from GitHub and save it in the current
# working directory as "movie_data.csv" (-O sets the output file name)
!wget "https://raw.githubusercontent.com/javaidnabi31/Word-Embeddding-Sentiment-Classification/master/movie_data.csv" -O "movie_data.csv"

# List all files in the current directory (long format, human-readable sizes)
# so the downloaded file can be checked
!ls -lah

In [35]:
# Import necessary libraries and functions
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from tensorflow import keras
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Download the NLTK resources needed by the cleaning step.
# 'punkt' is the tokenizer model used by word_tokenize in older NLTK releases;
# newer releases (3.8.2 and later) look up 'punkt_tab' instead, so both are
# fetched to keep word_tokenize working whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used by clean_review
nltk.download('stopwords')

In [None]:
# Read the downloaded CSV into a pandas DataFrame.
# The path is relative to the working directory (where the download cell saves
# the file) rather than the Colab-only "/content/..." absolute path, so the
# notebook also runs outside Colab.
path_to_data = "movie_data.csv"
data = pd.read_csv(path_to_data)
data.head()

In [None]:
# Report the dimensions of the dataset
n_rows, n_cols = data.shape
print(f"This data contains: {n_rows} rows and {n_cols} columns.")

In [None]:
# Count how many reviews fall into each sentiment class
class_counts = data['sentiment'].value_counts()
class_counts

In [None]:
# Count missing values in each column
data.isna().sum()

In [5]:
english_stopwords = stopwords.words('english')
# Set copy of the stop-word list so membership tests in clean_review are O(1)
_english_stopword_set = frozenset(english_stopwords)
stemmer = PorterStemmer()


def clean_review(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps: lower-case, replace every character outside a-z with a space,
    tokenize, drop English stop words, then Porter-stem the remaining tokens.

    Stop words are removed *before* stemming: the stop-word list holds
    unstemmed words, so filtering after stemming lets stemmed forms such as
    "thi" (this) or "wa" (was) slip through into the features.

    Args:
        text: Review text as a ``str``.

    Returns:
        The cleaned review as a single string (empty if no token survives).
    """
    # convert to lower case
    text = text.lower()

    # keep letters only; every other character becomes a separator
    text = re.sub(r'[^a-z]', ' ', text)

    # split the text into word tokens
    tokens = word_tokenize(text)

    # drop stop words while the tokens are still in their unstemmed form
    kept_tokens = [token for token in tokens if token not in _english_stopword_set]

    # Porter-stem what is left and rebuild the text
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [6]:
# Run clean_review over every entry of the review column and store the result back
data['review'] = data['review'].map(clean_review)

In [None]:
# Hold out 30% of the reviews for testing; the remaining 70% is used for training
X = data['review'].values
y = data['sentiment'].values
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Show the size of each split (features, labels)
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

In [None]:
# Binary bag-of-words encoder capped at 10000 vocabulary entries
vectorizer = CountVectorizer(max_features=10000, binary=True)

# Learn the vocabulary from the training reviews and encode them in one call,
# then encode the test reviews with that same vocabulary
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Show the size of each encoded split (features, labels)
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

In [None]:
# Create a logistic-regression classifier with default settings and train it
# on the bag-of-words training features
model = LogisticRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the held-out reviews and score the predictions
y_hat = model.predict(x_test)
accuracy = accuracy_score(y_test, y_hat)
precision = precision_score(y_test, y_hat)
recall = recall_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat)
confusion = confusion_matrix(y_test, y_hat)

# Two-column table: metric name next to its value
metric_names = ["Accuracy", "Precision", "Recall", "F1 Score"]
metric_values = [accuracy, precision, recall, f1]
report = pd.DataFrame({"Metric": metric_names, "Value": metric_values})

# Label the confusion matrix: rows are actual classes, columns are predictions
confusion_df = pd.DataFrame(
    confusion,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Print both tables, each under its own heading
print("Evaluation Metrics:", report, "\nConfusion Matrix:", confusion_df, sep="\n")


In [31]:
# define predict function
def review_predictor(model, vectorizer, review):
    """Predict the label of a single raw review.

    The review is passed through clean_review, encoded with ``vectorizer``
    and handed to ``model.predict``.

    Args:
        model: Object exposing ``predict``.
        vectorizer: Object exposing ``transform``.
        review: Raw review text.

    Returns:
        The first (and only) element of the prediction for this review.
    """
    cleaned = clean_review(review)
    # transform expects a collection of documents, hence the one-item list
    features = vectorizer.transform([cleaned])
    predictions = model.predict(features)
    return predictions[0]

In [None]:
# Run the predictor on a sample review
review = "This movie was boring!"
review_predictor(model=model, vectorizer=vectorizer, review=review)