In [None]:
import os
import re
import unicodedata
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import sklearn

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix)
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter

In [None]:
import nltk

# Fetch every NLTK resource this notebook uses so it runs on a fresh environment:
#   vader_lexicon      -> SentimentIntensityAnalyzer
#   stopwords          -> nltk.corpus.stopwords.words('english') in clean()
#   wordnet, omw-1.4   -> WordNetLemmatizer in clean()
#   punkt, punkt_tab   -> nltk.word_tokenize in generate_ngrams()
# nltk.download() skips resources that are already up to date and reports
# (rather than raises on) ids the installed NLTK/index does not know.
for resource in ('vader_lexicon', 'stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

### Loading and Splitting the data 

In [None]:
def load_imdb_data(path):
    """Read the IMDB review CSV at ``path`` and return a two-column DataFrame.

    The CSV must contain a ``review`` column and a ``sentiment`` column.
    The result has ``text`` (the review) and ``label`` (1 for 'positive',
    0 for 'negative'; any other sentiment value becomes NaN via ``Series.map``).
    """
    raw = pd.read_csv(path)
    sentiment_to_label = {'positive': 1, 'negative': 0}
    return pd.DataFrame({
        'text': raw['review'],
        'label': raw['sentiment'].map(sentiment_to_label),
    })

In [None]:
# Read the full dataset with load_imdb_data
data = load_imdb_data('IMDB Dataset.csv')
# 80/20 train/test split (fixed seed), each part re-indexed from 0
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=42)
)

In [None]:
# Plain-text preview of the first ten training rows
print(train.head(10))

### Preprocessing

Clean the text

In [None]:
# Lazily filled cache so the NLTK stop-word list and the lemmatizer are built
# once, not once per review.
_CLEAN_DEFAULTS = {}


def _default_clean_resources():
    """Return ``(stop_words, lemmatize)`` built from NLTK, loading them on first use."""
    if not _CLEAN_DEFAULTS:
        listed = nltk.corpus.stopwords.words('english')
        # clean() strips punctuation before filtering, so also keep each stop
        # word with its punctuation removed ("don't" -> "dont").
        _CLEAN_DEFAULTS['stop_words'] = frozenset(listed) | frozenset(
            re.sub(r'[^\w\s]', '', word) for word in listed
        )
        _CLEAN_DEFAULTS['lemmatize'] = nltk.stem.WordNetLemmatizer().lemmatize
    return _CLEAN_DEFAULTS['stop_words'], _CLEAN_DEFAULTS['lemmatize']


def clean(text, stop_words=None, lemmatize=None):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Parameters
    ----------
    text : str
        Raw review text (may contain HTML ``<br />`` tags, URLs, punctuation).
    stop_words : collection of str, optional
        Tokens to drop. Defaults to NLTK's English stop words (plus their
        punctuation-free forms).
    lemmatize : callable(str) -> str, optional
        Applied to every kept token. Defaults to
        ``WordNetLemmatizer().lemmatize``.

    Returns
    -------
    str
        Lower-case ASCII tokens (letters only) joined by single spaces.
    """
    if stop_words is None or lemmatize is None:
        default_stop_words, default_lemmatize = _default_clean_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatize is None:
            lemmatize = default_lemmatize

    # Transliterate accented characters ("café" -> "cafe") instead of deleting
    # them; characters with no ASCII decomposition are dropped.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Convert to lowercase
    text = text.lower()
    # Replace HTML line breaks with a space. This must happen BEFORE punctuation
    # is stripped, otherwise the angle brackets are gone and a stray "br" is
    # left behind in the text.
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Keep only letters and whitespace (drops punctuation, digits, underscores)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenise on whitespace and remove stop words
    words = [word for word in text.split() if word not in stop_words]
    # Lemmatization
    return ' '.join(lemmatize(word) for word in words)



In [None]:
# Run clean() over every review in both splits, overwriting the 'text' column
train['text'] = train['text'].apply(clean)
test['text'] = test['text'].apply(clean)

In [None]:
# Plain-text preview of the first ten training rows after cleaning
print(train.head(10))

### Word vectorizer

In [None]:
# Transform the text into a sparse matrix of token counts using the CountVectorizer object.
# ngram_range=(2, 2) restricts features to bigrams; max_features=10000 caps the vocabulary size.
# The vocabulary is fitted on the training text only.
vectorizer = CountVectorizer(max_features=10000, ngram_range=(2, 2))
X_train_vectors = vectorizer.fit_transform(train['text'])
# Bare expression: displays the matrix summary as the cell output
X_train_vectors

In [None]:
# Display the first row of the sparse token-count matrix
# (the bigram counts for the first training review)
X_train_vectors[0]

In [None]:
# Peek at a slice of the learned vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[10:30]

#### Bi-grams Analysis

In [None]:
# Build word n-grams from a piece of text
def generate_ngrams(text, n):
    """Tokenize ``text`` with NLTK and return its n-grams as space-joined strings."""
    return [' '.join(parts) for parts in nltk.ngrams(nltk.word_tokenize(text), n)]

In [None]:
# Store each review's list of bigrams (n=2) in a new 'bigrams' column
train['bigrams'] = train['text'].apply(generate_ngrams, n=2)

In [None]:
# Apply VADER's polarity scoring to each bigram and keep the 'compound' score.
analyzer = SentimentIntensityAnalyzer()

# The same bigram occurs in many reviews, so memoize its score by text
# instead of rescoring it on every occurrence.
_bigram_score_cache = {}


def _compound_score(gram):
    """Return the VADER compound polarity of one bigram string (cached)."""
    if gram not in _bigram_score_cache:
        _bigram_score_cache[gram] = analyzer.polarity_scores(gram)['compound']
    return _bigram_score_cache[gram]


train['bigram_sentiment'] = train['bigrams'].apply(lambda grams: [_compound_score(gram) for gram in grams])



In [None]:
# One row per (review, bigram). Both list columns are exploded together so
# every score stays paired with the bigram it belongs to, and the label column
# travels with the exploded rows (no mask from the un-exploded frame needed).
# Reviews with no bigrams explode to NaN and are dropped.
bigram_scores = (
    train[['label', 'bigrams', 'bigram_sentiment']]
    .explode(['bigrams', 'bigram_sentiment'])
    .dropna(subset=['bigrams'])
    .assign(bigram_sentiment=lambda frame: frame['bigram_sentiment'].astype(float))
)

# Scores of bigrams from positive / negative reviews, indexed by the bigram text
positive_bigrams = bigram_scores.loc[bigram_scores['label'] == 1].set_index('bigrams')['bigram_sentiment']
negative_bigrams = bigram_scores.loc[bigram_scores['label'] == 0].set_index('bigrams')['bigram_sentiment']
# Mean sentiment score for each distinct bigram (grouped by bigram text, not by review row)
positive_bigrams_mean = positive_bigrams.groupby(level=0).mean()
negative_bigrams_mean = negative_bigrams.groupby(level=0).mean()
# Print the top 10 positive and negative bigrams, sorted by mean sentiment score.
print('Top 10 positive bigrams:')
print(positive_bigrams_mean.sort_values(ascending=False)[:10])

print('Top 10 negative bigrams:')
print(negative_bigrams_mean.sort_values(ascending=True)[:10])


#### EDA

In [None]:
# define the n-gram range and maximum number of features
ngram_range = (1, 2)
max_features = 10000

# NOTE: this rebinds `vectorizer` and `X_train_vectors` (previously bigram-only).
# The model-fitting and test-transform cells further down use these
# unigram+bigram features.
vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)

# fit and transform the training data
X_train_vectors = vectorizer.fit_transform(train['text'])

# get the feature names
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()

# plain NumPy boolean masks for selecting rows of the sparse matrix
is_positive = (train['label'] == 1).to_numpy()
is_negative = (train['label'] == 0).to_numpy()

# total count of every n-gram per class, then the 10 most frequent
top_ngrams_pos = pd.Series(X_train_vectors[is_positive].sum(axis=0).A1, index=feature_names).sort_values(ascending=False)[:10]
top_ngrams_neg = pd.Series(X_train_vectors[is_negative].sum(axis=0).A1, index=feature_names).sort_values(ascending=False)[:10]

# plot the bar chart for top occurring n-grams in positive and negative reviews
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].barh(top_ngrams_pos.index, top_ngrams_pos.values, color='green')
ax[0].set_title('Top 10 N-grams in Positive Reviews')
ax[1].barh(top_ngrams_neg.index, top_ngrams_neg.values, color='red')
ax[1].set_title('Top 10 N-grams in Negative Reviews')
plt.tight_layout()
plt.show()


#### Word Frequency Analysis

In [None]:
# Concatenate every cleaned training review and tokenise on whitespace
all_reviews = ' '.join(train['text'])
all_words = all_reviews.split()

# Tally occurrences and keep the 20 most frequent words in a DataFrame
word_counts = Counter(all_words)
top_words = pd.DataFrame(word_counts.most_common(20), columns=['word', 'count'])

# Bar chart of the word frequency distribution
word_ax = top_words.plot(kind='bar', x='word')
word_ax.set_title('Top 20 Most Common Words')
word_ax.set_xlabel('Word')
word_ax.set_ylabel('Count')
plt.show()

### Multinomial Naive Bayes

Training using our training data

In [None]:
# Create the Multinomial Naive Bayes classifier and fit it on the training vectors
# (fit returns the estimator itself, so construction and fitting can be chained)
model = MultinomialNB().fit(X_train_vectors, train['label'])
model

In [None]:
# Accuracy of the fitted model on the same data it was trained on.
# This is an optimistic figure; compare it with the test-set accuracy below.
model.score(X_train_vectors, train['label'])

### Testing
Vectorize the test reviews with the vectorizer that was fitted on the training data.

In [None]:
# Transform the test data into vectors.
# Only transform() is called: the vocabulary fitted on the training text is reused, not refitted.
X_test_vectors = vectorizer.transform(test['text'])
# Bare expression: displays the matrix summary as the cell output
X_test_vectors

#### Evaluate classifier on test data

In [None]:
# Predict the labels of the test data with the fitted model
y_test_hat = model.predict(X_test_vectors)
# Bare expression: displays the predicted label array as the cell output
y_test_hat

#### Accuracy score

In [None]:
# Fraction of test reviews whose predicted label matches the true label
accuracy_score(test['label'], y_test_hat)

### Results

In [None]:
# Classification report on the test data: true labels vs. the model's predictions.
# print() is used because the report is a pre-formatted text table.
print(classification_report(test['label'], y_test_hat))

In [None]:
# Confusion matrix on the test data (rows: true labels, columns: predicted labels)
print("Confusion Matrix:")
print(confusion_matrix(test['label'], y_test_hat))

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap
class_names = ['Negative', 'Positive']
cm_frame = pd.DataFrame(confusion_matrix(test['label'], y_test_hat))
heatmap_ax = sns.heatmap(
    cm_frame,
    annot=True,
    cmap="Purples",
    fmt="d",
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_ylabel('Actual Label')
heatmap_ax.set_xlabel('Predicted Label');