<a href="https://colab.research.google.com/github/Lahari86/NLP/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install nltk scikit-learn




Import libraries and load the data

In [None]:
import nltk
from nltk.corpus import movie_reviews
import random
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK movie_reviews corpus
nltk.download('movie_reviews')

# Build one (tokens, category) pair per review: walk every category the corpus
# reader reports and every file id listed under that category.
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the document order is the same
# on every run (an unseeded shuffle made every downstream score vary between
# runs). Using random.Random leaves the global RNG state untouched.
random.Random(42).shuffle(documents)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Data preprocessing and feature extraction

In [None]:
def preprocess_text(words):
  """Collapse a sequence of tokens into one lowercase, space-separated string.

  Only tokens whose ``isalpha()`` is true are kept, so punctuation, numbers
  and tokens mixing letters with other characters are discarded.

  Args:
    words: iterable of string tokens.

  Returns:
    The retained tokens, lowercased and joined with single spaces
    (an empty string when nothing is retained).
  """
  alphabetic_tokens = (token for token in words if token.isalpha())
  return ' '.join(token.lower() for token in alphabetic_tokens)


# Derive the cleaned texts and their labels directly from `documents` without
# rebinding it. Overwriting `documents` with the cleaned strings made this cell
# non-idempotent: on a second run preprocess_text would receive strings and
# iterate over their individual characters, silently corrupting the corpus.
data = tuple(preprocess_text(tokens) for tokens, _ in documents)
labels = tuple(category for _, category in documents)

# Bag-of-words vectorizer with the vocabulary capped at 2500 terms
vectorizer = CountVectorizer(max_features=2500)

# Learn the vocabulary and build the document-term count matrix
X = vectorizer.fit_transform(data)


# Encode the labels numerically: 'pos' -> 1, any other label -> 0
y = [1 if label == 'pos' else 0 for label in labels]

Split the data into train and test datasets

In [None]:
# Hold out 20% of the samples for testing; random_state pins the split itself.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Model Training

In [None]:
# Multinomial Naive Bayes with its default settings
classifier = MultinomialNB()

# Fit the model on the training features and labels
classifier.fit(X=X_train, y=y_train)

Evaluate the model

In [None]:
# Predict labels for the test samples
y_pred = classifier.predict(X_test)

# Share of correct test predictions, printed as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is:", accuracy*100)

# Per-class precision / recall / F1. Display names follow the numeric label
# order used for y (0 -> 'neg', 1 -> 'pos').
class_names = ['neg' , 'pos']
print(classification_report(y_test, y_pred, target_names=class_names))

Accuracy is: 79.75
              precision    recall  f1-score   support

         neg       0.83      0.78      0.80       212
         pos       0.77      0.81      0.79       188

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



In [22]:
from sklearn.model_selection import GridSearchCV

# Candidate values for MultinomialNB's `alpha` smoothing parameter
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}

# 5-fold cross-validated search scored by accuracy.
# The estimator is passed inline rather than bound to `classifier`: rebinding
# that name to a fresh, unfitted model (GridSearchCV only fits clones of it)
# would make the earlier evaluation cell fail if it were re-run afterwards.
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best hyperparameters:", grid_search.best_params_)

# Model refitted with the best parameters found by the search
best_classifier = grid_search.best_estimator_

# Evaluate the tuned model on the test set
y_pred = best_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)


Best hyperparameters: {'alpha': 0.5}
Accuracy: 79.75
