In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the NLTK resources used later for tokenising, stop-word removal and
# lemmatisation. 'punkt_tab' is included next to the legacy 'punkt' package
# because newer NLTK releases (3.9+) load the tokenizer data from 'punkt_tab';
# without it word_tokenize raises LookupError on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the raw review export. A forward slash is used because "\R" is not a
# valid string escape (SyntaxWarning on recent Python versions) and a
# backslash-separated path only works on Windows; "/" works on every platform.
df = pd.read_csv("Data/Reviews.csv")

In [12]:
# Rich preview of the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [14]:
# Quick structural overview of the loaded frame: shape, a sample of rows, and dtypes
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

Dataset shape: (568454, 10)

First 5 rows:
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have boug

In [15]:
# Count the missing entries in every column before deciding what to drop
missing_counts = df.isna().sum()
print("\nMissing values:")
print(missing_counts)


Missing values:
Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [24]:
# Discard the rows that lack a profile name or a summary
columns_required = ['ProfileName', 'Summary']
df = df.dropna(subset=columns_required)

In [25]:
# Re-check the per-column missing counts after the drop
df.isna().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [None]:
def _score_to_sentiment(score):
    """Translate a numeric review score into a sentiment label.

    Scores of 2 or below are 'negative', exactly 3 is 'neutral',
    and everything else is 'positive'.
    """
    if score <= 2:
        return 'negative'
    if score == 3:
        return 'neutral'
    return 'positive'


df['sentiment'] = df['Score'].apply(_score_to_sentiment)


In [27]:
# Absolute and relative class frequencies of the derived sentiment label
sentiment_counts = df['sentiment'].value_counts()
sentiment_shares = df['sentiment'].value_counts(normalize=True)

print("\nSentiment distribution:")
print(sentiment_counts)
print(sentiment_shares)


Sentiment distribution:
sentiment
positive    443756
negative     82007
neutral      42638
Name: count, dtype: int64
sentiment
positive    0.780709
negative    0.144277
neutral     0.075014
Name: proportion, dtype: float64


In [28]:
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Normalise one review into a space-separated string of lemmatised tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenise, drop English stop words, lemmatise, re-join.

    Parameters:
    text: The raw review text. None and NaN are treated as an empty review;
          any other non-string value is converted with str() first.

    Returns:
    str: The cleaned text ('' when nothing survives the cleaning).
    """
    # Missing values (None / float NaN) carry no text at all
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Converting to lowercase
    text = str(text).lower()

    # Removing special characters, punctuation, and numbers.
    # Characters are deleted, not replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Nothing left to tokenise: the result would be '' anyway
    if not text.strip():
        return ''

    # The stop-word list and lemmatizer are identical for every call, so they
    # are built once and reused instead of being re-created per review.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenise, drop stop words, then lemmatise the surviving tokens
    tokens = word_tokenize(text)
    kept_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Joining tokens back into text
    return ' '.join(kept_tokens)

In [29]:
# Clean every review body element-wise; the result feeds the TF-IDF vectorizer
df['processed_text'] = df['Text'].map(preprocess_text)


In [30]:
# Integer class codes follow the order negative < neutral < positive
sentiment_map = {
    label: code
    for code, label in enumerate(('negative', 'neutral', 'positive'))
}
df['sentiment_code'] = df['sentiment'].map(sentiment_map)

In [31]:
# Hold out 20% of the reviews for testing; stratifying on the label keeps the
# class proportions the same in both partitions.
X, y = df['processed_text'], df['sentiment_code']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [32]:
# TF-IDF features capped at 5000 terms. The vocabulary is learned from the
# training split only and then re-used, unchanged, on the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [33]:
# Fit a logistic-regression classifier on the TF-IDF training matrix
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X=X_train_tfidf, y=y_train)


In [34]:
# Class codes predicted by the logistic-regression model for the held-out split
y_pred_lr = lr_model.predict(X=X_test_tfidf)

In [35]:
# Evaluate the logistic-regression predictions on the held-out split
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=['negative', 'neutral', 'positive'],
)

print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {lr_accuracy:.4f}")
print("\nConfusion Matrix:")
print(lr_confusion)
print("\nClassification Report:")
print(lr_report)


Logistic Regression Model Evaluation:
Accuracy: 0.8652

Confusion Matrix:
[[10952   710  4739]
 [ 1859  1641  5028]
 [ 2086   899 85767]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.67      0.70     16401
     neutral       0.50      0.19      0.28      8528
    positive       0.90      0.97      0.93     88752

    accuracy                           0.87    113681
   macro avg       0.71      0.61      0.64    113681
weighted avg       0.84      0.87      0.85    113681



In [36]:
# Fit a linear support-vector classifier on the same TF-IDF training matrix
svm_model = LinearSVC(random_state=42)
svm_model.fit(X=X_train_tfidf, y=y_train)

In [37]:
# Class codes predicted by the linear SVM for the held-out split
y_pred_svm = svm_model.predict(X=X_test_tfidf)

In [38]:
# Evaluate the linear-SVM predictions on the held-out split
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=['negative', 'neutral', 'positive'],
)

print("\nLinear SVM Model Evaluation:")
print(f"Accuracy: {svm_accuracy:.4f}")
print("\nConfusion Matrix:")
print(svm_confusion)
print("\nClassification Report:")
print(svm_report)


Linear SVM Model Evaluation:
Accuracy: 0.8653

Confusion Matrix:
[[11061   303  5037]
 [ 2024   986  5518]
 [ 2042   386 86324]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.67      0.70     16401
     neutral       0.59      0.12      0.19      8528
    positive       0.89      0.97      0.93     88752

    accuracy                           0.87    113681
   macro avg       0.74      0.59      0.61    113681
weighted avg       0.85      0.87      0.84    113681



In [39]:
# Persist the fitted vectorizer (vocabulary + IDF weights) and both classifiers
# to the working directory so they can be reloaded without retraining.
import joblib

joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.joblib')
joblib.dump(value=lr_model, filename='sentiment_lr_model.joblib')
joblib.dump(value=svm_model, filename='sentiment_svm_model.joblib')

['sentiment_svm_model.joblib']

In [40]:
# Predict the sentiment label of a previously unseen review
def analyze_sentiment(review_text, model=lr_model):
    """
    Classify a single product review as negative, neutral or positive.

    Parameters:
    review_text (str): Raw text of the review to classify
    model: Fitted classifier used for the prediction; the default is the
           lr_model object that existed when this function was defined

    Returns:
    str: The predicted label ('negative', 'neutral', or 'positive')
    """
    # Clean the text exactly as the training data was cleaned, then project
    # it with the already-fitted TF-IDF vectorizer (single-row input)
    features = tfidf_vectorizer.transform([preprocess_text(review_text)])

    # The classifier returns one numeric class code per input row
    predicted_code = model.predict(features)[0]

    # Translate the numeric code back to its label
    code_to_label = {0: 'negative', 1: 'neutral', 2: 'positive'}
    return code_to_label[predicted_code]


Example of Sentiment Analysis for New Reviews

In [41]:
# Hand-written sample reviews used to demo the classifier
example_reviews = [
    "This product is amazing! It works perfectly and exceeded all my expectations.",
    "It's okay but not worth the price. There are better alternatives available.",
    "Terrible product. Broke after two days of use. Don't waste your money.",
]

print("\nExample of sentiment analysis for new reviews:")



Example of sentiment analysis for new reviews:


In [42]:
# Show each sample review next to the label predicted by the default model
for review in example_reviews:
    sentiment = analyze_sentiment(review)
    print(f"\nReview: {review}", f"Predicted sentiment: {sentiment}", sep="\n")


Review: This product is amazing! It works perfectly and exceeded all my expectations.
Predicted sentiment: positive

Review: It's okay but not worth the price. There are better alternatives available.
Predicted sentiment: neutral

Review: Terrible product. Broke after two days of use. Don't waste your money.
Predicted sentiment: negative


In [None]:
# Plotting the most common words for each sentiment
def plot_most_common_words(df, sentiment, top_n=20):
    """
    Save a horizontal bar chart of the most frequent words for one sentiment.

    Parameters:
    df: DataFrame with a 'sentiment' column and a 'processed_text' column of
        whitespace-separated tokens
    sentiment (str): Value of the 'sentiment' column to select
    top_n (int): Number of most frequent words to plot

    Returns:
    None. The chart is written to '<sentiment>_common_words.png' in the
    working directory.

    Raises:
    ValueError: If no words are found for the requested sentiment (for
                example because the label is misspelled).
    """
    from collections import Counter

    # Getting reviews for the specified sentiment
    sentiment_reviews = df.loc[df['sentiment'] == sentiment, 'processed_text']

    # Counting word frequencies review by review. This avoids joining every
    # review into one huge string and materialising the full token list.
    word_counts = Counter()
    for review in sentiment_reviews:
        word_counts.update(review.split())

    if not word_counts:
        raise ValueError(f"No words found for sentiment {sentiment!r}")

    word_freq = pd.Series(dict(word_counts.most_common(top_n)))

    # Plotting on an explicit figure so it is always released, even when
    # drawing or saving fails.
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        sns.barplot(x=word_freq.values, y=word_freq.index, ax=ax)
        ax.set_title(f'Top {top_n} Most Common Words in {sentiment.capitalize()} Reviews')
        ax.set_xlabel('Frequency')
        ax.set_ylabel('Words')
        fig.tight_layout()
        fig.savefig(f'{sentiment}_common_words.png')
    finally:
        plt.close(fig)

In [1]:
# One chart per sentiment class, written to disk by plot_most_common_words
for sentiment_label in ('positive', 'neutral', 'negative'):
    plot_most_common_words(df, sentiment_label)

NameError: name 'plot_most_common_words' is not defined