In [1]:
from nltk.tokenize import word_tokenize
#Install necessary libraries
!pip install nltk pandas numpy scikit-learn

#Importing  libraries
import nltk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

#Downloading  NLTK datasets for preprocessing of data(reviews)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Load dataset
df = pd.read_csv('DatasetReviewsAndSentiments.csv')

# Initialize Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.remove('not')  # Ensure 'not' is not removed

# Preprocess text function
def PreProcessText(review):
    # Tokenize and convert to lowercase
    tokens = word_tokenize(review.lower())
    # Remove stopwords and non-alphanumeric tokens, and lemmatize(converting it in to its base form)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

df['Review'] = df['Review'].apply(PreProcessText)

# Vectorize the text data with bi-grams and tri-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 3))  # Consider unigrams, bi-grams, and tri-grams
X = vectorizer.fit_transform(df['Review'])
y = df['Label']
print(y.value_counts())
# Split data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Define the model
model = MultinomialNB()




# Define the parameter grid
param_grid = {
    'alpha': [0.1, 0.5, 0.7],
    'fit_prior': [True, False]
}

#Set up GridSearchCV with stratified k-fold cross-validation
stratified_kfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(model, param_grid, cv=stratified_kfold, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

#Best parameters found by using GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")

#Now project will make predictions
y_pred = grid_search.predict(X_test)

# Evaluate the model using different metrics
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print(f"Confusion Matrix:\n{cm}")

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))





Label
positive    2791
negative    2226
Name: count, dtype: int64
Best parameters: {'alpha': 0.7, 'fit_prior': False}
Accuracy: 0.8623188405797102
Confusion Matrix:
[[653  82]
 [146 775]]
Precision: 0.8656829293427701
Recall: 0.8623188405797102
F1 Score: 0.8627129866518513

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.89      0.85       735
    positive       0.90      0.84      0.87       921

    accuracy                           0.86      1656
   macro avg       0.86      0.86      0.86      1656
weighted avg       0.87      0.86      0.86      1656



In [3]:
def predict_sentiment(review):
    review_processed = PreProcessText(review)
    review_vectorized = vectorizer.transform([review_processed])
    prediction = grid_search.predict(review_vectorized)
    return prediction[0]

# Example reviews
reviews = [
    "This app is amazing! I've never had any issues and it works perfectly.",
    "Scam! They charged me without my consent and the app doesn't even work.",
    "Great app for tracking my workouts. Highly recommend it to anyone!",
    "Totally fake! This app is just a clone of another one and it crashes all the time.",
    "I've been using this app for months and it's been very reliable and useful.",
    "Warning! This app stole my personal information. Do not download.",
    "Fantastic user interface and very helpful customer support.",
    "Terrible. It's filled with ads and doesn't do what it promises.",
    "Love the new features in the latest update. Well done!",
    "Fake reviews everywhere. This app is a complete fraud."
]

# Input by user to detect the fraud app
for review in reviews:
    predicted_sentiment = predict_sentiment(review)
    print(f"Review: {review}")
    print(f"Predicted sentiment for the review: {predicted_sentiment}")
    if predicted_sentiment == "negative":
      print("App is Fraud")
    else:
      print("App is Not Fraud")
new_review = input("Enter a review to detect its sentiment: ")
predicted_sentiment = predict_sentiment(new_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")
# Check sentiment and print appropriate message
if predicted_sentiment == "negative":
    print("App is Fraud")
else:
    print("App is Not Fraud")

Review: This app is amazing! I've never had any issues and it works perfectly.
Predicted sentiment for the review: positive
App is Not Fraud
Review: Scam! They charged me without my consent and the app doesn't even work.
Predicted sentiment for the review: negative
App is Fraud
Review: Great app for tracking my workouts. Highly recommend it to anyone!
Predicted sentiment for the review: positive
App is Not Fraud
Review: Totally fake! This app is just a clone of another one and it crashes all the time.
Predicted sentiment for the review: negative
App is Fraud
Review: I've been using this app for months and it's been very reliable and useful.
Predicted sentiment for the review: positive
App is Not Fraud
Predicted sentiment for the review: negative
App is Fraud
Review: Fantastic user interface and very helpful customer support.
Predicted sentiment for the review: positive
App is Not Fraud
Review: Terrible. It's filled with ads and doesn't do what it promises.
Predicted sentiment for the r

Predicted sentiment for the review: positive
App is Not Fraud


In [4]:
# Display TF-IDF scores for the new review
def display_tfidf_scores(review):
    review_processed = PreProcessText(review)
    review_vectorized = vectorizer.transform([review_processed])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = review_vectorized.toarray().flatten()
    tfidf_dict = {feature_names[i]: tfidf_scores[i] for i in range(len(feature_names)) if tfidf_scores[i] > 0}
    tfidf_dict_sorted = dict(sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True))

    print(f"TF-IDF scores for review: '{review}'")
    for word, score in tfidf_dict_sorted.items():
       print(f"{word}: {score}")

display_tfidf_scores(new_review)


TF-IDF scores for review: 'This app is amazing! I've never had any issues and it works perfectly.'
amazing never: 0.42260475664382485
never issue: 0.4031939637087935
work perfectly: 0.4031939637087935
app amazing: 0.3562388172507825
perfectly: 0.3294483833378078
never: 0.2842342616841985
amazing: 0.2559476776993183
issue: 0.2286340374220216
work: 0.21990577470409636
app: 0.11790564307329514


In [5]:
import joblib

# Define the file names for saving
vectorizer_filename = 'tfidf_vectorizer.joblib'
model_filename = 'sentiment_model.joblib'

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, vectorizer_filename)
print(f"Vectorizer saved to {vectorizer_filename}")

# Save the trained GridSearchCV model
joblib.dump(grid_search, model_filename)
print(f"Model saved to {model_filename}")

Vectorizer saved to tfidf_vectorizer.joblib
Model saved to sentiment_model.joblib
