Group Project : Sentiment Analysis on Amazon Reviews

Group Members : Jay Gondalia (1196220)
                
                Zeel Parekh (1196109)



In [None]:
!pip install nltk
!pip install textblob
!pip install wordcloud

In [None]:
# Import necessary libraries
import spacy

from spacy import displacy


from warnings import filterwarnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")





In [None]:
# Download NLTK stopwords
# NOTE(review): the import cell already calls nltk.download("stopwords"),
# so this second download looks redundant.
nltk.download('stopwords')




In [None]:
# Load the dataset: only the first 2,100 rows of the CSV are read (nrows=2100).
# The path is relative, so the file must sit in the notebook's working directory.

df = pd.read_csv('amazon_reviews.csv', nrows=2100)



In [None]:
# Visualization of the dataset features
# Drop rows without review text. The explicit .copy() gives an independent
# frame, so adding a column below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['reviewText']).copy()

# Character length of each review (NaN for any non-string value)
df['reviewText_length'] = df['reviewText'].apply(lambda x: len(x) if isinstance(x, str) else np.nan)


# Histogram of review-text lengths
plt.figure(figsize=(10, 6))
sns.histplot(df['reviewText_length'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Review Text Lengths')
plt.xlabel('Review Text Length (characters)')
plt.ylabel('Frequency')
plt.show()

# NOTE(review): despite its name, this Series holds the character length of
# each review; no named-entity extraction happens here. The name is kept so
# that any cell relying on it keeps working.
named_entities_count = df['reviewText'].apply(len)


In [None]:
# Load the 'en_core_web_lg' spaCy pipeline. The module-level `nlp` object is
# used by the word-vector, syntax and displaCy helper functions below.
# NOTE(review): the model package must already be installed in the kernel's
# environment; nothing in this notebook downloads it.
# NOTE: `nlp` is rebound to a blank pipeline in the "NLP SPACY" section.
nlp = spacy.load('en_core_web_lg')

In [None]:


def text_preprocessing(df, reviewText):
    """Clean a text column for bag-of-words modelling.

    Steps, in order: lowercase and collapse whitespace, strip punctuation,
    strip digits, drop English stop words, drop words that occur only once
    in the whole column, and lemmatize with WordNet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    reviewText : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned column written back.
    """
    column = reviewText

    # Normalizing Case Folding - Uppercase to Lowercase (also collapses whitespace)
    text = df[column].apply(lambda review: " ".join(str(review).lower().split()))

    # Removing Punctuation. regex=True is required: since pandas 2.0 the
    # default is a literal match, which would leave all punctuation in place.
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    # Removing Numbers
    text = text.str.replace(r'\d', '', regex=True)

    # StopWords (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    text = text.apply(lambda review: " ".join(word for word in review.split() if word not in stop_words))

    # Remove Rare Words: anything that occurs at most once across the column
    word_counts = pd.Series(' '.join(text).split()).value_counts()
    rare_words = set(word_counts[word_counts <= 1].index)
    text = text.apply(lambda review: " ".join(word for word in review.split() if word not in rare_words))

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    text = text.apply(lambda review: " ".join(lemmatizer.lemmatize(word) for word in review.split()))

    df[column] = text
    return df

In [None]:

# Clean the review text; text_preprocessing writes the cleaned text back into
# the frame it is given and returns that frame.
df = text_preprocessing(df, "reviewText")


Method 1: SpaCy Word Embeddings :

In [None]:
# Working with Word Vectors
def generate_word_vectors(reviewText):
    """Return the vector of every token in ``reviewText``, in token order.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    """
    vectors = []
    for token in nlp(reviewText):
        vectors.append(token.vector)
    return vectors



In [None]:
# Generate per-token word vectors for the reviewText column (one list of
# vectors per review) and store them in a new column.
df['word_vectors_reviewText'] = df['reviewText'].apply(generate_word_vectors)



In [None]:
# Display vectors for the first few reviews only. Printing the vectors of
# every review floods the output and bloats the saved notebook.
for i, vector_list in enumerate(df['word_vectors_reviewText'].head(3)):
    print(f"Word vectors for reviewText {i + 1}:", vector_list)

In [None]:
# Syntax and Structure Analysis
def construct_syntax_trees(reviewText):
    """Return the root token of each sentence spaCy finds in ``reviewText``.

    The text is parsed with the module-level pipeline ``nlp``.
    """
    parsed = nlp(reviewText)
    roots = []
    for sentence in parsed.sents:
        roots.append(sentence.root)
    return roots



In [None]:
# Collect the sentence-root tokens for every entry of the reviewText column
# and store them in a new column.
df['syntax_trees_reviewText'] = df['reviewText'].apply(construct_syntax_trees)

In [None]:
# Visualization
def visualize_nlp_results(reviewText):
    """Render displaCy views of ``reviewText``: named entities, then the dependency parse.

    The text is processed with the module-level pipeline ``nlp``.
    """
    parsed = nlp(reviewText)

    # Same two renderings as before, in the same order
    for view in ('ent', 'dep'):
        displacy.render(parsed, style=view)


In [None]:
# Visualize named entities and the dependency parse for a small sample of the
# reviewText column. Rendering every review produces thousands of figures,
# and .apply() would also echo a Series of None as the cell output.
for sample_review in df['reviewText'].head(3):
    visualize_nlp_results(sample_review)


In [None]:
def text_visulaization(df, reviewText, wordcloud=True):
  """Compute term frequencies for a text column and optionally draw a word cloud.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the text column.
  reviewText : str
      Name of the text column.
  wordcloud : bool, default True
      When true, show a word cloud of the column and save it to
      "wordcloud.png" in the working directory.

  Returns
  -------
  pandas.DataFrame
      Columns "words" and "tf", most frequent word first. This table was
      previously computed and then discarded.
  """
  # Calculation of Term Frequencies. explode() avoids building the wide
  # (rows x longest review) frame that split(expand=True) creates.
  tf = df[reviewText].str.split().explode().value_counts().reset_index()
  tf.columns = ["words", "tf"]

  if wordcloud:
    # WordCloud. The object gets its own name so it no longer shadows the
    # boolean `wordcloud` flag.
    text = " ".join(df[reviewText])
    cloud = WordCloud(max_font_size=100, max_words=1000, background_color="white").generate(text)
    plt.figure(figsize=[10, 10])
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Calculation of Term Frequencies : wordcloud")
    plt.show()
    cloud.to_file("wordcloud.png")

  return tf


In [None]:
# Draw the word cloud for the cleaned reviews (the function also saves it to
# "wordcloud.png").
text_visulaization(df, "reviewText")

In [None]:
def create_polarity_scores(df, reviewText):
  """Add a "polarity_score" column holding VADER's compound score per text.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to annotate. It is modified in place.
  reviewText : str
      Name of the text column to score.
  """
  sia = SentimentIntensityAnalyzer()
  # Honour the column name passed by the caller instead of a hard-coded one
  df["polarity_score"] = df[reviewText].apply(lambda text: sia.polarity_scores(text)["compound"])

# Adds the "polarity_score" column to df in place (the function returns nothing)
create_polarity_scores(df, "reviewText")

# Preview the first rows, now including the new column
df.head()

In [None]:
# Create Labels
def create_label(df, reviewText, sentiment_label):
  """Derive a binary sentiment label from VADER and return features and target.

  A text is labelled 1 (positive) when its compound polarity score is
  strictly greater than 0, otherwise 0 (negative). The mapping is written
  explicitly rather than through LabelEncoder: the encoder assigns codes in
  sorted order of the classes it sees, so a sample with no negative rows
  would encode "pos" as 0 and break every later ``== 1`` check.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the text column. The label column is added in place.
  reviewText : str
      Name of the text column.
  sentiment_label : str
      Name of the label column to create.

  Returns
  -------
  (pandas.Series, pandas.Series)
      The text column and the integer label column.
  """
  sia = SentimentIntensityAnalyzer()
  compound = df[reviewText].apply(lambda text: sia.polarity_scores(text)["compound"])
  df[sentiment_label] = (compound > 0).astype(int)

  X = df[reviewText]
  y = df[sentiment_label]

  return X, y
# X is the review text, y the 0/1 sentiment label derived from VADER
X, y = create_label(df, "reviewText", "sentiment_label")

In [None]:
# Split Dataset
def split_dataset(dataframe, X, y):
  """Split features and target into train and test parts with a fixed seed.

  ``dataframe`` is accepted for interface compatibility but is not used.
  Returns the tuple (train_x, test_x, train_y, test_y).
  """
  partitions = train_test_split(X, y, random_state=1)
  return tuple(partitions)

# random_state=1 inside split_dataset makes this split repeatable.
# NOTE: train_y and test_y are read as module-level names by the
# model-building functions defined below.
train_x, test_x, train_y, test_y = split_dataset(df, X, y)

In [None]:
def create_features_count(train_x, test_x):
  """Build bag-of-words count matrices for the train and test texts.

  The vocabulary is learned from ``train_x`` only and then applied to
  ``test_x``, so both matrices have the same columns in the same order.

  Returns
  -------
  (sparse matrix, sparse matrix)
      Train and test count matrices.
  """
  # Count Vectors
  vectorizer = CountVectorizer()
  x_train_count_vectorizer = vectorizer.fit_transform(train_x)
  # transform, not fit_transform: refitting on the test texts would build a
  # different vocabulary, so a model fitted on the training matrix could not
  # be applied to the test matrix.
  x_test_count_vectorizer = vectorizer.transform(test_x)

  return x_train_count_vectorizer, x_test_count_vectorizer

In [None]:
# Count matrices for the train/test texts.
# NOTE(review): each model-building function below builds its own feature
# matrices, so these module-level copies do not appear to be used by later cells.
x_train_count_vectorizer, x_test_count_vectorizer = create_features_count(train_x, test_x)


Create Model

In [None]:
# Logistic Regression
def crate_model_logistic(train_x, test_x, train_labels=None, test_labels=None):
  """Fit a logistic-regression classifier on count features and report test accuracy.

  Parameters
  ----------
  train_x, test_x : iterable of str
      Training and held-out review texts.
  train_labels, test_labels : array-like, optional
      Targets aligned with ``train_x`` / ``test_x``. When omitted, the
      module-level ``train_y`` / ``test_y`` are used, as before.

  Returns
  -------
  LogisticRegression
      The model fitted on the training texts.
  """
  if train_labels is None:
    train_labels = train_y
  if test_labels is None:
    test_labels = test_y

  # Count. The vocabulary is learned on the training texts and reused for the
  # test texts, so both matrices share columns and the fitted model can score
  # the held-out set directly.
  vectorizer = CountVectorizer()
  x_train_counts = vectorizer.fit_transform(train_x)
  x_test_counts = vectorizer.transform(test_x)

  loj_model_count = LogisticRegression(solver='lbfgs', max_iter=1000).fit(x_train_counts, train_labels)

  # Accuracy of the fitted model on unseen data. Cross-validating on the test
  # matrix instead would retrain fresh clones on test data and say nothing
  # about the model returned here.
  accuracy_count = loj_model_count.score(x_test_counts, test_labels)
  print("Accuracy - Count Vectors: %.3f" % accuracy_count)

  return loj_model_count


In [None]:

# Train the logistic-regression model; the function prints its accuracy
loj_model_count = crate_model_logistic(train_x, test_x)

In [None]:
def crate_model_randomforest(train_x, test_x, train_labels=None, test_labels=None):
  """Fit a random-forest classifier on count features and report test accuracy.

  Parameters
  ----------
  train_x, test_x : iterable of str
      Training and held-out review texts.
  train_labels, test_labels : array-like, optional
      Targets aligned with ``train_x`` / ``test_x``. When omitted, the
      module-level ``train_y`` / ``test_y`` are used, as before.

  Returns
  -------
  RandomForestClassifier
      The model fitted on the training texts.
  """
  if train_labels is None:
    train_labels = train_y
  if test_labels is None:
    test_labels = test_y

  # Count. The vocabulary is learned on the training texts and reused for the
  # test texts, so the fitted model can score the held-out set directly.
  vectorizer = CountVectorizer()
  x_train_counts = vectorizer.fit_transform(train_x)
  x_test_counts = vectorizer.transform(test_x)

  # Seeded so that a re-run reproduces the same forest and the same accuracy
  rf_model_count = RandomForestClassifier(random_state=1).fit(x_train_counts, train_labels)

  # Score the fitted model on unseen data rather than cross-validating fresh
  # clones on the test set.
  accuracy_count = rf_model_count.score(x_test_counts, test_labels)
  print("Accuracy - Count Vectors: %.3f" % accuracy_count)

  return rf_model_count


In [None]:

# Train the untuned random-forest model; the function prints its accuracy
rf_model_count = crate_model_randomforest(train_x, test_x)

In [None]:
def model_tuning_randomforest(train_x, test_x, train_labels=None, test_labels=None):
  """Grid-search a random forest on count features and report test accuracy.

  Parameters
  ----------
  train_x, test_x : iterable of str
      Training and held-out review texts.
  train_labels, test_labels : array-like, optional
      Targets aligned with ``train_x`` / ``test_x``. When omitted, the
      module-level ``train_y`` / ``test_y`` are used, as before.

  Returns
  -------
  RandomForestClassifier
      The best configuration found, refitted on all training texts.
  """
  if train_labels is None:
    train_labels = train_y
  if test_labels is None:
    test_labels = test_y

  # Count. The vocabulary is learned on the training texts and reused for the
  # test texts, so the tuned model can score the held-out set directly.
  vectorizer = CountVectorizer()
  x_train_counts = vectorizer.fit_transform(train_x)
  x_test_counts = vectorizer.transform(test_x)

  rf_params = {
        "max_depth": [2, 5, 8, None],
        "max_features": [2, 5, 8, "sqrt", "log2"],
        "n_estimators": [100, 500, 1000],
        "min_samples_split": [2, 5, 10]
    }

  rf_best_grid = GridSearchCV(RandomForestClassifier(random_state=1), rf_params, cv=10, n_jobs=-1, verbose=False).fit(x_train_counts, train_labels)

  # GridSearchCV (refit=True by default) has already refitted the best
  # configuration on the full training data; reuse it rather than fit again.
  rf_model_count_final = rf_best_grid.best_estimator_

  # Score the tuned model on unseen data rather than cross-validating fresh
  # clones on the test set.
  accuracy_count = rf_model_count_final.score(x_test_counts, test_labels)
  print("Accuracy - Count Vectors: %.3f" % accuracy_count)

  return rf_model_count_final


In [None]:
# Tune the random forest. NOTE: the grid in model_tuning_randomforest has
# 4 * 5 * 3 * 3 = 180 combinations, each evaluated with cv=10, so this cell is slow.
rf_model_count_final = model_tuning_randomforest(train_x, test_x)

In [None]:
def predict_count(train_x, model, new_comment, vectorizer=None):
  """Print the predicted sentiment of one or more comments.

  Parameters
  ----------
  train_x : iterable of str
      Training texts. Used to fit a CountVectorizer when ``vectorizer`` is
      not supplied, and ignored otherwise.
  model : fitted classifier
      Must expose ``predict`` and have been trained on count features built
      from ``train_x``.
  new_comment : str or iterable of str
      Comment(s) to classify.
  vectorizer : fitted vectorizer, optional
      Pass an already-fitted vectorizer to avoid refitting on every call,
      for example when classifying inside a loop.

  Prints one line per comment: "Comment is Positive" when the predicted
  label is 1, otherwise "Comment is Negative".
  """
  if vectorizer is None:
    vectorizer = CountVectorizer().fit(train_x)

  features = vectorizer.transform(pd.Series(new_comment))

  # Iterate over the predictions: comparing the whole array to 1 inside an
  # `if` only works when exactly one comment is passed.
  for label in model.predict(features):
    if label == 1:
      print("Comment is Positive")
    else:
      print("Comment is Negative")


In [None]:
# Logistic Regression: classify a comment that mixes positive and negative wording
predict_count(train_x, model=loj_model_count, new_comment="this product is very good but i hate it :)")

In [None]:
# Random Forest: this uses the untuned model (rf_model_count), not rf_model_count_final
predict_count(train_x, model=rf_model_count, new_comment="this product is very bad :)")

In [None]:
# Classify every review in the dataset with the (untuned) random forest.
# NOTE(review): df also contains the rows the model was trained on, so this
# output is an illustration, not an evaluation.
print(len(df["reviewText"]))

# Fit the vectorizer once and predict in a single batch. Calling
# predict_count() per row refits the vectorizer on train_x for every review.
review_vectorizer = CountVectorizer().fit(train_x)
review_predictions = rf_model_count.predict(review_vectorizer.transform(df["reviewText"]))

# The loop variable keeps the name `new_comment` because a later cell reads
# it after this loop has finished.
for new_comment, predicted_label in zip(df["reviewText"], review_predictions):
    print("\n",new_comment)
    # Review - Random Forest
    print("Comment is Positive" if predicted_label == 1 else "Comment is Negative")


In [None]:
# Importing SVM classifier
from sklearn.svm import SVC

# Function to create SVM model
def create_model_svm(train_x, test_x, train_labels=None, test_labels=None):
    """Fit a linear-kernel SVM on count features and report test accuracy.

    Parameters
    ----------
    train_x, test_x : iterable of str
        Training and held-out review texts.
    train_labels, test_labels : array-like, optional
        Targets aligned with ``train_x`` / ``test_x``. When omitted, the
        module-level ``train_y`` / ``test_y`` are used, as before.

    Returns
    -------
    SVC
        The model fitted on the training texts.
    """
    if train_labels is None:
        train_labels = train_y
    if test_labels is None:
        test_labels = test_y

    # Count Vectors. The vocabulary is learned on the training texts and
    # reused for the test texts, so the fitted model can score the held-out
    # set directly.
    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_x)
    x_test_counts = vectorizer.transform(test_x)

    # Initialize and fit the SVM classifier
    svm_model = SVC(kernel='linear')
    svm_model.fit(x_train_counts, train_labels)

    # Score the fitted model on unseen data rather than cross-validating
    # fresh clones on the test set.
    accuracy_count = svm_model.score(x_test_counts, test_labels)
    print("Accuracy - Count Vectors (SVM): %.3f" % accuracy_count)

    return svm_model

# Create the baseline SVM model (linear kernel); the function prints its accuracy
svm_model_count = create_model_svm(train_x, test_x)

# Function for SVM model tuning
def model_tuning_svm(train_x, test_x, train_labels=None, test_labels=None):
    """Grid-search an SVM on count features and report test accuracy.

    Parameters
    ----------
    train_x, test_x : iterable of str
        Training and held-out review texts.
    train_labels, test_labels : array-like, optional
        Targets aligned with ``train_x`` / ``test_x``. When omitted, the
        module-level ``train_y`` / ``test_y`` are used, as before.

    Returns
    -------
    SVC
        The best configuration found, refitted on all training texts.
    """
    if train_labels is None:
        train_labels = train_y
    if test_labels is None:
        test_labels = test_y

    # Count Vectors. The vocabulary is learned on the training texts and
    # reused for the test texts, so the tuned model can score the held-out
    # set directly.
    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_x)
    x_test_counts = vectorizer.transform(test_x)

    # Define parameters for grid search. gamma has no effect on a linear
    # kernel, so it is searched only for 'rbf'. This covers the same distinct
    # models without repeating each linear fit once per gamma value.
    svm_params = [
        {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
    ]

    # Perform grid search
    svm_best_grid = GridSearchCV(SVC(kernel='linear'), svm_params, cv=10, n_jobs=-1, verbose=False)
    svm_best_grid.fit(x_train_counts, train_labels)

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full training data; reuse it rather than fit again.
    svm_model_final = svm_best_grid.best_estimator_

    # Score the tuned model on unseen data rather than cross-validating fresh
    # clones on the test set.
    accuracy_count = svm_model_final.score(x_test_counts, test_labels)
    print("Accuracy - Count Vectors (SVM - Tuned): %.3f" % accuracy_count)

    return svm_model_final






In [None]:
# Tune SVM model (grid search with cv=10, so this may take a while);
# the function prints the tuned model's accuracy
svm_model_count_final = model_tuning_svm(train_x, test_x)



In [None]:
# Prediction function for SVM model
def predict_count_svm(train_x, model, new_comment):
    """Print the predicted sentiment of ``new_comment`` using an SVM model.

    This used to be a line-for-line copy of ``predict_count``; it now
    delegates to that function so the prediction logic lives in one place.
    The name is kept for existing callers.
    """
    predict_count(train_x, model=model, new_comment=new_comment)

In [None]:
# Sample Review - SVM
# Select the review explicitly instead of relying on a variable left over
# from a loop in an earlier cell. The last review in df is the value that
# loop left behind, so the result is unchanged under Run All.
new_comment = df["reviewText"].iloc[-1]
print(new_comment)
predict_count_svm(train_x, model=svm_model_count_final, new_comment=new_comment)

NLP SPACY

In [None]:

from spacy.pipeline.textcat import Config, single_label_cnn_config
from spacy.training.example import Example
from spacy.util import minibatch

In [None]:
# Start from a blank English pipeline.
# NOTE: this rebinds the module-level `nlp`, replacing the 'en_core_web_lg'
# pipeline loaded earlier. Re-running the earlier word-vector / syntax /
# displaCy cells after this point would use this blank pipeline instead.
nlp = spacy.blank('en')

# Add the TextCategorizer to the pipeline (or fetch it if already present)
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.add_pipe('textcat', last=True)
else:
    textcat = nlp.get_pipe('textcat')

# NOTE(review): 'Pozitive' looks like a misspelling, but df_to_spacy_format
# uses the same spelling for its category keys, so the strings must be
# changed together if they are ever corrected.
unique_labels = ['Pozitive','Negative']

# Register each label with the text categorizer
for label in unique_labels:
    textcat.add_label(label)


In [None]:

def df_to_spacy_format(df):
    """Convert a labelled frame into ``(text, {"cats": {...}})`` training tuples.

    For every row, the category matching the row's ``sentiment_label``
    ('Pozitive' when the label equals 1, otherwise 'Negative') is set to 1
    and the other category to 0.
    """
    categories = ('Pozitive', 'Negative')
    records = []
    for _, review in df.iterrows():
        review_text = review['reviewText']
        active = categories[0] if review['sentiment_label'] == 1 else categories[1]
        # One-hot dictionary over the two categories, in a fixed key order
        one_hot = {category: (1 if category == active else 0) for category in categories}
        records.append((review_text, {"cats": one_hot}))
    return records



In [None]:
# Convert the labelled DataFrame into spaCy training tuples
new_data = df_to_spacy_format(df)

# Print a short preview only; printing every example floods the output
for text, cats_dict in new_data[:5]:
    print(f"Text: {text}")
    print(f"Categories: {cats_dict['cats']}")
    print("----------------------------")

In [None]:
import random

# Shuffle in place with a fixed seed so the train/test split taken from this
# list is the same on every Restart & Run All.
random.Random(1).shuffle(new_data)

# Print a short preview of the shuffled data
for text, cats_dict in new_data[:5]:
    print(f"Text: {text}")
    print(f"Categories: {cats_dict['cats']}")
    print("----------------------------")

In [None]:
# Use the first 70% of the shuffled examples for training
train_size = int(0.7 * len(new_data))

# Split the list into two parts
train_data = new_data[:train_size]
test_data = new_data[train_size:]

# Show the split sizes instead of echoing the whole training list
len(train_data), len(test_data)

In [None]:
def nn_spacy(num_epochs, batch_size, data_train):
    """Train the text categorizer of the module-level ``nlp`` pipeline.

    Parameters
    ----------
    num_epochs : int
        Number of passes over ``data_train``.
    batch_size : int
        Size passed to spaCy's ``minibatch``.
    data_train : list of (text, annotations)
        Training tuples, as produced by ``df_to_spacy_format``.

    Returns
    -------
    (list, Language)
        The 'textcat' loss recorded after each epoch, and the module-level
        ``nlp`` object (trained in place, not a copy).
    """
    # Training the text categorization model
    sgd = nlp.initialize()
    loss_history = []
    for _ in range(num_epochs):  # Number of training epochs
        epoch_losses = {}
        for chunk in minibatch(data_train, size=batch_size):
            examples = []
            for text, annotations in chunk:
                examples.append(Example.from_dict(nlp.make_doc(text), annotations))
            nlp.update(examples, drop=0.2, losses=epoch_losses, sgd=sgd)
        textcat_loss = epoch_losses['textcat']
        print(textcat_loss)
        loss_history.append(textcat_loss)

    return loss_history, nlp

In [None]:
## Train the neural network with these hyper parameters and plot loss during the training
# number of epochs: 4, batch size: 10 (the second argument passed to nn_spacy)
# NOTE: nn_spacy returns the module-level `nlp`, so nlp1 is that same object.
num_epochs = 4
tot_loss1, nlp1 = nn_spacy(num_epochs,10,train_data)

In [None]:
# Plot loss during training, one point per epoch, using explicit Axes
epochs = range(1, num_epochs + 1)
fig, ax = plt.subplots()
ax.plot(epochs, tot_loss1, marker='o')
ax.set_title('Loss During Training')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_xticks(epochs)
ax.grid(True)
plt.show()

In [None]:
def find_marked_categories(cat_dict):
    """Return the first category in ``cat_dict['cats']`` whose value equals 1.

    Raises IndexError when no category is marked.
    """
    marked = []
    for name, flag in cat_dict['cats'].items():
        if flag == 1:
            marked.append(name)
    return marked[0]


def predict_and_evaluate(model, test_data):
    """Return the accuracy of ``model`` on ``test_data``.

    Parameters
    ----------
    model : callable
        Called on each text; the result must expose ``.cats``, a mapping of
        category name to score.
    test_data : sequence of (text, {"cats": {...}})
        Held-out examples with their gold categories.

    Returns
    -------
    float
        Fraction of examples whose highest-scoring category equals the
        marked gold category; 0.0 when ``test_data`` is empty.
    """
    # Guard against division by zero on an empty evaluation set
    if len(test_data) == 0:
        return 0.0

    correct_predictions = 0
    for text, true_labels in test_data:
        scores = model(text).cats
        highest_category = max(scores, key=scores.get)
        if find_marked_categories(true_labels) == highest_category:
            correct_predictions += 1

    return correct_predictions / len(test_data)


In [None]:
# Fraction of held-out examples (test_data) that the trained text categorizer labels correctly
predict_and_evaluate(nlp1,test_data)
