# Final Project Applying NLP

## Project Description: 
Sentiment analysis: the sentiment of the textual data will be analyzed and classified into at least 3 classes.

### 1. Data Collection 

#### 1.1 Collect a dataset of product reviews

- Source data: https://www.kaggle.com/code/mehmetisik/rating-product-sorting-reviews-in-amazon/input

#### 1.2 Annotate the dataset

- With labels of positive, negative or neutral sentiment, based on
collected data

In [18]:
import pandas as pd

# Load the CSV file
file_path = "./data/reviews_aws/amazon_review.csv"
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()

# Verify that the 'overall' column exists and is numeric
if 'overall' not in data.columns:
    print("Error: 'overall' column is missing in the dataset.")
    exit()
if not pd.api.types.is_numeric_dtype(data['overall']):
    print("Error: 'overall' column must contain numeric data.")
    exit()

# Function to label sentiment
def label_sentiment(overall):
    if overall >= 4:
        return "Positive"
    elif overall == 3:
        return "Neutral"
    else:
        return "Negative"

# Add sentiment labels
data['sentiment'] = data['overall'].apply(label_sentiment)

# Save the updated file
output_path = "./data/reviews_aws/amazon_review_labeled.csv"
data.to_csv(output_path, index=False)

# Print confirmation
print(f"Labeled file saved at: {output_path}")
data.head()  # Display the first few rows of the updated dataset



Labeled file saved at: ./data/reviews_aws/amazon_review_labeled.csv


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote,sentiment
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0,Positive
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0,Positive
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0,Positive
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0,Positive
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0,Positive


### 2 Preprocessing

- Perform necessary text preprocessing steps such as tokenization, stop-word removal,
stemming/lemmatization, and lowercasing. (10)
- Remove any irrelevant columns, handle missing values, and clean text data by removing
special characters, stopwords, and performing stemming/lemmatization.
- Handle specific challenges of used text like hashtags, emojis, and slang.

In [19]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /Users/marieth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marieth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/marieth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/marieth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if not isinstance(text, str):  # Check if the value is not a string
        return ""  # Return an empty string or handle it as needed
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

# Fill NaN values in the 'reviewText' column
data['reviewText'] = data['reviewText'].fillna("")

# Apply the preprocessing function
data['processed_review'] = data['reviewText'].apply(preprocess_text)

# Check the first few rows
data.head()



Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote,sentiment,processed_review
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0,Positive,issue
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0,Positive,purchased device worked advertised never much ...
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0,Positive,work expected sprung higher capacity think mad...
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0,Positive,think worked greathad diff bran 64gb card went...
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0,Positive,bought retail packaging arrived legit orange e...


In [21]:
# Drop irrelevant columns
irrelevant_columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 
                      'reviewTime', 'day_diff', 'summary', 'helpful_yes', 'total_vote']
data = data.drop(columns=irrelevant_columns, errors='ignore')


In [22]:
# Count missing values in specific columns
missing_values = data[['processed_review', 'overall']].isnull().sum()

# Display the result
print("Missing values in each column:")
print(missing_values)


Missing values in each column:
processed_review    0
overall             0
dtype: int64


In [23]:
# Fill missing values
data['reviewText'] = data['reviewText'].fillna("")  # Replace NaN in 'reviewText'
data['overall'] = data['overall'].fillna(data['overall'].median())  # Replace NaN in 'overall'
data.head()


Unnamed: 0,reviewText,overall,sentiment,processed_review
0,No issues.,4.0,Positive,issue
1,"Purchased this for my device, it worked as adv...",5.0,Positive,purchased device worked advertised never much ...
2,it works as expected. I should have sprung for...,4.0,Positive,work expected sprung higher capacity think mad...
3,This think has worked out great.Had a diff. br...,5.0,Positive,think worked greathad diff bran 64gb card went...
4,"Bought it with Retail Packaging, arrived legit...",5.0,Positive,bought retail packaging arrived legit orange e...


In [24]:
# Count missing values in specific columns
missing_values = data[['processed_review', 'overall']].isnull().sum()

# Display the result
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
processed_review    0
overall             0
dtype: int64


In [25]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    if not isinstance(text, str):  # Handle non-string values
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

# Apply text preprocessing to 'reviewText'
data['processed_review'] = data['reviewText'].apply(preprocess_text)
####


In [26]:
# Check for missing values
print(data.isnull().sum())

# Sample cleaned data
print(data[['processed_review', 'overall', 'sentiment']].sample(5))


reviewText          0
overall             0
sentiment           0
processed_review    0
dtype: int64
                                       processed_review  overall sentiment
2     work expect sprung higher capac think made bit...      4.0  Positive
2732  work advert load fast problem pull adapt load ...      4.0  Positive
4733  product arriv time expect work exactli adverti...      5.0  Positive
2315  ill never need anotherbigg memori card fast pl...      4.0  Positive
1869  els say sandisk memori card use almost exclus ...      5.0  Positive


In [27]:
from wordsegment import load, segment

# Load wordsegment model
load()

def handle_hashtags(text):
    # Identify hashtags
    hashtags = re.findall(r"#\w+", text)
    for hashtag in hashtags:
        # Split hashtag into words
        words = " ".join(segment(hashtag[1:]))  # Remove '#' and segment
        text = text.replace(hashtag, words)
    return text


In [28]:
import emoji

def handle_emojis(text):
    # Replace emojis with their descriptions (e.g., 😊 -> ":smiling_face_with_smiling_eyes:")
    text = emoji.demojize(text, delimiters=(" ", " "))
    return text


In [29]:
# Example slang dictionary
slang_dict = {
    "u": "you",
    "ur": "your",
    "omg": "oh my god",
    "btw": "by the way",
    "idk": "I don't know",
}

def handle_slang(text):
    # Replace slang words with their standard equivalents
    words = text.split()
    replaced_words = [slang_dict.get(word.lower(), word) for word in words]
    return " ".join(replaced_words)


In [30]:
def preprocess_text_with_challenges(text):
    if not isinstance(text, str):
        return ""
    
    # Handle hashtags
    text = handle_hashtags(text)
    
    
    # Handle emojis
    text = handle_emojis(text)
    
    # Handle slang
    text = handle_slang(text)
    
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r"[^a-z\s]", "", text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    return " ".join(processed_tokens)

# Apply the combined preprocessing to 'reviewText'
print(data.head)
data['processed_review'] = data['reviewText'].apply(preprocess_text_with_challenges)
data.head()


<bound method NDFrame.head of                                              reviewText  overall sentiment  \
0                                            No issues.      4.0  Positive   
1     Purchased this for my device, it worked as adv...      5.0  Positive   
2     it works as expected. I should have sprung for...      4.0  Positive   
3     This think has worked out great.Had a diff. br...      5.0  Positive   
4     Bought it with Retail Packaging, arrived legit...      5.0  Positive   
...                                                 ...      ...       ...   
4910  I bought this Sandisk 16GB Class 10 to use wit...      1.0  Negative   
4911  Used this for extending the capabilities of my...      5.0  Positive   
4912  Great card that is very fast and reliable. It ...      5.0  Positive   
4913  Good amount of space for the stuff I want to d...      5.0  Positive   
4914  I've heard bad things about this 64gb Micro SD...      5.0  Positive   

                                 

Unnamed: 0,reviewText,overall,sentiment,processed_review
0,No issues.,4.0,Positive,issu
1,"Purchased this for my device, it worked as adv...",5.0,Positive,purchas devic work advertis never much phone m...
2,it works as expected. I should have sprung for...,4.0,Positive,work expect sprung higher capac think made bit...
3,This think has worked out great.Had a diff. br...,5.0,Positive,think work greathad diff bran gb card went sou...
4,"Bought it with Retail Packaging, arrived legit...",5.0,Positive,bought retail packag arriv legit orang envelop...


In [31]:
# Display some processed examples
print(data[['reviewText', 'processed_review']].head())

# Save the updated dataset
output_path = "./data/reviews_aws/amazon_review_preprocessed.csv"
data.to_csv(output_path, index=False)
print(f"Processed data saved at: {output_path}")


                                          reviewText  \
0                                         No issues.   
1  Purchased this for my device, it worked as adv...   
2  it works as expected. I should have sprung for...   
3  This think has worked out great.Had a diff. br...   
4  Bought it with Retail Packaging, arrived legit...   

                                    processed_review  
0                                               issu  
1  purchas devic work advertis never much phone m...  
2  work expect sprung higher capac think made bit...  
3  think work greathad diff bran gb card went sou...  
4  bought retail packag arriv legit orang envelop...  
Processed data saved at: ./data/reviews_aws/amazon_review_preprocessed.csv


### 3. Feature Extraction and Model Comparison

- Explore different feature representation methods such as bag-of-words, TF-IDF, word
embeddings (e.g., Word2Vec or GloVe), or contextual embeddings (e.g., BERT or GPT).
Experiment with 3 different feature extraction techniques to capture meaningful
representations of social media text where the 3 techniques should be of different
word embedding categories

##### 1. TF-IDF Representation

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features

# Fit and transform the 'processed_review' column
tfidf_features = tfidf_vectorizer.fit_transform(data['processed_review'].fillna('')).toarray()

# Convert to DataFrame for better visualization
import pandas as pd
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vectorizer.get_feature_names_out())

# Add TF-IDF features to the original dataset (optional)
data = pd.concat([data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)


##### 2. Word2Vec Representation

In [33]:
from gensim.models import Word2Vec

# Prepare tokenized sentences
#test 
tokenized_sentences = data['processed_review'].fillna('').apply(lambda x: x.split())

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=4)

# Generate sentence embeddings by averaging word vectors
def get_sentence_embedding(sentence):
    vectors = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if len(vectors) > 0:
        return sum(vectors) / len(vectors)
    else:
        return [0] * 100  # Vector size

data['word2vec_embedding'] = tokenized_sentences.apply(get_sentence_embedding)


##### 3. BERT Representation

In [2]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    # Use the CLS token embedding (first token) as the sentence representation
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().detach().numpy()
    return cls_embedding

# Apply BERT embedding to each review
data['bert_embedding'] = data['processed_review'].fillna('').apply(get_bert_embedding)


ModuleNotFoundError: No module named 'transformers'