#### Objective: Sentiment Analysis on Neighborhood Review Data

##### Step 1: Load required libraries/modules

In [1]:
# pip install transformers

In [2]:
# pip install torch

In [3]:
# import packages
import pandas as pd

import re
import pandas as pd
from typing import List

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the set of stop words
# nltk.download('stopwords')
# nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

from transformers import pipeline
import torch

##### Step 2: Read and setup the dataset and print the details of the first review

In [4]:
# Location of the review dataset, relative to the notebook's working directory.
file_path = 'data/4_sentimentAnalysis/Neighborhood_Review.csv'
# Load the reviews into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)
# Report (rows, columns) before previewing the first review.
print(df.shape)
df.head(n=1)

(207, 6)


Unnamed: 0,userID,userType,createdTime,categories,rating,review
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...


##### Step 3: Creating ground truth based on the rating
The rating is in the range of [1, 5], and we convert it into -1 (1 or 2 stars), 0 (3 stars) and 1 (4 or 5 stars).

In [5]:
# Function to convert rating to ground_truth
def convert_rating(rating):
    """Collapse a star rating into a three-way sentiment label.

    Returns -1 for 1 or 2 stars, 0 for 3 stars and 1 for 4 or 5 stars.
    Any other value yields None.
    """
    if rating == 3:
        return 0
    if rating in (1, 2):
        return -1
    if rating in (4, 5):
        return 1
    # Values outside the 1-5 scale get no label.
    return None

# Derive the 'ground_truth' label column from the star rating of each review.
df['ground_truth'] = df['rating'].map(convert_rating)
df.head()

Unnamed: 0,userID,userType,createdTime,categories,rating,review,ground_truth
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...,0
1,0c0ff133-4b8b-40a0-ba9d-bbf05eb9d8db,Current Resident,2017-03-28T20:01:57.185719Z,Overall Experience,4,I have lived here for over 8 years. I basicall...,1
2,5f2ee6d6-f1ad-4350-b0b1-bc8ba99177bb,Current Resident,2017-03-15T00:18:32.329913Z,Overall Experience,5,I've lived here for almost two years and love ...,1
3,04ee54e1-8876-4b1e-8d99-336782962398,Current Resident,2017-02-21T16:13:09.569592Z,Overall Experience,5,I was born and raised in this neighborhood. Th...,1
4,5ebe6d64-83d8-49f9-bb46-d3de859bb43d,Current Resident,2017-02-01T22:24:32.521606Z,Overall Experience,5,The neighborhood I live in is very calm;not to...,1


##### Step 4: Preprocess the data

In [6]:
# Function to preprocess text
def preprocess_text(text: str) -> str:
    """Normalize a review for lexicon-based matching.

    Steps, in order: lowercase, drop dotted abbreviations (e.g. "u.s.a."),
    drop punctuation, drop digits, then collapse whitespace.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove abbreviations. This has to run BEFORE punctuation is stripped:
    # the pattern relies on the dots, so running it afterwards never matches.
    # Replace with a space so the neighbouring words are not glued together.
    text = re.sub(r'\b(?:[a-zA-Z]\.){2,}', ' ', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace: collapse internal runs (left behind by the
    # removals above) as well as trimming both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
# Clean every raw review and keep the result in its own column.
df['processed_review'] = df['review'].map(preprocess_text)

# Preview the first 15 rows, including the new column.
df.head(n=15)

Unnamed: 0,userID,userType,createdTime,categories,rating,review,ground_truth,processed_review
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...,0,the community is changing with lots of diversi...
1,0c0ff133-4b8b-40a0-ba9d-bbf05eb9d8db,Current Resident,2017-03-28T20:01:57.185719Z,Overall Experience,4,I have lived here for over 8 years. I basicall...,1,i have lived here for over years i basically ...
2,5f2ee6d6-f1ad-4350-b0b1-bc8ba99177bb,Current Resident,2017-03-15T00:18:32.329913Z,Overall Experience,5,I've lived here for almost two years and love ...,1,ive lived here for almost two years and love h...
3,04ee54e1-8876-4b1e-8d99-336782962398,Current Resident,2017-02-21T16:13:09.569592Z,Overall Experience,5,I was born and raised in this neighborhood. Th...,1,i was born and raised in this neighborhood the...
4,5ebe6d64-83d8-49f9-bb46-d3de859bb43d,Current Resident,2017-02-01T22:24:32.521606Z,Overall Experience,5,The neighborhood I live in is very calm;not to...,1,the neighborhood i live in is very calmnot too...
5,9eed36d9-97a1-4bc8-be77-7d88c6995632,Current Resident,2017-01-11T22:13:59.439074Z,Overall Experience,5,"I love my neighborhood that is edgy and now, d...",1,i love my neighborhood that is edgy and now di...
6,e06aadd9-20a9-4f6c-9081-662e88488b67,Current Resident,2016-12-04T14:26:07.351541Z,Overall Experience,2,"In Bedford- Stuyvesant, I appreciate that many...",-1,in bedford stuyvesant i appreciate that many p...
7,8c0e0ecd-49e0-4501-9c76-8889e1543c5e,Current Resident,2016-11-23T18:30:22.405976Z,Overall Experience,3,There has already been 2 shootings on my block...,0,there has already been shootings on my block ...
8,3f475785-83e0-4d3b-9267-96a4fe0a1414,Former Resident,2016-10-25T17:30:50.394406Z,Overall Experience,4,There is a lot of things to see at Bedford-Stu...,1,there is a lot of things to see at bedfordstuy...
9,ad77ae98-9f7a-4790-a6db-940506d1160e,Current Resident,2016-10-21T18:29:28.691208Z,Overall Experience,4,I have lived in this area for three years now....,1,i have lived in this area for three years now ...


##### Step 5: Remove the stop words

In [8]:
# Load NLTK's English stop-word list into a set; remove_stopwords tests every
# token for membership in it.
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stopwords(text):
    """Tokenize `text` and drop every token found in the module-level `stop_words`.

    Tokens are compared in lowercase but kept in their original form. The
    surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Strip stop words from the cleaned reviews and store the result in a new column.
df['review_no_stopwords'] = df['processed_review'].map(remove_stopwords)
df.head()

Unnamed: 0,userID,userType,createdTime,categories,rating,review,ground_truth,processed_review,review_no_stopwords
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...,0,the community is changing with lots of diversi...,community changing lots diversity would like s...
1,0c0ff133-4b8b-40a0-ba9d-bbf05eb9d8db,Current Resident,2017-03-28T20:01:57.185719Z,Overall Experience,4,I have lived here for over 8 years. I basicall...,1,i have lived here for over years i basically ...,lived years basically grew although neighborho...
2,5f2ee6d6-f1ad-4350-b0b1-bc8ba99177bb,Current Resident,2017-03-15T00:18:32.329913Z,Overall Experience,5,I've lived here for almost two years and love ...,1,ive lived here for almost two years and love h...,ive lived almost two years love community grow...
3,04ee54e1-8876-4b1e-8d99-336782962398,Current Resident,2017-02-21T16:13:09.569592Z,Overall Experience,5,I was born and raised in this neighborhood. Th...,1,i was born and raised in this neighborhood the...,born raised neighborhood neighborhood always h...
4,5ebe6d64-83d8-49f9-bb46-d3de859bb43d,Current Resident,2017-02-01T22:24:32.521606Z,Overall Experience,5,The neighborhood I live in is very calm;not to...,1,the neighborhood i live in is very calmnot too...,neighborhood live calmnot noisy plethora surro...


##### Step 6: Perform sentiment analysis
###### (a): Naïve Approach:

In [9]:
# Read the sentiment lexicons: each line of a file becomes one list entry.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm the files decode identically on other machines.
with open('data/4_sentimentAnalysis/positive-words.txt') as lexicon_file:
    positive_words = lexicon_file.read().splitlines()

with open('data/4_sentimentAnalysis/negative-words.txt') as lexicon_file:
    negative_words = lexicon_file.read().splitlines()

In [10]:
# Function to assign sentiment based on word counts
def assign_sentiment(review, positive_lexicon=None, negative_lexicon=None):
    """Label a review by comparing counts of positive and negative lexicon words.

    Args:
        review: Whitespace-separated review text.
        positive_lexicon: Optional iterable of positive words. Defaults to the
            module-level `positive_words`.
        negative_lexicon: Optional iterable of negative words. Defaults to the
            module-level `negative_words`.

    Returns:
        1 if positive words outnumber negative ones, -1 for the opposite,
        and 0 when the counts are equal (including when neither occurs).
    """
    if positive_lexicon is None:
        positive_lexicon = positive_words
    if negative_lexicon is None:
        negative_lexicon = negative_words

    # Membership tests against a list scan the whole lexicon for every word;
    # a set makes each lookup constant-time without changing the result.
    if not isinstance(positive_lexicon, (set, frozenset)):
        positive_lexicon = set(positive_lexicon)
    if not isinstance(negative_lexicon, (set, frozenset)):
        negative_lexicon = set(negative_lexicon)

    # Split once and reuse the tokens for both counts.
    tokens = review.split()
    pos_count = sum(token in positive_lexicon for token in tokens)
    neg_count = sum(token in negative_lexicon for token in tokens)

    if pos_count > neg_count:
        return 1
    if neg_count > pos_count:
        return -1
    return 0

# Label every stop-word-free review with the lexicon-based sentiment.
df['Sentiment'] = df['review_no_stopwords'].map(assign_sentiment)

# Preview the first rows, including the new Sentiment column.
df.head(5)

Unnamed: 0,userID,userType,createdTime,categories,rating,review,ground_truth,processed_review,review_no_stopwords,Sentiment
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...,0,the community is changing with lots of diversi...,community changing lots diversity would like s...,1
1,0c0ff133-4b8b-40a0-ba9d-bbf05eb9d8db,Current Resident,2017-03-28T20:01:57.185719Z,Overall Experience,4,I have lived here for over 8 years. I basicall...,1,i have lived here for over years i basically ...,lived years basically grew although neighborho...,1
2,5f2ee6d6-f1ad-4350-b0b1-bc8ba99177bb,Current Resident,2017-03-15T00:18:32.329913Z,Overall Experience,5,I've lived here for almost two years and love ...,1,ive lived here for almost two years and love h...,ive lived almost two years love community grow...,1
3,04ee54e1-8876-4b1e-8d99-336782962398,Current Resident,2017-02-21T16:13:09.569592Z,Overall Experience,5,I was born and raised in this neighborhood. Th...,1,i was born and raised in this neighborhood the...,born raised neighborhood neighborhood always h...,1
4,5ebe6d64-83d8-49f9-bb46-d3de859bb43d,Current Resident,2017-02-01T22:24:32.521606Z,Overall Experience,5,The neighborhood I live in is very calm;not to...,1,the neighborhood i live in is very calmnot too...,neighborhood live calmnot noisy plethora surro...,1


###### (b) Using HuggingFace (or any other package to implement machine learning techniques for sentiment analysis) and assign a sentiment label to each review

In [11]:
# Load the sentiment analysis pipeline.
# The checkpoint is named explicitly rather than relying on the pipeline's
# default model, so the notebook keeps using the same model on re-runs.
# NOTE(review): an SST-2 checkpoint is presumably binary (POSITIVE/NEGATIVE),
# so it would have no neutral class - confirm against the model card.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

def get_sentiment(review, neutral_threshold=0.0):
    """Classify a review with the module-level `sentiment_pipeline`.

    Args:
        review: Review text to classify.
        neutral_threshold: Predictions whose confidence score is below this
            value are reported as neutral (0). The default of 0.0 never
            triggers, so the top label alone decides the result.

    Returns:
        1 for a 'POSITIVE' label, -1 for 'NEGATIVE', and 0 for any other
        label or for a prediction below `neutral_threshold`.
    """
    # truncation=True clips reviews that exceed the model's maximum input
    # length instead of letting the pipeline fail on them.
    result = sentiment_pipeline(review, truncation=True)
    top_prediction = result[0]
    label = top_prediction['label']

    # Give low-confidence predictions a way to land in the neutral class,
    # which the model's own labels may not provide.
    if top_prediction.get('score', 1.0) < neutral_threshold:
        return 0

    if label == 'POSITIVE':
        return 1
    elif label == 'NEGATIVE':
        return -1
    else:
        return 0

# Run the model on the raw (unprocessed) review text of every row.
df['ML_Sentiment'] = df['review'].map(get_sentiment)

In [12]:
# Preview the first two rows to compare the new ML_Sentiment column with the others.
df.head(2)

Unnamed: 0,userID,userType,createdTime,categories,rating,review,ground_truth,processed_review,review_no_stopwords,Sentiment,ML_Sentiment
0,70017df6-e055-4776-8c48-7347a2b2be44,Current Resident,2017-05-02T17:05:51.560792Z,Overall Experience,3,The community is changing with lots of diversi...,0,the community is changing with lots of diversi...,community changing lots diversity would like s...,1,1
1,0c0ff133-4b8b-40a0-ba9d-bbf05eb9d8db,Current Resident,2017-03-28T20:01:57.185719Z,Overall Experience,4,I have lived here for over 8 years. I basicall...,1,i have lived here for over years i basically ...,lived years basically grew although neighborho...,1,1


##### Step 7: Finally, evaluate results by comparing labels with the “ground truth” labels

In [13]:
# Accuracy of the lexicon-based labels: percentage of reviews whose
# 'Sentiment' equals the rating-derived 'ground_truth'.
correct_predictions = df['Sentiment'].eq(df['ground_truth']).sum()
total_predictions = df.shape[0]
accuracy = correct_predictions / total_predictions * 100

print(f'Accuracy from Naive approach: {accuracy:.2f}')

Accuracy from Naive approach: 43.96


In [14]:
# Accuracy of the model labels: percentage of reviews whose
# 'ML_Sentiment' equals the rating-derived 'ground_truth'.
correct_predictions = df['ML_Sentiment'].eq(df['ground_truth']).sum()
total_predictions = df.shape[0]
accuracy = correct_predictions / total_predictions * 100

print(f'Accuracy from ML approach: {accuracy:.2f}')

Accuracy from ML approach: 36.23


###### With the present approach, the Naive approach appears to be better than the ML approach. However, both perform poorly, as the accuracy is only around 40% in either case. Let's try cleaning the data a bit to find a better approach.

In [15]:
# Re-run the model, this time on the reviews with stop words removed.
df['ML_Sentiment_no_stopwords'] = df['review_no_stopwords'].map(get_sentiment)

# Accuracy of the model on the stop-word-free text, as a percentage.
correct_predictions = df['ML_Sentiment_no_stopwords'].eq(df['ground_truth']).sum()
total_predictions = df.shape[0]
accuracy = correct_predictions / total_predictions * 100

print(f'Accuracy from ML approach with stopwords removed dataframe: {accuracy:.2f}')

Accuracy from ML approach with stopwords removed dataframe: 34.78


###### Again a disaster :(
###### I think the ML approach might need more data points to conduct the analysis. If there is no limitation on computational speed and the dataset is large, a neural network would probably work better, especially if the model's hyperparameters can be fine-tuned.
###### Note also that the SST-2 model used here appears to predict only POSITIVE or NEGATIVE, so it can never match the neutral (0) labels in the ground truth; this alone limits the accuracy it can reach.
###### The Naive approach may have done better because we used the supplied domain-specific lexicon, i.e. the lexicon was in alignment with how sentiment is expressed in the given text, whereas the ML model uses a more generalised approach, is not domain-specific, and might not understand these terms in the context of the reviews.