In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import pos_tag
import os

In [2]:
# Fetch the NLTK resources used by the cells below: the stop-word list, the
# tokenizer model, the POS-tagger model and the VADER sentiment lexicon.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Kept as the cell's last expression so its return value is still displayed.
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [19]:
# Folder containing the raw review CSV files
csv_files_path = "./../data/CISC 351 Airbnb Data/reviews/"

dfs = []

# Read every raw CSV in the folder. The listing is sorted because os.listdir
# returns entries in arbitrary order, which would otherwise make the row order
# (and the index assigned by ignore_index=True) differ between machines/runs.
for file in sorted(os.listdir(csv_files_path)):
    # 'processed_reviews.csv' is this notebook's own output, written into the
    # same folder further down; skip it so a re-run does not ingest it again.
    if file.endswith('.csv') and file != 'processed_reviews.csv':
        print(file)
        # Use a dedicated name for the per-file frame so it is never confused
        # with the concatenated result below.
        reviews_df = pd.read_csv(os.path.join(csv_files_path, file))
        dfs.append(reviews_df)

# Fail with an actionable message instead of pd.concat's generic error
if not dfs:
    raise ValueError(f"No review CSV files found in {csv_files_path!r}")

# Concatenate all DataFrames in the list into one, renumbering the rows
combined_df = pd.concat(dfs, ignore_index=True)

montreal_reviews.csv
nyc_reviews.csv
toronto_reviews.csv


In [22]:
# English stop words, held in a set so the per-token membership test is cheap
stop_words = {word for word in stopwords.words('english')}

# Single VADER analyzer instance shared by every sentiment computation below
analyzer = SentimentIntensityAnalyzer()

In [23]:
def remove_non_alphanumeric(text):
    """Strip every character that is neither alphanumeric nor whitespace.

    Args:
        text: The value to clean. Only strings (including str subclasses)
            are processed.

    Returns:
        The cleaned string, or None when ``text`` is not a string (for
        example a missing value), so callers can drop those entries.
    """
    # isinstance (rather than an exact type check) also accepts str subclasses
    if not isinstance(text, str):
        return None
    return ''.join(char for char in text if char.isalnum() or char.isspace())

In [24]:
# Strip punctuation/symbols from every review. Non-string entries come back
# as None from remove_non_alphanumeric.
combined_df['processed_comments'] = combined_df['comments'].apply(remove_non_alphanumeric)

# Drop the reviews that have no usable text. Assigning `Series.dropna()` back
# to the column does nothing (index alignment re-inserts the missing rows), so
# the rows are removed from the frame itself. `.copy()` gives the later cells
# an independent frame to add columns to.
combined_df = combined_df.dropna(subset=['processed_comments']).copy()

In [25]:
def tokenize_and_remove_stopwords(review):
    """Tokenize a review and keep only its alphabetic, non-stop-word tokens.

    Args:
        review: The review text. Non-string values (None, NaN, numbers)
            are treated as "no text".

    Returns:
        A list of lower-cased tokens that are purely alphabetic and not in
        ``stop_words``; an empty list when ``review`` is not a string.
    """
    # Coercing with str() would turn NaN/None into the bogus words
    # 'nan'/'none', which then leak into the token lists and word totals.
    if not isinstance(review, str):
        return []
    tokens = word_tokenize(review.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]

In [26]:
def remove_adjectives(words_list):
    """Return the words of ``words_list`` that are not tagged as adjectives.

    Args:
        words_list: List of word tokens; it is passed to ``pos_tag`` as a
            single sequence.

    Returns:
        A list with the same words, in order, minus every word whose
        part-of-speech tag is an adjective tag.
    """
    tagged_words = pos_tag(words_list)
    # Penn Treebank has three adjective tags: JJ, JJR (comparative) and
    # JJS (superlative). Matching the prefix removes all of them, not only
    # plain 'JJ'.
    return [word for word, pos in tagged_words if not pos.startswith('JJ')]

In [27]:
# Words excluded from the word/sentiment totals by remove_filler_words.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated ('loved' 'away' becomes the single entry 'lovedaway').
filler_words = ['place', 'stay', 'location', 'would', 'everything', 'definitely', 'well', 'us', 'really', 'also', 'time', 'highly',
                'amazing', 'like', 'needed', 'back', 'city', 'staying', 'made', 'check', 'one', 'lovely', 'get', 'close', 'even',
                'best', 'loved', 'away', 'enjoyed', 'recommend', 'need', 'around', 'felt', 'could', 'super', 'always',
                'perfect', 'stayed', 'kind', 'day', 'night', 'thank', 'airbnb', 'two', 'questions', 'come']

def remove_filler_words(tokens):
    """Return ``tokens`` without the entries listed in ``filler_words``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Built per call so later edits to filler_words are honoured, while each
    # membership test stays constant-time.
    fillers = frozenset(filler_words)
    return [token for token in tokens if token not in fillers]

In [28]:
# Build the per-review token list in three passes: tokenize and drop stop
# words, then remove adjectives, then remove filler words.
# No dropna() follows: each helper returns a list, so there are no missing
# values to drop, and assigning a dropna() result back to the column would be
# a no-op anyway because of index alignment.
combined_df['tokens'] = (
    combined_df['processed_comments']
    .apply(tokenize_and_remove_stopwords)
    .apply(remove_adjectives)
    .apply(remove_filler_words)
)

In [29]:
def _compound_score(comment):
    """Return the VADER 'compound' score of ``comment`` coerced to a string."""
    scores = analyzer.polarity_scores(str(comment))
    return scores['compound']

# NOTE(review): scores are computed on 'processed_comments', i.e. after
# remove_non_alphanumeric stripped punctuation. VADER is documented to use
# punctuation as an intensity cue — confirm that scoring the cleaned text
# rather than the raw 'comments' column is intended.
combined_df['sentiment'] = combined_df['processed_comments'].apply(_compound_score)

In [30]:
# Persist the enriched frame next to the raw inputs. The loader cell skips
# this file name so it is not read back in as raw data on a re-run.
# The DataFrame index is written as the first, unnamed column (to_csv default).
processed_reviews_path = os.path.join(csv_files_path, "processed_reviews.csv")
combined_df.to_csv(processed_reviews_path)

In [31]:
def count_positive_word_sentiment(df, word_counts, threshold):
    """Add sentiment weight to each word of the reviews scoring above ``threshold``.

    This accumulates sentiment magnitude, not occurrences: every token of a
    qualifying review adds ``abs(sentiment)`` to that word's running total.

    Args:
        df: Object whose ``'tokens'`` and ``'sentiment'`` entries are
            parallel iterables (one token list and one score per review).
        word_counts: Dict mapping word -> running total. It is updated
            in place; words not yet present start from 0.
        threshold: A review is counted only when its sentiment is strictly
            greater than this value.

    Returns:
        The same ``word_counts`` object that was passed in.
    """
    for tokens, sentiment in zip(df['tokens'], df['sentiment']):
        if sentiment > threshold:
            weight = abs(sentiment)
            for word in tokens:
                # .get avoids a KeyError for words missing from word_counts
                word_counts[word] = word_counts.get(word, 0) + weight

    return word_counts

def count_negative_word_sentiment(df, word_counts, threshold):
    """Add sentiment weight to each word of the reviews scoring below ``threshold``.

    Every token of a qualifying review adds ``abs(sentiment)`` to that word's
    running total, so negative reviews also contribute positive amounts.

    Args:
        df: Object whose ``'tokens'`` and ``'sentiment'`` entries are
            parallel iterables (one token list and one score per review).
        word_counts: Dict mapping word -> running total. It is updated
            in place; words not yet present start from 0.
        threshold: A review is counted only when its sentiment is strictly
            less than this value.

    Returns:
        The same ``word_counts`` object that was passed in.
    """
    for tokens, sentiment in zip(df['tokens'], df['sentiment']):
        if sentiment < threshold:
            magnitude = abs(sentiment)
            for word in tokens:
                # .get avoids a KeyError for words missing from word_counts
                word_counts[word] = word_counts.get(word, 0) + magnitude

    return word_counts

In [32]:
# Flatten the token lists of all reviews into one list (duplicates kept)
all_words = []
for tokens in combined_df['tokens']:
    try:
        all_words.extend(tokens)
    except TypeError:
        # Non-iterable entry (e.g. a NaN float): skip it on purpose. Only
        # TypeError is caught so real failures and KeyboardInterrupt still
        # surface instead of being silently swallowed.
        continue

# Vocabulary of distinct words, each starting with a zero total
all_words_set = set(all_words)
word_counts = dict.fromkeys(all_words_set, 0)

In [33]:
# Sum, per word, the absolute compound score of the strongly positive and the
# clearly negative reviews it appears in, then rank words by that total.
# (This is an accumulated weight, not a statistical correlation.)

# Reviews scoring above / below these compound-score cut-offs are counted
POSITIVE_THRESHOLD = 0.9
NEGATIVE_THRESHOLD = -0.5
TOP_N = 50

# NOTE(review): this frame is indexed by all_words (every token occurrence,
# duplicates included) and is not used by any of the visible cells — confirm
# whether it is still needed before removing or re-indexing it.
correlation_matrix = pd.DataFrame(index=all_words, columns=['Positive', 'Negative'])

# Start from zeroed totals: the counting helpers update the dict in place, so
# re-running this cell would otherwise add onto the previous run's totals.
word_counts = dict.fromkeys(word_counts, 0)
word_counts = count_positive_word_sentiment(combined_df, word_counts, POSITIVE_THRESHOLD)
word_counts = count_negative_word_sentiment(combined_df, word_counts, NEGATIVE_THRESHOLD)

ranked_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
sorted_dict = dict(ranked_words)

# Print the top 50 items
for key, value in ranked_words[:TOP_N]:
    print(f'{key}: {value}')

apartment: 261425.24149998635
host: 248231.78019998866
recommend: 171322.036099991
room: 144978.05369999688
home: 131046.1038999964
space: 119425.99969999732
restaurants: 109917.97099999957
walk: 92401.52059999944
area: 91972.14849999748
subway: 90658.99860000069
neighborhood: 80236.05429999971
house: 79906.57359999814
need: 75744.24319999943
bed: 74986.03359999866
around: 70543.6617999995
kitchen: 68773.24629999953
hosts: 67954.31029999875
away: 67950.63509999901
loved: 66966.19689999963
walking: 65617.61029999894
experience: 65038.15929999924
felt: 64816.08089999944
street: 61506.49759999972
could: 59341.888899999496
clean: 58976.251699999295
communication: 58298.457899997564
located: 57240.79559999915
toronto: 57146.3404999982
parking: 55111.700499999315
bathroom: 53352.78390000033
super: 53217.59779999766
always: 52935.85079999933
perfect: 52204.36859999925
night: 51238.09310000061
stayed: 51062.17369999963
distance: 50130.986099999755
kind: 49908.702900000484
family: 49692.3021999