In [1]:
import pandas as pd
import os

# Read CSV file
file_path1 = 'Reviews_London.csv'  # Replace with your file path
raw_df_reviews = pd.read_csv(file_path1)

In [2]:
import re
import string

# Step 1: Drop unnecessary columns
columns_to_drop = ['id', 'date','reviewer_id','reviewer_name']
df = raw_df_reviews.drop(columns=columns_to_drop)

df['comments'] = df['comments'].astype(str)

In [None]:
# Step 2: Define a function to clean the reviews
def clean_reviews(review):
    # Remove numbers
    review = re.sub(r'\d+', '', review)
    
    # Remove punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))
    
    # Remove currency symbols
    review = re.sub(r'[\$\£\€\¥]', '', review)
    
    # Remove emojis (using unicode ranges for emojis)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    review = emoji_pattern.sub(r'', review)
    
    # Strip leading and trailing whitespaces
    review = review.strip()
    
    return review

# Step 3: Apply the cleaning function to the 'reviews' column
df['comments'] = df['comments'].apply(clean_reviews)

# Step 4: Drop rows with empty reviews after cleaning
df = df[df['comments'] != '']

In [4]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [5]:
from textblob import TextBlob
import pandas as pd
import re
import string

# Step 1: Define a function to analyze sentiment using TextBlob
def analyze_sentiment(review):
    if review.strip() == '':  # Handle empty reviews
        return None
    
    blob = TextBlob(review)
    
    # Sentiment polarity ranges from -1 (negative) to 1 (positive)
    return blob.sentiment.polarity

# Step 2: Apply the sentiment analysis function to the 'reviews' column
df['sentiment'] = df['comments'].apply(analyze_sentiment)

# Step 3: Drop rows where reviews are empty or sentiment is None
df = df.dropna(subset=['sentiment'])

In [6]:
import pandas as pd
from textblob import TextBlob
import numpy as np

# Stap 1: Groepeer per 'listing_id' en bereken het gemiddelde sentiment en aantal reviews
aggregated_data = df.groupby('listing_id').agg(
    avg_sentiment=('sentiment', 'mean'),  # Gemiddelde sentiment per listing
    num_reviews=('comments', 'count')      # Aantal reviews per listing
).reset_index()

# Stap 2: Pas logaritmische normalisatie toe op het aantal reviews
aggregated_data['log_normalized_reviews'] = np.log1p(aggregated_data['num_reviews'])

# Stap 3: Vermenigvuldig de log-genormaliseerde reviews met het gemiddelde sentiment
aggregated_data['weighted_sentiment'] = aggregated_data['avg_sentiment'] * aggregated_data['log_normalized_reviews']


In [5]:
aggregated_data.head()

NameError: name 'aggregated_data' is not defined

In [7]:
weighted_sentiment = aggregated_data[['listing_id','weighted_sentiment']]

In [8]:
weighted_sentiment = weighted_sentiment.rename(columns={'listing_id': 'id'})

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1778996 entries, 0 to 1783640
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   listing_id  int64  
 1   comments    object 
 2   sentiment   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 54.3+ MB


In [9]:
weighted_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68704 entries, 0 to 68703
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  68704 non-null  int64  
 1   weighted_sentiment  68704 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.0 MB


In [10]:
# Exporteren naar CSV-bestand
weighted_sentiment.to_csv('weighted_sentiment.csv', index=False)

# Controleer of het bestand correct is opgeslagen
print("Weighted sentiment scores zijn opgeslagen in weighted_sentiment.csv")

Weighted sentiment scores zijn opgeslagen in weighted_sentiment.csv
