In [1]:
import os
import pyspark 
from pyspark.sql import SparkSession
import seaborn as sb
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from utils import preprocces_text



In [2]:
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Path to the GCP service-account key; passed to the Spark GCS connector config below.
# NOTE(review): this is None when the variable is unset — confirm the .env defines it.
key_filepath = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# os.environ only accepts str values, so assigning the result of os.getenv()
# directly raises TypeError when PYSPARK_PYTHON is not defined. Only set it
# when a value is actually available.
pyspark_python = os.getenv("PYSPARK_PYTHON")
if pyspark_python is not None:
    os.environ['PYSPARK_PYTHON'] = pyspark_python


In [3]:
# Options applied to the local Spark session: generous network/heartbeat
# timeouts plus the Hadoop settings for the GCS filesystem and its
# service-account key file.
spark_options = {
    "spark.network.timeout": "1800s",
    "spark.executor.heartbeatInterval": "600s",
    "spark.shuffle.io.connectionTimeout": "3600s",
    "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "spark.hadoop.google.cloud.auth.service.account.json.keyfile": key_filepath,
    "spark.hadoop.google.cloud.auth.service.account.enable": "true",
}

session_builder = SparkSession.builder.master("local[*]").appName("Airbnb Listings")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()


    

In [4]:

# Location of the listings dataset in Google Cloud Storage.
LISTINGS_PARQUET_PATH = "gs://airbnb-listings-421017-bucket/airbnb_listings.parquet"

# Load the listings into a Spark DataFrame.
df = spark.read.parquet(LISTINGS_PARQUET_PATH)


In [5]:
# Drop placeholder summaries once and reuse the result for both rankings.
# (Comparing with != also removes rows whose summary is null.)
listings_with_summary = df.filter(col("summary") != "N/A")

# Sort while `price` is still part of the frame, then project down to the
# summary text; sorting after select("summary") only works because Spark
# quietly re-adds the missing column.
# NOTE(review): assumes `price` is numeric — a string column would sort
# lexicographically; confirm against the parquet schema.
top_100_summary = (
    listings_with_summary
    .sort(col("price").desc())
    .select("summary")
    .limit(100)
    .collect()
)

# asc() places null prices first, which would fill the "cheapest" list with
# unpriced listings; asc_nulls_last() mirrors desc(), which already puts
# nulls last.
least_100_summary = (
    listings_with_summary
    .sort(col("price").asc_nulls_last())
    .select("summary")
    .limit(100)
    .collect()
)

In [6]:
# Preprocess every summary of the highest-priced listings, then flatten the
# per-summary pieces into one space-separated string for the word cloud.
top_summary_pieces = (preprocces_text(listing.summary) for listing in top_100_summary)
summary_preprocessed = ' '.join(' '.join(pieces) for pieces in top_summary_pieces)

In [None]:
from wordcloud import WordCloud

# Build the word cloud from the text of the highest-priced listings' summaries.
cloud_generator = WordCloud(width=500, height=400, background_color="white")
wordcloud = cloud_generator.generate(summary_preprocessed)

# Draw it on a large figure with the axes hidden.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

In [8]:
# Preprocess every summary of the lowest-priced listings, then flatten the
# per-summary pieces into one space-separated string for the word cloud.
least_summary_pieces = (preprocces_text(listing.summary) for listing in least_100_summary)
least100_summary_preprocessed = ' '.join(' '.join(pieces) for pieces in least_summary_pieces)

In [9]:
# Print the joined, preprocessed text of the lowest-priced summaries for a quick visual check.
print(least100_summary_preprocessed)

enjoy comfy pull-out bed capitol hill neighborhood block away seattle trendiest restaurant bar nightlife -5 minute walk heart capitol hill -15 – minute walk minute drive downtown seattle low budget traveler need place sleep shower toilet found listing providing two person tent sleeping pad sleeping bag also three person tent available min night stay single bed conner laundry room clean quiet easiest commute town seatac airport place take min via link light rail place downtown seattle min ride 3min foot station airbnb location ... walking distance downtown international district cap hill pike place market restaurant bar club seattle u. within busing distance uw space needle listing bottom bunk minute walk broadway heart capitol hill downtown seattle despite living bar noise level night small bus ride away u-district airbnb location ... walking distance downtown international district cap hill pike place market restaurant bar club seattle u. within busing distance uw space needle listing

In [None]:
from wordcloud import WordCloud

# Build the word cloud from the text of the lowest-priced listings' summaries.
cloud_generator = WordCloud(width=500, height=400, background_color="white")
wordcloud = cloud_generator.generate(least100_summary_preprocessed)

# Draw it on a large figure with the axes hidden.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

#### Analyzing whether review sentiment is related to price


In [11]:
# Load the reviews CSV (first row is the header, column types are inferred),
# convert `date` to a proper date type, and discard every row that contains
# a null in any column.
reviews_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("reviews.csv")
    .withColumn("date", col("date").cast("date"))
    .dropna()
)

In [12]:
from langdetect import DetectorFactory, detect
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Returns None when `text` is None or when detection fails (for example
    on text langdetect cannot classify), so a single bad row never aborts
    the Spark job.
    """
    if text is None:
        # str(None) would be classified as the literal word "None".
        return None
    # langdetect is non-deterministic unless seeded. The seed is set here,
    # inside the function, so it also takes effect in the Spark Python
    # workers that execute the UDF, not only in the driver process.
    DetectorFactory.seed = 0
    try:
        return detect(str(text))
    except Exception:
        # Deliberate best-effort: undetectable text yields no language.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return None

# Define a UDF to apply language detection to each comment in the DataFrame
detect_language_udf = udf(detect_language, StringType())

reviews_df = reviews_df.withColumn("lang", detect_language_udf(reviews_df["comments"]))


In [13]:
# Keep only the reviews whose detected language is English.
is_english = col("lang") == "en"
reviews_df = reviews_df.where(is_english)

In [14]:
reviews_rdd = reviews_df.rdd
# Build one tuple per review. The language comes from the existing `lang`
# column rather than a second detect_language() call: re-detecting doubles
# the most expensive step and could disagree with the value the English
# filter was based on.
reviews_rdd = reviews_rdd.map(lambda row: (row.id, row.listing_id, row.date, row.reviewer_id, row.reviewer_name, row.comments, row.lang))

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.sql.types import FloatType, StructType, StructField


# NOTE(review): SentimentIntensityAnalyzer presumably needs the NLTK
# `vader_lexicon` resource to be installed already — confirm for fresh
# environments.
sentiment_intensity = SentimentIntensityAnalyzer()

# Schema of the struct returned by the UDF: one nullable float per VADER score.
sentiment_schema = StructType([
    StructField("neg", FloatType(), True),
    StructField("neu", FloatType(), True),
    StructField("pos", FloatType(), True),
    StructField("compound", FloatType(), True)
])


def score_sentiment(text):
    """Return the VADER polarity scores for `text`, or None when `text` is None."""
    if text is None:
        return None
    return sentiment_intensity.polarity_scores(text)


sentiment_intensity_udf = udf(score_sentiment, sentiment_schema)

reviews_df = reviews_df.withColumn("sentiment_scores", sentiment_intensity_udf(reviews_df["comments"]))

# Promote each score to its own top-level column. The struct fields are
# already FloatType, so no extra cast is needed.
for score_name in ("pos", "neg", "neu", "compound"):
    reviews_df = reviews_df.withColumn(score_name, col("sentiment_scores")[score_name])


# Collect the scored reviews to the driver as a pandas DataFrame.
# NOTE(review): this materialises every row in driver memory — confirm the
# data set is small enough.
reviews_pd = reviews_df.toPandas()
# Original name kept as an alias in case later cells refer to it.
lol = reviews_pd

