Business Analysis

User Analysis

Review Analysis 

In [None]:

%pyspark

# 4. Extract Top 20 most common words from all reviews (stopwords removed)
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import explode, col, lower

# Tokenize review texts.
# Spark's Tokenizer lowercases the text and splits on single whitespace
# characters, so runs of spaces/newlines produce empty-string tokens.
tokenized = Tokenizer(inputCol="rev_text", outputCol="words").transform(review)

# Remove stopwords (StopWordsRemover's default word list is used)
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered = remover.transform(tokenized)

# Explode to one row per word, convert to lowercase, drop the empty tokens
# left behind by the whitespace split (otherwise "" is counted as a word),
# then group, count, order, and show top 20
filtered.select(explode(col("filtered_words")).alias("word")) \
    .select(lower(col("word")).alias("word")) \
    .filter(col("word") != "") \
    .groupBy("word").count() \
    .orderBy(col("count").desc()) \
    .show(20, truncate=False)


#by RIDA 

In [None]:
%pyspark

# 5. Extract Top 10 words from positive reviews (rating > 3), stopwords removed
positive_reviews = review.filter(col("rev_stars") > 3)

# Tokenize and remove stopwords.
# `remover` is the StopWordsRemover created in the "top 20 words" cell, so
# that cell must have been run first.
tokenized_pos = Tokenizer(inputCol="rev_text", outputCol="words").transform(positive_reviews)
filtered_pos = remover.transform(tokenized_pos)

# Explode, lowercase, drop empty tokens, count, and show top 10.
# The empty-string filter is needed because Tokenizer splits on single
# whitespace characters, so consecutive whitespace yields "" tokens that
# would otherwise be counted as a word.
filtered_pos.select(explode(col("filtered_words")).alias("word")) \
    .select(lower(col("word")).alias("word")) \
    .filter(col("word") != "") \
    .groupBy("word").count() \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)


#by RIDA 

In [None]:
%pyspark

# 6. Extract Top 10 words from negative reviews (rating ≤ 3), stopwords removed
negative_reviews = review.filter(col("rev_stars") <= 3)

# Tokenize and remove stopwords.
# `remover` is the StopWordsRemover created in the "top 20 words" cell, so
# that cell must have been run first.
tokenized_neg = Tokenizer(inputCol="rev_text", outputCol="words").transform(negative_reviews)
filtered_neg = remover.transform(tokenized_neg)

# Explode, lowercase, drop empty tokens, count, and show top 10.
# The empty-string filter is needed because Tokenizer splits on single
# whitespace characters, so consecutive whitespace yields "" tokens that
# would otherwise be counted as a word.
filtered_neg.select(explode(col("filtered_words")).alias("word")) \
    .select(lower(col("word")).alias("word")) \
    .filter(col("word") != "") \
    .groupBy("word").count() \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)


#by RIDA

Rating Analysis 

In [None]:
%pyspark 

from pyspark.sql.functions import split, explode, to_timestamp, year, hour, count, trim, col

# =============================================
# Check-in analysis, parts 1-3
# =============================================

# Shared preparation: `checkin_dates` is split on "," and exploded so that
# each row carries a single check-in entry in `checkin_date`.
checkin_exploded = checkin.withColumn(
    "checkin_date",
    explode(split(col("checkin_dates"), ",")),
)

# Parse the trimmed entry into a timestamp once; parts 1 and 2 both build on it.
checkin_with_ts = checkin_exploded.withColumn(
    "checkin_date_ts",
    to_timestamp(trim(col("checkin_date")), "yyyy-MM-dd HH:mm:ss"),
)

# --- 1. Number of check-ins per year (ascending by year, 10 rows shown) ---
checkins_per_year = (
    checkin_with_ts
    .withColumn("year", year(col("checkin_date_ts")))
    .groupBy("year")
    .agg(count("*").alias("checkin_count"))
    .orderBy("year")
)
checkins_per_year.show(10, False)

# --- 2. Number of check-ins per hour of the day (up to 24 rows shown) ---
checkins_per_hour = (
    checkin_with_ts
    .withColumn("hour", hour(col("checkin_date_ts")))
    .groupBy("hour")
    .agg(count("*").alias("checkin_count"))
    .orderBy("hour")
)
checkins_per_hour.show(24, False)

# --- 3. Most popular cities for check-ins ---
# Every exploded check-in row is matched to its business (inner join on
# business_id), then rows are counted per city and the 10 largest are shown.
same_business = checkin_exploded.business_id == business.business_id
checkins_per_city = (
    checkin_exploded
    .join(business, same_business, "inner")
    .groupBy("city")
    .agg(count("*").alias("total_checkins"))
    .orderBy(col("total_checkins").desc())
)
checkins_per_city.show(10, False)



#by Rida

Checkin Analysis

In [None]:
%pyspark
# ===========================================================
# Requirement V.4 - businesses ordered by their check-in counts
# ===========================================================

# One row per individual check-in: split the comma-separated string, then explode.
checkin_dates_array = split(col("checkin_dates"), ",")
checkin_exploded = checkin.withColumn("checkin_date", explode(checkin_dates_array))

# Total number of check-in rows for each business_id.
business_checkin_counts = (
    checkin_exploded
    .groupBy("business_id")
    .agg(count("*").alias("total_checkins"))
)

# Attach name and city; the inner join keeps only ids present on both sides.
business_names = business.select("business_id", "name", "city")
ranked_businesses = business_checkin_counts.join(business_names, "business_id", "inner")

# Largest totals first; print the first 20 rows without truncating values.
ranked_businesses.orderBy("total_checkins", ascending=False).show(20, False)


#by Rida

Comprehensive Analysis 

In [None]:
%pyspark

# -----------------------------------------------
# Comprehensive Analysis: Top 5 merchants per city 
# Based on review count, average rating, check-in count
# -----------------------------------------------

from pyspark.sql.functions import col, count, avg, split, explode, row_number
from pyspark.sql.window import Window

# --- Review count and average rating per business ---
review_stats = review.groupBy("rev_business_id").agg(
    count("*").alias("review_count"),
    avg("rev_stars").alias("average_rating")
)

# --- Check-in count per business ---
# Each comma-separated entry of `checkin_dates` becomes its own row before counting.
checkin_exploded = checkin.withColumn("checkin_date", explode(split(col("checkin_dates"), ",")))
checkin_stats = checkin_exploded.groupBy("business_id").agg(count("*").alias("checkin_count"))

# --- Join all stats with business info ---
# Left joins keep businesses that have no reviews and/or no check-ins;
# their missing statistics are replaced with 0 so they sort last.
biz_info = business.select("business_id", "name", "city")
stats = biz_info \
    .join(review_stats, biz_info.business_id == review_stats.rev_business_id, "left") \
    .join(checkin_stats, "business_id", "left") \
    .fillna(0, subset=["review_count", "average_rating", "checkin_count"])

# --- Rank top 5 merchants in each city ---
# Sort priority: review_count, then average_rating, then checkin_count (all
# descending). business_id is appended as a final tiebreaker: row_number()
# assigns an arbitrary order to rows that tie on every sort key, which would
# make the selected top 5 vary between runs.
window = Window.partitionBy("city").orderBy(
    col("review_count").desc(),
    col("average_rating").desc(),
    col("checkin_count").desc(),
    col("business_id").asc()
)
top5 = stats.withColumn("rank", row_number().over(window)).filter(col("rank") <= 5)

# --- Show final result ---
# Only the first 100 rows (ordered by city, then rank) are printed.
top5.select("city", "name", "review_count", "average_rating", "checkin_count", "rank") \
    .orderBy("city", "rank").show(100, False)


#by Rida 

In [None]:
%pyspark

# 2. Count useful, funny, cool reviews
# Spark functions are accessed through the `F` namespace so that `sum` cannot
# resolve to Python's builtin sum(), which fails on a column-name string.
from pyspark.sql import functions as F

# Total each vote column over all reviews.
# `truncate` must be passed by keyword: the first positional argument of
# DataFrame.show() is the row count `n`, so `.show(False)` does not disable
# truncation (and is rejected as a non-int `n` by recent PySpark versions).
review.agg(
    F.sum("rev_useful").alias("useful_votes"),
    F.sum("rev_funny").alias("funny_votes"),
    F.sum("rev_cool").alias("cool_votes"),
).show(truncate=False)

In [None]:
%pyspark

# 3. Rank users by total number of reviews each year
# Spark functions are accessed through the `F` namespace so this cell does not
# depend on `to_date`, `desc`, `year`, `count` or `col` having been imported
# by another cell.
from pyspark.sql import functions as F

# NOTE(review): the "yyyy-MM-dd" pattern assumes rev_date is a date-only
# string - confirm against the review schema.
review_year = F.year(F.to_date(F.col("rev_date"), "yyyy-MM-dd"))

# Count reviews per (year, user) pair and list the 100 largest counts.
review.withColumn("review_year", review_year) \
    .groupBy("review_year", "rev_user_id") \
    .agg(F.count("*").alias("review_count")) \
    .orderBy(F.desc("review_count")) \
    .show(100, False)