In [95]:
import pandas as pd

# Load the two raw CSV exports (movie metadata first, critic reviews second)
# from the local data directory.
movie_df, review_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/rotten_tomatoes_movies.csv",
        "../data/rotten_tomatoes_movie_reviews.csv",
    )
)

In [50]:
# A missing rating is treated as "G" and kept under the new name "age_class".
movie_df["age_class"] = movie_df["rating"].fillna("G")

# Replace gaps in the remaining descriptive columns with placeholder labels.
for sparse_column, placeholder in (
    ("ratingContents", "None"),
    ("distributor", "independent"),
):
    movie_df[sparse_column] = movie_df[sparse_column].fillna(placeholder)

# "rating" has been copied into "age_class"; it is removed together with
# "soundMix" and "boxOffice".
movie_df.drop(columns=["soundMix", "boxOffice", "rating"], inplace=True)

In [51]:
# A movie row is kept only when every one of these fields is present.
required_movie_columns = [
    "audienceScore",
    "tomatoMeter",
    "releaseDateTheaters",
    "runtimeMinutes",
    "genre",
    "director",
    "originalLanguage",
]
movie_df.dropna(subset=required_movie_columns, inplace=True)

In [52]:
# Parse the theatrical release date ("YYYY-MM-DD" strings) once and store the
# parsed values back in the same column.
release_dates = pd.to_datetime(movie_df["releaseDateTheaters"], format="%Y-%m-%d")
movie_df["releaseDateTheaters"] = release_dates

# Split the parsed date into separate calendar columns.
release_parts = release_dates.dt
movie_df["releaseDay"] = release_parts.day
movie_df["releaseMonth"] = release_parts.month
movie_df["releaseYear"] = release_parts.year
movie_df["releaseWeekday"] = release_parts.weekday

# Preview the first rows of the cleaned frame.
movie_df.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,distributor,age_class,releaseDay,releaseMonth,releaseYear,releaseWeekday
5,adrift_2018,Adrift,65.0,69.0,"['Injury Images', 'Brief Drug Use', 'Thematic ...",2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",STX Films,PG-13,1,6,2018,4
9,1035316-born_to_kill,Born to Kill,74.0,83.0,,1947-04-30,2016-05-23,92.0,"Crime, Drama",English,Robert Wise,"Eve Greene,Richard Macaulay",independent,G,30,4,1947,2
17,1221483-paa,Paa,67.0,50.0,,2009-12-04,,133.0,Drama,Hindi,R. Balki,R. Balki,Big Pictures,G,4,12,2009,4
20,sarah_palin_you_betcha,Sarah Palin: You Betcha!,61.0,32.0,,2011-09-30,2017-03-08,90.0,Documentary,English,"Nick Broomfield,Joan Churchill",,Freestyle Releasing,G,30,9,2011,4
31,a_state_of_mind_2005,A State of Mind,92.0,89.0,,2005-08-10,2007-06-11,93.0,Documentary,Korean,Daniel Gordon,,Kino Pictures,G,10,8,2005,2


In [96]:
# Copy the misspelled "publicatioName" column to a correctly spelled
# "publicationName" column.
review_df["publicationName"] = review_df["publicatioName"]
# NOTE(review): DataFrame.drop returns a new frame and the result is discarded
# here, so "publicatioName" is still present in review_df after this line.
# A later cell reads "publicatioName" again, so confirm that dependency before
# turning this into a real drop.
review_df.drop(columns="publicatioName")
# Keep only the reviews that carry an originalScore.
review_df.dropna(subset="originalScore", inplace=True)

# Parse creationDate and split it into separate calendar columns.
review_df["creationDate"] = pd.to_datetime(review_df["creationDate"])
review_df["creationDay"] = review_df["creationDate"].dt.day
review_df["creationMonth"] = review_df["creationDate"].dt.month
review_df["creationYear"] = review_df["creationDate"].dt.year
review_df["creationWeekday"] = review_df["creationDate"].dt.weekday

In [104]:
import re
import numpy as np

# The source column name is misspelled ("publicatioName"). Copy it to the
# correctly spelled name and really remove the original: DataFrame.drop returns
# a new frame unless inplace=True, so a bare `review_df.drop(...)` call leaves
# the misspelled column in place. The membership guard keeps this cell
# re-runnable once the column is gone.
if "publicatioName" in review_df.columns:
    review_df["publicationName"] = review_df["publicatioName"]
    review_df.drop(columns="publicatioName", inplace=True)

# Reviews without a score cannot be normalised further down.
review_df.dropna(subset="originalScore", inplace=True)

# Parse creationDate and split it into separate calendar columns.
review_df["creationDate"] = pd.to_datetime(review_df["creationDate"])
review_df["creationDay"] = review_df["creationDate"].dt.day
review_df["creationMonth"] = review_df["creationDate"].dt.month
review_df["creationYear"] = review_df["creationDate"].dt.year
review_df["creationWeekday"] = review_df["creationDate"].dt.weekday

# Letter grade -> fraction in [0, 1], used by fix_rating for scores that are
# given as grades rather than "x/y" fractions.
grade_dct = {
    "A+": 0.985,
    "A": 0.945,
    "A-": 0.91,
    "B+": 0.88,
    "B": 0.845,
    "B-": 0.81,
    "C+": 0.78,
    "C": 0.745,
    "C-": 0.71,
    "D+": 0.68,
    "D": 0.645,
    "D-": 0.61,
    "F": 0.295,
}


def fix_rating(x, grade_dct):
    """Convert one raw critic score into a numeric fraction.

    Parameters
    ----------
    x : object
        Raw score. Either a key of ``grade_dct`` (e.g. a letter grade) or a
        string of the form ``"<numerator>/<denominator>"``; double quotes,
        single quotes, asterisks and spaces inside either part are ignored.
    grade_dct : dict
        Lookup table from grade to its numeric value.

    Returns
    -------
    float
        ``grade_dct[x]`` when ``x`` is a key of the table, otherwise
        numerator / denominator taken from the first two "/"-separated parts.
        ``numpy.nan`` when ``x`` cannot be interpreted (not a string, no "/",
        non-numeric parts, or a zero denominator). The result is not clamped.
    """
    try:
        if x in grade_dct:
            return grade_dct[x]
        # Raw-string character class: same characters the old alternation
        # removed, without the invalid "\*" escape of a normal string literal.
        numerator, denominator = (
            float(re.sub(r"[\"'* ]", "", part)) for part in x.split("/")[:2]
        )
        return numerator / denominator
    except (AttributeError, IndexError, TypeError, ValueError, ZeroDivisionError):
        # Only expected parse failures are mapped to NaN; anything else (e.g.
        # KeyboardInterrupt) now propagates. np.nan replaces np.NaN, which no
        # longer exists in NumPy 2.
        return np.nan


# Turn every raw score into a fraction; values fix_rating cannot parse come
# back as NaN.
normalized_scores = review_df["originalScore"].apply(fix_rating, grade_dct=grade_dct)

# Clamp to the [0, 1] range (NaN is left untouched by clip), then discard the
# reviews whose score could not be parsed.
review_df["originalScore"] = normalized_scores.clip(lower=0, upper=1)
review_df.dropna(subset="originalScore", inplace=True)

In [None]:
from nltk.corpus import stopwords

# English stop words as a set for O(1) membership checks.
stop_wrds = set(stopwords.words("english"))

# The comprehension previously iterated over the built-in type `list`, which
# raises "TypeError: 'type' object is not iterable". Filter an explicit token
# list instead; swap in real review tokens as needed.
sample_tokens = "This is not the best movie of the year".split()
[w for w in sample_tokens if w.lower() not in stop_wrds]

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cluster connection and resource settings. Every SparkConf setter returns the
# conf object itself, so the calls are chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)
# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, connector_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, connector_class)

# Bucket that holds the two CSV files; replace with your own GCP bucket name.
gsc_file_path = "gs://data_de2023_2065718"

# Movie metadata, read with the first line as header.
# Use movie_df.printSchema() / movie_df.show() to inspect it.
movie_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load(gsc_file_path + "/rotten_tomatoes_movies.csv")
)

# Critic reviews, read the same way.
# Use review_df.printSchema() / review_df.show() to inspect it.
review_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load(gsc_file_path + "/rotten_tomatoes_movie_reviews.csv")
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

# Fill missing values: a null rating becomes "G" (stored as "age_class"),
# the two descriptive columns get placeholder labels.
movie_df = movie_df.withColumn("age_class", coalesce(col("rating"), lit("G")))
for text_column, placeholder in (
    ("ratingContents", "None"),
    ("distributor", "independent"),
):
    movie_df = movie_df.withColumn(
        text_column, coalesce(col(text_column), lit(placeholder))
    )

# Remove the columns that are no longer needed ("rating" lives on as
# "age_class").
columns_to_drop = ["soundMix", "boxOffice", "rating"]
movie_df = movie_df.drop(*columns_to_drop)

# Keep only the movies for which every one of these fields is present.
columns_to_check_for_null = [
    "audienceScore",
    "tomatoMeter",
    "releaseDateTheaters",
    "runtimeMinutes",
    "genre",
    "director",
    "originalLanguage",
]
movie_df = movie_df.dropna(subset=columns_to_check_for_null)

# Parse the release date string into a date column.
parsed_release_date = to_date(col("releaseDateTheaters"), "y-M-d")
movie_df = movie_df.withColumn("releaseDateTheaters", parsed_release_date)

# Split the parsed date into separate calendar columns.
release_date = col("releaseDateTheaters")
movie_df = movie_df.withColumn("releaseDay", day(release_date))
movie_df = movie_df.withColumn("releaseMonth", month(release_date))
movie_df = movie_df.withColumn("releaseYear", year(release_date))
movie_df = movie_df.withColumn("releaseWeekday", weekday(release_date))

# Display the cleaned movie DataFrame.
movie_df.show()

In [None]:
from pyspark.sql.types import FloatType, ArrayType
from nltk.corpus import stopwords
import nltk
import numpy as np

# Fix the misspelled column name and keep only reviews that carry a score.
review_df = review_df.withColumnRenamed("publicatioName", "publicationName").dropna(
    subset="originalScore"
)

# Parse the review creation date string into a date column.
review_df = review_df.withColumn(
    "creationDate", to_date(col("creationDate"), "y-M-d")
)

# Split the parsed date into separate calendar columns.
creation_date = col("creationDate")
review_df = review_df.withColumn("creationDay", day(creation_date))
review_df = review_df.withColumn("creationMonth", month(creation_date))
review_df = review_df.withColumn("creationYear", year(creation_date))
review_df = review_df.withColumn("creationWeekday", weekday(creation_date))

# Letter grade -> fraction lookup, shipped to the executors as a broadcast
# variable so the rating UDF can read it via grade_dct.value.
letter_grade_values = {
    "A+": 0.985,
    "A": 0.945,
    "A-": 0.91,
    "B+": 0.88,
    "B": 0.845,
    "B-": 0.81,
    "C+": 0.78,
    "C": 0.745,
    "C-": 0.71,
    "D+": 0.68,
    "D": 0.645,
    "D-": 0.61,
    "F": 0.295,
}
grade_dct = spark.sparkContext.broadcast(letter_grade_values)


# Define the UDF for fixing the rating
def fix_rating_udf(x):
    """Convert one raw critic score into a numeric fraction (Spark UDF body).

    Args:
        x: Raw score. Either a key of the broadcast ``grade_dct`` table or a
            string of "/"-separated numbers such as ``"3/4"``. Double quotes,
            single quotes and asterisks are removed from each part and
            surrounding whitespace is stripped before conversion.

    Returns:
        The table value when ``x`` is a key of ``grade_dct.value``; otherwise
        the first part divided by the second. ``None`` (a SQL null) when ``x``
        cannot be interpreted: null or non-string input, fewer than two
        parts, a non-numeric part, or a zero denominator.
    """
    grades = grade_dct.value
    try:
        if x in grades:
            return grades[x]
        parts = [
            float(part.replace('"', "").replace("'", "").replace("*", "").strip())
            for part in x.split("/")
        ]
        return parts[0] / parts[1]
    except (AttributeError, IndexError, TypeError, ValueError, ZeroDivisionError):
        # Only expected parse failures become null. Anything else (a missing
        # broadcast variable, an interrupt, ...) is no longer silently
        # swallowed, as it was by the bare `except:`.
        return None


# Wrap the Python function as a Spark UDF that returns a float column.
fix_rating_spark_udf = udf(fix_rating_udf, FloatType())

# Replace the raw 'originalScore' strings with their numeric fraction.
review_df = review_df.withColumn(
    "originalScore", fix_rating_spark_udf(col("originalScore"))
)

# Clamp the fraction into [0, 1] in a single pass. A null score matches
# neither condition and falls through otherwise() unchanged.
review_df = review_df.withColumn(
    "originalScore",
    when(col("originalScore") > 1, 1)
    .when(col("originalScore") < 0, 0)
    .otherwise(col("originalScore")),
)

# Scores the UDF could not parse are null at this point. Drop them, as the
# pandas version of this pipeline does, so they do not take part in any later
# ranking or aggregation.
review_df = review_df.dropna(subset="originalScore")

# Tokenize the review text on single spaces. Case is preserved here;
# lower-casing only happens when tokens are compared against the stop words.
review_df = review_df.withColumn("tokenized_review", split(col("reviewText"), " "))

# Load the English stopwords set
nltk.download("stopwords")
stop_wrds = set(stopwords.words("english"))


# Define a UDF to remove stopwords
def remove_stopwords_udf(tokens):
    """Return the tokens that are not stop words, in their original order.

    A token is discarded when its lower-cased form is a member of the
    ``stop_wrds`` set; the surviving tokens keep their original casing.
    """
    kept_tokens = []
    for token in tokens:
        if token.lower() in stop_wrds:
            continue
        kept_tokens.append(token)
    return kept_tokens


# StringType is not imported by any visible cell; it only resolved if the
# wildcard import of pyspark.sql.functions happened to leak it. Import it
# explicitly so the UDF registration does not depend on that.
from pyspark.sql.types import StringType

# Register the stop-word filter as a UDF returning an array of strings.
remove_stopwords_spark_udf = udf(remove_stopwords_udf, ArrayType(StringType()))

# Reviews without text have a null token array, which the UDF cannot iterate.
review_df = review_df.dropna(subset="tokenized_review")
review_df = review_df.withColumn(
    "filtered_review", remove_stopwords_spark_udf(col("tokenized_review"))
)

# Word counts per "id": one row per remaining word, then count occurrences.
# The result is kept in its own DataFrame so that review_df stays one row per
# review and keeps columns such as originalScore and criticName, which the
# following cell reads.
word_counts_df = (
    review_df.withColumn("word", explode("filtered_review"))
    .groupBy("id", "word")
    .count()
)

# Show the resulting word counts.
word_counts_df.orderBy(col("id")).show()

In [None]:
from pyspark.sql.functions import *
from pyspark.sql import Row, Window

# Windows over all reviews that share an "id", ordered by score.
windowdesc = Window.partitionBy(col("id")).orderBy(col("originalScore").desc())
# Ascending counterpart; defined but not used in this cell.
windowasc = Window.partitionBy(col("id")).orderBy(col("originalScore").asc())

# rank_desc is the percent rank of a review within its "id" group under the
# descending order: 0.0 for the highest score, 1.0 for the lowest.
review_merged_windowed = review_df.withColumn(
    "rank_desc", percent_rank().over(windowdesc)
)

# TODO: a per-"id" min-max normalisation of the score was sketched here but
# never enabled.

# Critics with the harshest reviews. Because rank_desc grows as the score gets
# lower, the harshest critics have the HIGHEST average rank_desc, so sort
# descending (ascending listed the most generous critics first).
review_merged_windowed.groupBy("criticName").avg("rank_desc").orderBy(
    col("avg(rank_desc)").desc()
).show()