### Adding extra movie data using https://www.omdbapi.com/

In [2]:
from pyspark.sql import SparkSession
import requests
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType

# Start a Spark session named "big_data", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("big_data")
    .getOrCreate()
)

# OMDb API key sent with every request; substitute a real key for this placeholder.
odb_api_key = "Replace with your own key"

In [9]:
# Evaluate the session object so the notebook shows it as this cell's output.
spark

### Read train, validation and test data into Spark DataFrames

In [None]:
# Train df contains all 8 different CSVs concatenated.
# All three files are read the same way: the first row is the header, column
# types are inferred, and '"' is set as the character that escapes quotes
# inside quoted fields.
train_df_spark = spark.read.option("escape", "\"").csv(
    "/kaggle/input/imdb-big-data/complete_train.csv", inferSchema=True, header=True
)
# Options belong to the DataFrameReader returned by `spark.read`; SparkSession
# itself has no `option` method, so `.read` must come before `.option(...)`.
validation_df_spark = spark.read.option("escape", "\"").csv(
    "/kaggle/input/imdb-big-data/validation_hidden.csv", inferSchema=True, header=True
)
test_df_spark = spark.read.option("escape", "\"").csv(
    "/kaggle/input/imdb-big-data/test_hidden.csv", inferSchema=True, header=True
)

In [None]:
def add_omdb_api_data(spark_df):
    """Return ``spark_df`` with extra movie columns fetched from the OMDb API.

    For every row, the value of the ``tconst`` column is sent to
    https://www.omdbapi.com/ as the ``i`` query parameter (presumably the IMDb
    title id -- verify against the input data). Twelve nullable string columns
    are added from the JSON response: ``genre``, ``plot``, ``actors``,
    ``language``, ``country``, ``awards``, ``boxoffice``, ``rating``,
    ``entry_type``, ``production``, ``imdb_votes`` and ``imdb_year``.

    A lookup that fails for any reason (request error, timeout, non-JSON body,
    a key missing from the response) is reported with ``print`` and yields
    null in all twelve columns for that row; it does not abort the job.

    Args:
        spark_df: Spark DataFrame that has a ``tconst`` column.

    Returns:
        A new DataFrame with the twelve columns added. An existing column
        with the same name as one of them is replaced.
    """
    # (output column name, key in the OMDb JSON response), in output order.
    column_to_omdb_key = (
        ("genre", "Genre"),
        ("plot", "Plot"),
        ("actors", "Actors"),
        ("language", "Language"),
        ("country", "Country"),
        ("awards", "Awards"),
        ("boxoffice", "BoxOffice"),
        ("rating", "imdbRating"),
        ("entry_type", "Type"),
        ("production", "Production"),
        ("imdb_votes", "imdbVotes"),
        ("imdb_year", "Year"),
    )
    new_columns = [column for column, _ in column_to_omdb_key]
    omdb_keys = [key for _, key in column_to_omdb_key]

    def fetch_omdb_fields(value):
        """Look up one id on OMDb and return its fields as a 12-tuple."""
        try:
            # The timeout keeps a stalled connection from blocking the Spark
            # task forever; a timeout is handled like any other failure below.
            response = requests.get(
                f"https://www.omdbapi.com/?plot=full&i={value}&apikey={odb_api_key}",
                timeout=30,
            )

            # Extract the data from the API response; a missing key raises
            # KeyError and falls through to the all-null result.
            data = response.json()
            return tuple(data[key] for key in omdb_keys)

        except Exception as e:
            # Deliberately broad: one bad lookup must not fail the whole job.
            print(f"Error arose for movie with id {value}: ", e)
            # Put Nulls in all columns
            return (None,) * len(new_columns)

    # Schema of the struct returned by the UDF: one nullable string per column.
    new_columns_schema = StructType(
        [StructField(column, StringType(), True) for column in new_columns]
    )

    # Mark the UDF non-deterministic: Spark may otherwise evaluate a
    # deterministic UDF more than once, e.g. once per struct field extracted
    # below, which would multiply the number of API calls.
    api_udf = udf(fetch_omdb_fields, new_columns_schema).asNondeterministic()

    # Call the API once per row and keep the result in a temporary struct column.
    df_with_struct = spark_df.withColumn("new_columns", api_udf(spark_df["tconst"]))

    # Promote each struct field to a top-level column. The loop must keep
    # working on the DataFrame that actually contains "new_columns".
    for column in new_columns:
        df_with_struct = df_with_struct.withColumn(
            column, df_with_struct["new_columns"].getField(column)
        )

    # Drop the temporary struct column.
    return df_with_struct.drop("new_columns")

In [None]:
# Apply function to add extra data on all 3 datasets
train_df_spark = add_omdb_api_data(train_df_spark)
validation_df_spark = add_omdb_api_data(validation_df_spark)
test_df_spark = add_omdb_api_data(test_df_spark)

In [None]:
# Convert each enriched Spark DataFrame to pandas and write it out as CSV
# without the index column.
for enriched_df, csv_name in (
    (train_df_spark, "train_movies_extra_data_new.csv"),
    (validation_df_spark, "validation_extra_data.csv"),
    (test_df_spark, "test_extra_data.csv"),
):
    enriched_df.toPandas().to_csv(csv_name, index=False)