### Adding extra movie data using https://www.omdbapi.com/

In [2]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("MyApp").setMaster("local[*]")\
                  .set("spark.driver.memory", "2g")
sc = SparkContext(conf=conf)


In [8]:
# Import PySpark
from pyspark.sql import SparkSession

#Create SparkSession
spark = SparkSession.builder.appName('MyApp').getOrCreate()

In [9]:
spark

In [19]:
df_spark = spark.read.csv("../datasets/imdb/train_movies_extra_data.csv", inferSchema=True, header=True)

In [24]:
df_spark.toPandas()

Unnamed: 0,_c0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,genre,plot,actors,language,country,awards,boxoffice,rating,entry_type,production
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True,"Comedy, Fantasy, Sci-Fi",Because the Baron of Chanterelle wants to pres...,"Ossi Oswalda, Hermann Thimig, Victor Janson","English, German",Germany,,,7.4,movie,
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True,"Drama, Romance","The callous rich, portrayed by Lennox, think o...","Lillian Gish, Richard Barthelmess, Mrs. David ...",English,United States,,,7.4,movie,United Artists
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True,"Drama, Fantasy, Horror",As a young couple stops and rests in a small v...,"Bernhard Goetzke, Lil Dagover, Walter Janssen",German,Germany,2 nominations,"$12,156",7.6,movie,
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True,"Action, Comedy, Romance","""Rollo decides to marry his sweetheart Betsy a...",Betsy boards the ship to look for her father....,"Buster Keaton, Kathryn McGuire, Frederick Vroom",English,United States,1 win & 1 nomination,,7.6,movie
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True,Horror,"At the Opera of Paris, a mysterious phantom th...","Lon Chaney, Mary Philbin, Norman Kerry",,United States,2 wins & 1 nomination,"$3,751,476",7.5,movie,Hammer Films
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,\N,87,12951.0,False,"Action, Thriller",Alone and trapped in a locked-down hospital is...,"Nicky Whelan, Bruce Willis, Tito Ortiz","English, Spanish",United States,,,4.0,movie,MoviePass Films
7955,9981,tt9741310,Slaxx,Slaxx,2020,\N,77,2464.0,False,"Comedy, Horror",When a possessed pair of jeans begins to kill ...,"Romane Denis, Brett Donahue, Sehar Bhojani",English,Canada,1 win & 1 nomination,,5.4,movie,
7956,9982,tt9742392,Kindred,Kindred,2020,\N,101,1719.0,False,"Drama, Horror, Mystery",When her boyfriend dies suddenly in an acciden...,"Tamara Lawrance, Edward Holcroft, Jack Lowden",English,"United Kingdom, United Arab Emirates, Germany,...",4 nominations,"$8,921",5.0,movie,IFC Midnight
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,\N,111,4144.0,True,"Documentary, Biography, Music","An exploration of the history of the Bee Gees,...","Barry Gibb, Maurice Gibb, Robin Gibb",English,United States,Won 1 Primetime Emmy. 4 wins & 8 nominations t...,,8.1,movie,


In [12]:
df_spark.cache()
df_spark.count()

7959

In [15]:
import requests
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the UDF function that takes a value from a DataFrame column and returns data from an API
def api_udf(value):

    genre = None
    plot = None
    actors = None
    language = None
    country = None
    awards = None
    boxoffice = None
    rating = None
    entry_type = None
    production = None
    try:
        # Make an API call with the value
        response = requests.get(f"https://www.omdbapi.com/?plot=full&i={value}&apikey=43c7c72f")

        # Extract the data from the API response
        data = response.json()
        genre = data["Genre"]
        plot = data["Plot"]
        actors = data["Actors"]
        language = data["Language"]
        country = data["Country"]
        awards = data["Awards"]
        boxoffice = data["BoxOffice"]
        rating = data["imdbRating"]
        entry_type = data["Type"]
        production = data["Production"]
    except:
        print(f"error arose for movie with id {value}")

    # Return a tuple with the new column values
    return genre, plot, actors, language,country, awards, boxoffice, rating, entry_type, production

# Define the schema for the new columns
new_columns_schema = StructType([
    StructField("genre", StringType(), True),
    StructField("plot", StringType(), True),
    StructField("actors", StringType(), True),
    StructField("language", StringType(), True),
    StructField("country", StringType(), True),
    StructField("awards", StringType(), True),
    StructField("boxoffice", StringType(), True),
    StructField("rating", StringType(), True),
    StructField("entry_type", StringType(), True),
    StructField("production", StringType(), True),
])

# Create the UDF
api_udf = udf(api_udf, new_columns_schema)

# Apply the UDF on a DataFrame column to create new columns with data from the API
df_spark = df_spark.withColumn("new_columns", api_udf(df_spark["tconst"]))

# Extract the new columns from the struct column and add them to the DataFrame
df_spark = df_spark.withColumn("genre", df_spark["new_columns"]["genre"])
df_spark = df_spark.withColumn("plot", df_spark["new_columns"]["plot"])
df_spark = df_spark.withColumn("actors", df_spark["new_columns"]["actors"])
df_spark = df_spark.withColumn("language", df_spark["new_columns"]["language"])
df_spark = df_spark.withColumn("country", df_spark["new_columns"]["country"])
df_spark = df_spark.withColumn("awards", df_spark["new_columns"]["awards"])
df_spark = df_spark.withColumn("boxoffice", df_spark["new_columns"]["boxoffice"])
df_spark = df_spark.withColumn("rating", df_spark["new_columns"]["rating"])
df_spark = df_spark.withColumn("entry_type", df_spark["new_columns"]["entry_type"])
df_spark = df_spark.withColumn("production", df_spark["new_columns"]["production"])

# Drop the struct column
df_spark = df_spark.drop("new_columns")


In [None]:
df_spark.cache()
df_spark.show()


In [None]:
df_spark.toPandas().to_csv("train_movies_extra_data.csv", index=False)