In [1]:
import os

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

# Service-account key used to authenticate against Google Cloud Storage.
# The standard GOOGLE_APPLICATION_CREDENTIALS environment variable takes
# precedence so the notebook is not tied to one machine's directory layout;
# the original container path remains the default.
credentials_location = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/jovyan/credentials/google_credentials.json',
)

# Spark configuration: local master on all cores, the GCS and BigQuery
# connector jars on the classpath, and service-account auth for Hadoop.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", "/home/jovyan/lib/gcs-connector-hadoop3-latest.jar,/home/jovyan/lib/spark-3.1-bigquery-0.27.1-preview.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [2]:
# Only one SparkContext may be active at a time, so constructing it directly
# raises when this cell is re-run in the same kernel. getOrCreate returns the
# already-running context if there is one and builds it from `conf` otherwise.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop-level settings are applied on every run (they are plain key/value
# sets, so repeating them is harmless) to register the `gs://` filesystem
# implementations and point them at the service-account key.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [3]:
# Create (or reuse) the SparkSession, seeded with the configuration of the
# SparkContext that is already running.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [4]:
# Load the raw titles dataset from the data-lake bucket.
df_titles = (
    spark.read
    .parquet('gs://dtc_data_lake_zoomcamp-final/raw/dataset/titles.parquet')
)

In [5]:
# Load the raw credits dataset from the data-lake bucket.
df_credits = (
    spark.read
    .parquet('gs://dtc_data_lake_zoomcamp-final/raw/dataset/credits.parquet')
)

In [6]:
# Bare expression as the last line of the cell: displays the DataFrame's
# repr, i.e. the column names and types of the titles data.
df_titles

DataFrame[id: string, title: string, type: string, description: string, release_year: bigint, age_certification: string, runtime: bigint, genres: string, production_countries: string, seasons: double, imdb_id: string, imdb_score: double, imdb_votes: double, tmdb_popularity: double, tmdb_score: double]

In [7]:
# Bare expression as the last line of the cell: displays the DataFrame's
# repr, i.e. the column names and types of the credits data.
df_credits

DataFrame[person_id: bigint, id: string, name: string, character: string, role: string]

In [8]:
# Preview the first five title rows; the result is a list of Row objects.
df_titles.head(5)

[Row(id='ts300399', title='Five Came Back: The Reference Films', type='SHOW', description='This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discussed in the docuseries "Five Came Back."', release_year=1945, age_certification='TV-MA', runtime=51, genres="['documentation']", production_countries="['US']", seasons=1.0, imdb_id='', imdb_score=None, imdb_votes=None, tmdb_popularity=0.6, tmdb_score=None),
 Row(id='tm84618', title='Taxi Driver', type='MOVIE', description='A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.', release_year=1976, age_certification='R', runtime=114, genres="['drama', 'crime']", production_countries="['US']", seasons=None, imdb_id='tt0075314', imdb_score=8.2, imdb_votes=808582.0, tmdb_popularity=40.965, tmdb_score=8.179),
 Row(id='tm154986', title='Deliverance', type='MOVIE', description="Intent 

In [9]:
# Preview the first ten credit rows; the result is a list of Row objects.
df_credits.head(10)

[Row(person_id=3748, id='tm84618', name='Robert De Niro', character='Travis Bickle', role='ACTOR'),
 Row(person_id=14658, id='tm84618', name='Jodie Foster', character='Iris Steensma', role='ACTOR'),
 Row(person_id=7064, id='tm84618', name='Albert Brooks', character='Tom', role='ACTOR'),
 Row(person_id=3739, id='tm84618', name='Harvey Keitel', character="Matthew 'Sport' Higgins", role='ACTOR'),
 Row(person_id=48933, id='tm84618', name='Cybill Shepherd', character='Betsy', role='ACTOR'),
 Row(person_id=32267, id='tm84618', name='Peter Boyle', character='Wizard', role='ACTOR'),
 Row(person_id=519612, id='tm84618', name='Leonard Harris', character='Senator Charles Palantine', role='ACTOR'),
 Row(person_id=29068, id='tm84618', name='Diahnne Abbott', character='Concession Girl', role='ACTOR'),
 Row(person_id=519613, id='tm84618', name='Gino Ardito', character='Policeman at Rally', role='ACTOR'),
 Row(person_id=3308, id='tm84618', name='Martin Scorsese', character='Passenger Watching Silhouet

In [67]:
# Fact table: every credit row paired with its title, matched on the shared
# `id` column (inner join, so titles without credits and credits without a
# matching title are dropped).
#
# The join result is already a distributed DataFrame. It is kept as-is rather
# than being collect()-ed to the driver and rebuilt with createDataFrame,
# which would pull every joined row into driver memory and force Spark to
# re-infer the schema from Python Row objects.
facts_table = df_titles.join(df_credits, ["id"], "inner")

In [57]:
# GCS location passed to the session as `temporaryGcsBucket` in a later cell.
# NOTE(review): the value carries a `/temp/` path suffix rather than a bare
# bucket name — confirm the BigQuery connector accepts this form.
bucket = "dtc_data_lake_zoomcamp-final/temp/"

In [58]:
# Session-level setting for the BigQuery writes below; the connector
# presumably stages data in this GCS location before loading it into
# BigQuery — confirm against the connector documentation.
spark.conf.set("temporaryGcsBucket", bucket)

In [29]:
# Publish the credits dimension to BigQuery as netflix_dataset.dim_credits.
# No save mode is given, so Spark's default applies.
credits_writer = df_credits.write.format("bigquery")
credits_writer.option("table", "netflix_dataset.dim_credits").save()

In [30]:
# Publish the titles dimension to BigQuery as netflix_dataset.dim_titles.
# No save mode is given, so Spark's default applies.
titles_writer = df_titles.write.format("bigquery")
titles_writer.option("table", "netflix_dataset.dim_titles").save()

In [68]:
# Publish the joined fact table to BigQuery as netflix_dataset.facts_table.
# No save mode is given, so Spark's default applies.
facts_writer = facts_table.write.format("bigquery")
facts_writer.option("table", "netflix_dataset.facts_table").save()