In [None]:
# imports

import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
        col, from_unixtime
)
from pyspark.sql.types import StringType, IntegerType, FloatType ,StructField, StructType, TimestampType

In [None]:
# Spark session

# Single source of truth for the PostgreSQL JDBC driver jar. The same file is
# registered under spark.jars and placed on both the driver and executor
# classpaths, so keeping the path in one place prevents the three settings
# from drifting apart when the driver version is bumped.
POSTGRES_JDBC_JAR = "jars/postgresql-42.7.7.jar"

spark = (
    SparkSession.builder
        .appName("load_postgres")
        .config("spark.jars", POSTGRES_JDBC_JAR)
        .config("spark.driver.extraClassPath", POSTGRES_JDBC_JAR)
        .config("spark.executor.extraClassPath", POSTGRES_JDBC_JAR)
        .getOrCreate()
)

In [None]:
# Bare expression: renders the SparkSession object as this cell's output.
spark

In [None]:
# JDBC connection settings
#
# The URL and credentials are read from environment variables so that real
# secrets never have to be committed with the notebook. The fallbacks are the
# previous hard-coded local-development values, so the notebook behaves as
# before when none of the variables is set.
#   POSTGRES_URL      - full JDBC URL of the target database
#   POSTGRES_USER     - database user
#   POSTGRES_PASSWORD - password for that user

url = os.environ.get("POSTGRES_URL", "jdbc:postgresql://localhost:5432/mydb")

properties = {
    "user": os.environ.get("POSTGRES_USER", "root"),
    "password": os.environ.get("POSTGRES_PASSWORD", "root"),
    "driver": "org.postgresql.Driver"
}

In [None]:
def read_csv(header_option=True, format='csv', path=''):
    """Load the data at ``path`` into a Spark DataFrame.

    Uses the notebook-level ``spark`` session.

    Args:
        header_option: Value passed to the reader's ``header`` option.
        format: Data source format name handed to the reader (shadows the
            builtin ``format`` inside this function only).
        path: Location passed to ``load``.

    Returns:
        The DataFrame returned by the reader's ``load`` call.
    """
    reader = spark.read.format(format)
    reader = reader.option("header", header_option)
    return reader.load(path)

In [None]:
def write_to_db(df, table_name, mode='overwrite', url=url, properties=properties):
    """Write ``df`` to a database table over JDBC.

    Args:
        df: DataFrame to persist.
        table_name: Name of the destination table.
        mode: Save mode forwarded to the JDBC writer.
        url: JDBC connection URL; defaults to the notebook-level ``url`` as it
            was when this function was defined.
        properties: JDBC connection properties; defaults to the notebook-level
            ``properties`` as it was when this function was defined.
    """
    df.write.jdbc(
        url=url,
        table=table_name,
        mode=mode,
        properties=properties,
    )

In [None]:
# Load the movies CSV through the shared read_csv helper (header=True, format='csv'
# are its defaults) instead of repeating the reader chain by hand.
movies_df = read_csv(path="data/MoviesData/movies.csv")

In [None]:
# Inspect the loaded schema. The reader above sets only the header option
# (no explicit schema / inferSchema), so the columns are expected to be strings.
movies_df.printSchema()

In [None]:
# Keep the three movie columns, converting only movieId to an integer.
movie_casted = movies_df.select(
    col('movieId').cast(IntegerType()),
    'title',
    'genres',
)

In [None]:
# Persist the typed movies frame to the 'movies' table (helper defaults: overwrite mode).
write_to_db(movie_casted, 'movies')

In [None]:
# Load the ratings CSV through the shared read_csv helper (header=True, format='csv'
# are its defaults) instead of repeating the reader chain by hand.
ratings_df = read_csv(path="data/MoviesData/ratings.csv")

In [None]:
# Inspect the raw ratings frame: its schema, then a 5-row preview.
ratings_df.printSchema()
ratings_df.show(5)

In [None]:
# Typed ratings frame: integer ids, float rating, and the epoch-seconds
# 'timestamp' column converted via from_unixtime and cast to a timestamp.
ratings_df_timestampf = ratings_df.select(
    col('userId').cast(IntegerType()),
    col('movieId').cast(IntegerType()),
    col('rating').cast(FloatType()),
    from_unixtime(col('timestamp')).cast(TimestampType()).alias('timestamp'),
)

# Check the resulting schema and preview the converted rows.
ratings_df_timestampf.printSchema()
ratings_df_timestampf.show()

In [None]:
# Mark the typed ratings frame for caching, then force it to be computed.
ratings_df_timestampf.cache()

# cache() is lazy, and write.format('noop') on its own only builds a
# DataFrameWriter without running anything. Calling save() triggers a full
# pass over the data with the no-op sink, which materializes the cache.
# The mode is set explicitly because the noop sink does not accept the
# default error-if-exists mode.
ratings_df_timestampf.write.format('noop').mode('overwrite').save()

In [None]:
# Persist the typed ratings frame to the 'ratings' table (helper defaults: overwrite mode).
write_to_db(ratings_df_timestampf, 'ratings')

In [None]:
# Release the cached ratings data now that it has been written to the database.
# The former trailing `write.format('noop')` statement was dropped: it only
# constructed a DataFrameWriter that was never saved, so it ran nothing.
ratings_df_timestampf.unpersist()

In [None]:
# Load the tags CSV with the read_csv helper, relying on its header=True / format='csv' defaults.
tags_df = read_csv(path="data/MoviesData/tags.csv")

In [None]:
# Inspect the raw tags frame: its schema, then a preview of the first rows.
tags_df.printSchema()
tags_df.show()

In [None]:
# Typed tags frame: integer ids, the tag text unchanged, and the epoch-seconds
# 'timestamp' column converted via from_unixtime and cast to a timestamp.
tags_df_timestampf = tags_df.select(
    col('userId').cast(IntegerType()),
    col('movieId').cast(IntegerType()),
    col('tag'),
    from_unixtime(col('timestamp')).cast(TimestampType()).alias('timestamp'),
)

# Check the resulting schema and preview the converted rows.
tags_df_timestampf.printSchema()
tags_df_timestampf.show()


In [None]:
# Persist the typed tags frame to the 'tags' table (helper defaults: overwrite mode).
write_to_db(tags_df_timestampf, 'tags')