# Spark session

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a single-node Spark session named "load-postgres".
session_builder = SparkSession.builder.master("local").appName("load-postgres")

# Put the PostgreSQL JDBC driver jar on the driver classpath so the
# "jdbc" writes further down can reach the database.
session_builder = session_builder.config(
    "spark.driver.extraClassPath",
    "/home/jovyan/work/jars/postgresql-9.4.1207.jar",
)

spark = session_builder.getOrCreate()
# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

21/10/31 05:58:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Read CSV data

In [2]:
# Load the movies CSV, taking column names from the first (header) row.
# No schema is supplied or inferred, so every column is read as a string.
df_movies_csv = spark.read.csv(
    "/home/jovyan/work/data/movies.csv",
    header=True,
)

In [8]:
# Inspect the column names and types Spark assigned to the movies CSV.
df_movies_csv.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [3]:
# Load the ratings CSV (header row supplies the column names), then rename the
# raw "timestamp" column so the name is free for a converted timestamp column.
ratings_raw = spark.read.csv(
    "/home/jovyan/work/data/ratings.csv",
    header=True,
)
df_ratings_csv = ratings_raw.withColumnRenamed("timestamp", "timestamp_epoch")

In [4]:
# Convert epoch to timestamp and rating to DoubleType
from pyspark.sql.functions import from_unixtime, col, to_timestamp
from pyspark.sql.types import DoubleType

# The ratings CSV is read without a schema, so "rating" and "timestamp_epoch"
# arrive as strings and must be cast explicitly.
df_ratings_csv_fmt = (
    df_ratings_csv
    .withColumn('rating', col("rating").cast(DoubleType()))
    # Cast the epoch seconds straight to a timestamp. The previous
    # to_timestamp(from_unixtime(...)) form rendered the instant as a
    # local-time string and parsed it back, which is ambiguous for instants in
    # a daylight-saving "fall back" hour when the session timezone observes DST.
    .withColumn('timestamp', col("timestamp_epoch").cast("long").cast("timestamp"))
)

# Load data to Postgres

In [5]:
# Write the movies DataFrame to the public.movies table over JDBC using
# "overwrite" save mode.
import os

(df_movies_csv.write
 .format("jdbc")
 .option("url", "jdbc:postgresql://postgres/test")
 .option("dbtable", "public.movies")
 # Read credentials from the environment when provided instead of relying on
 # values embedded in the notebook; the fallbacks keep the demo setup working.
 .option("user", os.environ.get("POSTGRES_USER", "test"))
 .option("password", os.environ.get("POSTGRES_PASSWORD", "postgres"))
 .mode("overwrite")
 .save())

In [6]:
# Write the formatted ratings to the public.ratings table over JDBC using
# "overwrite" save mode. The raw epoch column is dropped first, so only the
# converted timestamp column is stored.
import os

(df_ratings_csv_fmt
 .drop("timestamp_epoch")
 .write
 .format("jdbc")
 .option("url", "jdbc:postgresql://postgres/test")
 .option("dbtable", "public.ratings")
 # Read credentials from the environment when provided instead of relying on
 # values embedded in the notebook; the fallbacks keep the demo setup working.
 .option("user", os.environ.get("POSTGRES_USER", "test"))
 .option("password", os.environ.get("POSTGRES_PASSWORD", "postgres"))
 .mode("overwrite")
 .save())

                                                                                

In [7]:
# Preview the formatted ratings; the raw epoch column is still present here
# next to the derived timestamp, which makes the conversion easy to eyeball.
df_ratings_csv_fmt.show()

+------+-------+------+---------------+-------------------+
|userId|movieId|rating|timestamp_epoch|          timestamp|
+------+-------+------+---------------+-------------------+
|     1|      1|   4.0|      964982703|2000-07-30 18:45:03|
|     1|      3|   4.0|      964981247|2000-07-30 18:20:47|
|     1|      6|   4.0|      964982224|2000-07-30 18:37:04|
|     1|     47|   5.0|      964983815|2000-07-30 19:03:35|
|     1|     50|   5.0|      964982931|2000-07-30 18:48:51|
|     1|     70|   3.0|      964982400|2000-07-30 18:40:00|
|     1|    101|   5.0|      964980868|2000-07-30 18:14:28|
|     1|    110|   4.0|      964982176|2000-07-30 18:36:16|
|     1|    151|   5.0|      964984041|2000-07-30 19:07:21|
|     1|    157|   5.0|      964984100|2000-07-30 19:08:20|
|     1|    163|   5.0|      964983650|2000-07-30 19:00:50|
|     1|    216|   5.0|      964981208|2000-07-30 18:20:08|
|     1|    223|   3.0|      964980985|2000-07-30 18:16:25|
|     1|    231|   5.0|      964981179|2

DataFrame[userId: string, movieId: string, rating: double, timestamp_epoch: string, timestamp: timestamp]