In [1]:
# findspark locates the local Spark installation and makes `pyspark`
# importable from this kernel; it is initialised before the pyspark imports
# in the following cells.
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: "local" master, application name "MovieLens".
# Further properties can be added with config.set("property", "value").
config = SparkConf()
config.setMaster("local")
config.setAppName("MovieLens")

# The SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

22/03/10 03:10:21 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/10 03:10:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/10 03:10:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# To check that the input data is present in HDFS, run from a shell:
#   hdfs dfs -ls /movies
#   hdfs dfs -ls /ratings

In [4]:
# Build the schemas programmatically instead of relying on inferSchema.
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

# Columns of the movies data; every column is declared nullable.
movieSchema = (
    StructType()
    .add("movieId", IntegerType(), nullable=True)
    .add("title", StringType(), nullable=True)
    .add("genres", StringType(), nullable=True)
)

# Columns of the ratings data; every column is declared nullable.
ratingSchema = (
    StructType()
    .add("userId", IntegerType(), nullable=True)
    .add("movieId", IntegerType(), nullable=True)
    .add("rating", DoubleType(), nullable=True)
    .add("timestamp", LongType(), nullable=True)
)

In [5]:
# Read the movie data into a DataFrame using the explicitly defined schema.
# The path may be a folder (every CSV inside it is read) or a single file
# (only that file is read).
# `spark` is the session, the entry point for DataFrame/SQL.
movieDf = (
    spark.read
    .format("csv")
    .schema(movieSchema)
    .option("header", True)
    .load("hdfs://localhost:9000/movies")
)

movieDf.printSchema()
movieDf.show(2)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



                                                                                

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



In [6]:
# Read the ratings data into a DataFrame using the explicitly defined schema;
# the first line of each CSV file is treated as a header.
ratingDf = (
    spark.read
    .format("csv")
    .schema(ratingSchema)
    .option("header", True)
    .load("hdfs://localhost:9000/ratings")
)

ratingDf.printSchema()
ratingDf.show(2)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [7]:
# Print the row count of each DataFrame: movies first, then ratings.
for frame in (movieDf, ratingDf):
    print(frame.count())

9742
100836


In [8]:
# Fetch the first two rating rows as a list of Row objects.
ratingDf.take(2)

[Row(userId=1, movieId=1, rating=4.0, timestamp=964982703),
 Row(userId=1, movieId=3, rating=4.0, timestamp=964981247)]

In [9]:
# Project only the rating column and show its first five values.
ratingDf.select('rating').show(5)

+------+
|rating|
+------+
|   4.0|
|   4.0|
|   4.0|
|   5.0|
|   5.0|
+------+
only showing top 5 rows



In [10]:
# Show the distinct rating values present in the data.
ratingDf.select('rating').distinct().show()

                                                                                

+------+
|rating|
+------+
|   3.5|
|   4.5|
|   2.5|
|   1.0|
|   4.0|
|   0.5|
|   3.0|
|   2.0|
|   1.5|
|   5.0|
+------+



                                                                                

In [11]:
# Aggregation with groupBy.
from pyspark.sql.functions import col, desc, avg, count

# Count the user ratings each movie received and list the most-rated
# movies first.
df = (
    ratingDf
    .groupBy("movieId")
    .agg(count("userId").alias("total_ratings"))
    .orderBy(col("total_ratings").desc())
)

df.printSchema()
df.show(20)

root
 |-- movieId: integer (nullable = true)
 |-- total_ratings: long (nullable = false)





+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|    356|          329|
|    318|          317|
|    296|          307|
|    593|          279|
|   2571|          278|
|    260|          251|
|    480|          238|
|    110|          237|
|    589|          224|
|    527|          220|
|   2959|          218|
|      1|          215|
|   1196|          211|
|     50|          204|
|   2858|          204|
|     47|          203|
|    780|          202|
|    150|          201|
|   1198|          200|
|   4993|          198|
+-------+-------------+
only showing top 20 rows



                                                                                

In [12]:
from pyspark.sql.functions import col, desc, avg, count

# Compute the average rating of each movie and list the highest-rated
# movies first. The number of ratings behind each average is not considered
# here.
df = (
    ratingDf
    .groupBy("movieId")
    .agg(avg("rating").alias("avg_ratings"))
    .orderBy(col("avg_ratings").desc())
)

df.printSchema()
df.show(20)

root
 |-- movieId: integer (nullable = true)
 |-- avg_ratings: double (nullable = true)





+-------+-----------+
|movieId|avg_ratings|
+-------+-----------+
|  33138|        5.0|
|    876|        5.0|
| 147300|        5.0|
|  27373|        5.0|
|     53|        5.0|
|  25887|        5.0|
|  84273|        5.0|
| 113829|        5.0|
| 173963|        5.0|
|  26350|        5.0|
|  67618|        5.0|
|    148|        5.0|
| 157775|        5.0|
| 142444|        5.0|
|    633|        5.0|
|    496|        5.0|
|   8911|        5.0|
|   5513|        5.0|
| 152711|        5.0|
| 150554|        5.0|
+-------+-----------+
only showing top 20 rows



                                                                                

In [13]:
from pyspark.sql.functions import col, desc, avg, count

# Popular movies are those rated by many users with a good average:
#   - rated by at least 100 users, and
#   - an average rating of 3.5 or above,
# ordered by the number of ratings, largest first.
mostPopularMoviesDf = (
    ratingDf
    .groupBy("movieId")
    .agg(
        avg("rating").alias("avg_rating"),
        count("userId").alias("total_ratings"),
    )
    .where((col("total_ratings") >= 100) & (col("avg_rating") >= 3.5))
    .orderBy(col("total_ratings").desc())
)

# Mark the result for caching; it is reused by the join that follows.
mostPopularMoviesDf.cache()

mostPopularMoviesDf.printSchema()
mostPopularMoviesDf.show(20)

root
 |-- movieId: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_ratings: long (nullable = false)



                                                                                

+-------+------------------+-------------+
|movieId|        avg_rating|total_ratings|
+-------+------------------+-------------+
|    356| 4.164133738601824|          329|
|    318| 4.429022082018927|          317|
|    296| 4.197068403908795|          307|
|    593| 4.161290322580645|          279|
|   2571| 4.192446043165468|          278|
|    260| 4.231075697211155|          251|
|    480|              3.75|          238|
|    110| 4.031645569620253|          237|
|    589| 3.970982142857143|          224|
|    527|             4.225|          220|
|   2959| 4.272935779816514|          218|
|      1|3.9209302325581397|          215|
|   1196|4.2156398104265405|          211|
|     50| 4.237745098039215|          204|
|   2858| 4.056372549019608|          204|
|     47|3.9753694581280787|          203|
|    150| 3.845771144278607|          201|
|   1198|            4.2075|          200|
|   4993| 4.106060606060606|          198|
|   1210| 4.137755102040816|          196|
+-------+--

In [14]:
# Inner join: attach the movie title to each row of mostPopularMoviesDf.
# Rows are matched on mostPopularMoviesDf.movieId == movieDf.movieId.
# Both inputs carry a movieId column, so the select picks movieDf's copy
# explicitly.
popularMoviesDf = (
    mostPopularMoviesDf
    .join(
        movieDf,
        mostPopularMoviesDf["movieId"] == movieDf["movieId"],
        how="inner",
    )
    .select(movieDf["movieId"], "title", "avg_rating", "total_ratings")
)
popularMoviesDf.show(100)

                                                                                

+-------+--------------------+------------------+-------------+
|movieId|               title|        avg_rating|total_ratings|
+-------+--------------------+------------------+-------------+
|      1|    Toy Story (1995)|3.9209302325581397|          215|
|      6|         Heat (1995)| 3.946078431372549|          102|
|     32|Twelve Monkeys (a...| 3.983050847457627|          177|
|     34|         Babe (1995)|        3.65234375|          128|
|     47|Seven (a.k.a. Se7...|3.9753694581280787|          203|
|     50|Usual Suspects, T...| 4.237745098039215|          204|
|    110|   Braveheart (1995)| 4.031645569620253|          237|
|    111|  Taxi Driver (1976)| 4.105769230769231|          104|
|    150|    Apollo 13 (1995)| 3.845771144278607|          201|
|    161| Crimson Tide (1995)|3.6359223300970873|          103|
|    165|Die Hard: With a ...|3.5555555555555554|          144|
|    223|       Clerks (1994)| 3.855769230769231|          104|
|    260|Star Wars: Episod...| 4.2310756

In [15]:
# Write popularMoviesDf to Hadoop as CSV with a header row (by default
# headers are not written), overwriting any existing files.
# Author's note from the original run: 70+ partitions holding approximately
# 100+ records in total, so 70+ files are written into Hadoop.
popularMoviesDf.write.csv(
    "hdfs://localhost:9000/most-popular-movies-many-files",
    mode="overwrite",
    header=True,
)

                                                                                

In [16]:
# Write popularMoviesDf into a single file: coalesce(1) reduces the
# DataFrame to one partition before the write. Existing output is
# overwritten and a header row is included.
popularMoviesDf.coalesce(1).write.csv(
    "hdfs://localhost:9000/most-popular-movies",
    mode="overwrite",
    header=True,
)

In [17]:
# now read the files back from HDFS
# for the schema, we will use inferSchema and let Spark build the schema itself
# use inferSchema only for small data sets