In [1]:
# Step 1: initialize findspark
import findspark
findspark.init()

In [2]:
# Step 2: import pyspark
import pyspark
from pyspark.sql import SparkSession
pyspark.__version__

'3.3.0'

In [3]:
# Step 3: Create a spark session

# 'local[1]' indicates spark on 1 core on the local machine, specify the number of cores needed
# use .config("spark.some.config.option", "some-value") for additional configuration

spark = SparkSession \
    .builder \
    .master('local[1]') \
    .appName("Analyzing Movielens Data") \
    .getOrCreate()

# spark

We are going to use the [MovieLens](https://grouplens.org/datasets/movielens/) dataset for these exercises. This is non trivial and should expand to about 1GB on you hard-drive.

Download and unzip [MovieLens 25M Dataset](https://grouplens.org/datasets/movielens/25m/) for this analysis.

Either ensure the data is in ```"./data/ml-25m"``` folder or update the path to the data below.

Citation:

*F. Maxwell Harper and Joseph A. Konstan.* 2015.  
The MovieLens Datasets: History and Context.  
ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>  

In [4]:
from pyspark.sql.functions import Column
genome_tags = spark.read.csv("./data/ml-25m/genome-tags.csv", header = True)
genome_tags.show()

+-----+---------------+
|tagId|            tag|
+-----+---------------+
|    1|            007|
|    2|   007 (series)|
|    3|   18th century|
|    4|          1920s|
|    5|          1930s|
|    6|          1950s|
|    7|          1960s|
|    8|          1970s|
|    9|          1980s|
|   10|   19th century|
|   11|             3d|
|   12|           70mm|
|   13|            80s|
|   14|           9/11|
|   15|        aardman|
|   16|aardman studios|
|   17|       abortion|
|   18|         absurd|
|   19|         action|
|   20|  action packed|
+-----+---------------+
only showing top 20 rows



In [5]:
movies = spark.read.csv("./data/ml-25m/movies.csv", header = True)
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [6]:
tags = spark.read.csv("./data/ml-25m/tags.csv", header=True)
results = tags.groupby('userId').count()
results.sort(results['count'].desc()).show()
results.describe().show()

+------+------+
|userId| count|
+------+------+
|  6550|183356|
| 21096| 20317|
| 62199| 13700|
|160540| 12076|
|155146| 11445|
| 70092| 10582|
|131347| 10195|
| 14116| 10167|
| 31047|  8463|
|141263|  7114|
| 64333|  6944|
| 47969|  6599|
| 15204|  6426|
| 84824|  6209|
|123527|  6005|
|148457|  5960|
| 19346|  5919|
|  6285|  5837|
| 96795|  5334|
| 44444|  4941|
+------+------+
only showing top 20 rows

+-------+-----------------+------------------+
|summary|           userId|             count|
+-------+-----------------+------------------+
|  count|            14592|             14592|
|   mean|81145.09669682017| 74.92872807017544|
| stddev|46809.75962761033|1570.0725288977699|
|    min|           100001|                 1|
|    max|            99988|            183356|
+-------+-----------------+------------------+



In [7]:
spark.stop()