# EDA for Movie Recommender

Using a Jupyter notebook to cut down on time spent re-importing/loading data.

In [116]:
# Imports
import pyspark as ps
from pyspark.sql.types import StructType, IntegerType, StringType, StructField, ArrayType
import pandas as pd
import pyspark.sql.functions as f
from pyspark.ml.recommendation import ALS

In [56]:
# Create (or reuse) a Spark session running locally with four worker threads.
spark = (
    ps.sql.SparkSession.builder
    .master('local[4]')
    .appName('Recommender')
    .getOrCreate()
)
sc = spark.sparkContext

# Load Data: each line of users.dat is a record whose fields are joined by '::'.
users_rdd = sc.textFile('data/users.dat').map(lambda line: line.split('::'))

# All five user columns are kept as plain strings.
schema = StructType([
    StructField(column, StringType())
    for column in ('id', 'gender', 'age', 'occupation', 'zip')
])
users_df = spark.createDataFrame(users_rdd, schema)


        id gender age occupation    zip
975    976      M  35         14  89113
976    977      M  25          2  80110
977    978      M  18          0  19116
978    979      M   1         10  48073
979    980      M  25          6  92014
980    981      M  25         20  02141
981    982      F  25          9  92064
982    983      F  25         16  99224
983    984      M  50         16  92129
984    985      M  25          4  32608
985    986      F  56          0  19004
986    987      F  35         17  48098
987    988      M  50         11  48823
988    989      M  50          0  20706
989    990      M  18          6  10004
990    991      F  25          9  48103
991    992      F  35          3  02780
992    993      M  25          0  45678
993    994      M  18          2  92109
994    995      F  18          4  96803
995    996      M  25         17  98102
996    997      M   1         19  15748
997    998      M  45         20  10019
998    999      M  25         15  62558


In [44]:
def parse_row(row):
    '''
    Parse one '::'-delimited line of movies.dat into a list of fields.

    Parameters
    ----------
    row : str
        Raw line of the form 'movie_id::title (year)::genre1|genre2|...'.

    Returns
    -------
    list
        The '::'-separated fields with the third one replaced by the list
        obtained from splitting it on '|', followed by the release year.
        The year is the digits inside the title's trailing parentheses
        (as a string), or None when the title does not end that way.

    Raises
    ------
    IndexError
        If the line has fewer than three '::'-separated fields.
    '''
    data = row.split('::')
    data[2] = data[2].split('|')

    # Only trust the trailing "(YYYY)" group: blindly slicing the title would
    # turn a title without a year (or with trailing whitespace) into garbage.
    title = data[1].rstrip()
    year = None
    if title.endswith(')') and '(' in title:
        candidate = title[title.rfind('(') + 1:-1]
        if candidate.isdigit():
            year = candidate
    data.append(year)
    return data

# Turn every raw line of movies.dat into a list of fields via parse_row.
movies_rdd = sc.textFile('data/movies.dat').map(parse_row)

# Every column is a string except `genre`, which is an array of strings.
schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ('movie_id', StringType()),
        ('title', StringType()),
        ('genre', ArrayType(StringType())),
        ('year', StringType()),
    )
])
movies_df = spark.createDataFrame(movies_rdd, schema)
movies_df.show()

+--------+--------------------+--------------------+----+
|movie_id|               title|               genre|year|
+--------+--------------------+--------------------+----+
|       1|    Toy Story (1995)|[Animation, Child...|1995|
|       2|      Jumanji (1995)|[Adventure, Child...|1995|
|       3|Grumpier Old Men ...|   [Comedy, Romance]|1995|
|       4|Waiting to Exhale...|     [Comedy, Drama]|1995|
|       5|Father of the Bri...|            [Comedy]|1995|
|       6|         Heat (1995)|[Action, Crime, T...|1995|
|       7|      Sabrina (1995)|   [Comedy, Romance]|1995|
|       8| Tom and Huck (1995)|[Adventure, Child...|1995|
|       9| Sudden Death (1995)|            [Action]|1995|
|      10|    GoldenEye (1995)|[Action, Adventur...|1995|
|      11|American Presiden...|[Comedy, Drama, R...|1995|
|      12|Dracula: Dead and...|    [Comedy, Horror]|1995|
|      13|        Balto (1995)|[Animation, Child...|1995|
|      14|        Nixon (1995)|             [Drama]|1995|
|      15|Cutt

In [133]:
# Read the training ratings: the first row holds the column names and
# Spark infers each column's type from the data.
train_df = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv('data/training.csv')
)
train_df.show()
train_df.describe().show()
train_df.printSchema()

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
|6040|  858|     4|956703932|
|6040|  593|     5|956703954|
|6040| 2384|     4|956703954|
|6040| 1961|     4|956703977|
|6040| 2019|     5|956703977|
|6040| 1419|     3|956704056|
|6040|  573|     4|956704056|
|6040| 3111|     5|956704056|
|6040|  213|     5|956704056|
|6040| 3505|     4|956704056|
|6040| 1734|     2|956704081|
|6040|  912|     5|956704191|
|6040|  919|     5|956704191|
|6040| 2503|     5|956704191|
|6040|  527|     5|956704219|
|6040|  318|     4|956704257|
|6040| 1252|     5|956704257|
|6040|  649|     5|956704257|
|6040| 3289|     5|956704305|
|6040|  759|     5|956704448|
+----+-----+------+---------+
only showing top 20 rows

+-------+------------------+-----------------+------------------+-------------------+
|summary|              user|            movie|            rating|          timestamp|
+-------+------------------+-----------------+------------------+-----------------

TypeError: 'Column' object is not callable

In [58]:
# Make a join of data...
# Register each DataFrame as a temp view so it can be queried with Spark SQL.
for view_name, frame in (('users', users_df),
                         ('movies', movies_df),
                         ('train', train_df)):
    frame.createOrReplaceTempView(view_name)

# SQL voodoo here...
# For now this only previews the first ten rows of the users view.
query = """
SELECT *
FROM users
LIMIT 10
"""
spark.sql(query).show()

+---+------+---+----------+-----+
| id|gender|age|occupation|  zip|
+---+------+---+----------+-----+
|  1|     F|  1|        10|48067|
|  2|     M| 56|        16|70072|
|  3|     M| 25|        15|55117|
|  4|     M| 45|         7|02460|
|  5|     M| 25|        20|55455|
|  6|     F| 50|         9|55117|
|  7|     M| 35|         1|06810|
|  8|     M| 25|        12|11413|
|  9|     M| 25|        17|61614|
| 10|     F| 35|         1|95370|
+---+------+---+----------+-----+



In [118]:
# Remove rows that are exact duplicates across all columns.
train_df = train_df.dropDuplicates()

In [106]:
# Mean of the `rating` column over all rows of train_df.
mu = train_df.agg(f.avg('rating')).first()['avg(rating)']

In [123]:
# Print the first rows of the ratings DataFrame as a visual sanity check.
train_df.show()

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
|6036| 1280|     4|956711958|
|6036|  235|     4|956712724|
|6035|  440|     4|956712987|
|6036| 2359|     2|956717201|
|6037| 2353|     3|956718919|
|6027| 2028|     5|956726454|
|6026|  162|     5|956726748|
|6025|  196|     3|956730882|
|6036| 1722|     3|956753549|
|6036|  241|     3|956753607|
|6036| 1683|     3|956754004|
|6036| 1748|     5|956754482|
|6036| 1584|     4|956754614|
|6021| 1073|     4|956757050|
|6016| 3365|     3|956776862|
|6016| 2993|     4|956777277|
|6016|  494|     3|956777763|
|6016| 2496|     3|956780812|
|6006|  543|     1|956793406|
|6002|  599|     4|956802789|
+----+-----+------+---------+
only showing top 20 rows



In [134]:
als = ALS(
        itemCol='movie',
        userCol='user',
        ratingCol='rating',
        nonnegative=True,
        regParam=0.1,
        rank=10,
        b

In [139]:
# Fit the `als` estimator on the ratings; the fitted model is kept in `recommender`.
recommender = als.fit(train_df)

In [137]:
# Inspect the learned item factors: one `features` vector per item `id`.
recommender.itemFactors.show()

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[1.0157899, 0.090...|
| 20|[0.52847815, 0.38...|
| 30|[0.59129626, 0.54...|
| 40|[0.4049925, 0.353...|
| 50|[1.2174305, 0.447...|
| 60|[0.33509123, 0.50...|
| 70|[0.831674, 0.0265...|
| 80|[0.54933816, 0.42...|
| 90|[0.5714423, 0.253...|
|100|[0.56420135, 0.20...|
|110|[1.4001731, 0.425...|
|120|[0.7430254, 0.410...|
|130|[0.33988208, 0.06...|
|140|[0.33167064, 0.50...|
|150|[0.98682827, 0.39...|
|160|[0.31674203, 0.17...|
|170|[0.64632416, 0.21...|
|180|[1.2231518, 0.333...|
|190|[0.3439357, 0.351...|
|200|[0.0, 0.68877727,...|
+---+--------------------+
only showing top 20 rows



In [141]:
# Score the ratings with the fitted model; transform() adds a `prediction` column.
# NOTE(review): these are predictions on the same data the model was fit on,
# so they are a sanity check only, not an estimate of held-out accuracy.
recommender.transform(train_df).show()

+----+-----+------+---------+----------+
|user|movie|rating|timestamp|prediction|
+----+-----+------+---------+----------+
| 673|  148|     5|975620824| 3.6031673|
|4227|  148|     2|965659724|  1.909593|
|3184|  148|     4|968708953| 3.2237186|
|4784|  148|     3|970000570| 2.9678636|
|2383|  148|     2|974417654|  2.389425|
|1242|  148|     3|974909976| 2.9519734|
|3539|  148|     3|966932408| 2.7693977|
|1069|  148|     2|974945135|  2.535724|
|1605|  148|     2|974930221| 2.2200673|
|1150|  148|     2|974875106| 2.4321134|
|3829|  148|     2|965940170| 2.2759461|
|2456|  148|     2|974178993| 2.4392943|
|2507|  148|     4|974082717|  3.146665|
|3053|  148|     3|970170090| 2.6374075|
|3841|  463|     3|966003085| 2.6202276|
|3650|  463|     2|966459084| 2.5214236|
|3151|  463|     5|968916009| 3.9031835|
|4858|  463|     3|963746396| 2.4411056|
|2629|  463|     4|973625620| 3.0589423|
|3328|  463|     4|967918151| 3.1816363|
+----+-----+------+---------+----------+
only showing top

In [142]:
# Summary statistics (count, mean, stddev, min, max) for every column.
train_df.describe().show()

+-------+------------------+-----------------+------------------+-------------------+
|summary|              user|            movie|            rating|          timestamp|
+-------+------------------+-----------------+------------------+-------------------+
|  count|            800000|           800000|            800000|             800000|
|   mean|      3403.0978375|    1849.25725625|        3.59047875|9.683921498700112E8|
| stddev|1546.5890280451883|1086.852485159963|1.1203761265092087|      5820930.95649|
|    min|               636|                1|                 1|          956703932|
|    max|              6040|             3952|                 5|          975767289|
+-------+------------------+-----------------+------------------+-------------------+

