In [3]:
#!pip install tmdbsimple

In [5]:
#!pip install elasticsearch 

In [11]:
# !pip install numpy pandas sklearn

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [1]:
from IPython.display import Image, HTML, display
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("best_one").\
        getOrCreate()

In [61]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, lit


# появились конфиги - что это?

spark.conf.set("es.nodes","89.253.237.183")
spark.conf.set("es.port","9200")
spark.conf.set("es.nodes.wan.only","true")
spark.conf.set("es.index.auto.create", "true")


spark.conf.set('spark.jars.packages', 'elasticsearch-spark-20_2.11-7.12.0.jar')
#spark.conf.set('spark.jars.packages', 'elasticsearch-hadoop-7.12.0.jar')

In [3]:
# другой вариант добавить конфиг
import os
os.environ["PYSPARK_SUBMIT_ARGS"]='--jars elasticsearch-spark-20_2.11-7.12.0.jar pyspark-shell'

In [4]:
ratings = spark.read.csv("data/ratings.csv", header=True, inferSchema=True)
ratings.cache()

print("Number of ratings: {}".format(ratings.count()))
print("Sample of ratings:")
ratings.show(5)

Number of ratings: 100836
Sample of ratings:
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [5]:
# timestamp в Unix TimeStamp (добавим нули )

ratings = ratings.select(ratings.userId,
                         ratings.movieId,
                         ratings.rating,
                         (ratings.timestamp.cast("long") * 1000).alias("timestamp"))
ratings.show(5)

+------+-------+------+------------+
|userId|movieId|rating|   timestamp|
+------+-------+------+------------+
|     1|      1|   4.0|964982703000|
|     1|      3|   4.0|964981247000|
|     1|      6|   4.0|964982224000|
|     1|     47|   5.0|964983815000|
|     1|     50|   5.0|964982931000|
+------+-------+------+------------+
only showing top 5 rows



In [6]:
raw_movies = spark.read.csv("data/movies.csv", header=True, inferSchema=True)

print("Raw movie data:")
raw_movies.show(5, truncate=False)

Raw movie data:
+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



In [7]:
# udf для жанра (всё в нижний регистр)

extract_genres = udf(lambda x: x.lower().split("|"), ArrayType(StringType()))

raw_movies.select("movieId",
                  "title",
                  extract_genres("genres").alias("genres")).show(5, False)

+-------+----------------------------------+-------------------------------------------------+
|movieId|title                             |genres                                           |
+-------+----------------------------------+-------------------------------------------------+
|1      |Toy Story (1995)                  |[adventure, animation, children, comedy, fantasy]|
|2      |Jumanji (1995)                    |[adventure, children, fantasy]                   |
|3      |Grumpier Old Men (1995)           |[comedy, romance]                                |
|4      |Waiting to Exhale (1995)          |[comedy, drama, romance]                         |
|5      |Father of the Bride Part II (1995)|[comedy]                                         |
+-------+----------------------------------+-------------------------------------------------+
only showing top 5 rows



In [8]:
# сделаем оделение года выпока от описания фильма
import re
def extract_year_fn(title):
    result = re.search("\(\d{4}\)", title)
    try:
        if result:
            group = result.group()
            year = group[1:-1]
            start_pos = result.start()
            title = title[:start_pos-1]
            return (title, year)
        else:
            return (title, 1970)
    except:
        print(title)

        
extract_year = udf(extract_year_fn,\
                   StructType([StructField("title", StringType(), True),\
                               StructField("release_date", StringType(), True)]))
    
# проведем тестирования
s = "Jumanji (1995)"
extract_year_fn(s)

('Jumanji', '1995')

In [9]:
# применим функцию
movies = raw_movies.select("movieId", extract_year("title").title.alias("title"),\
                           extract_year("title").release_date.alias("release_date"),\
                           extract_genres("genres").alias("genres"))

print("Cleaned movie data:")
movies.show(5, truncate=False)

Cleaned movie data:
+-------+---------------------------+------------+-------------------------------------------------+
|movieId|title                      |release_date|genres                                           |
+-------+---------------------------+------------+-------------------------------------------------+
|1      |Toy Story                  |1995        |[adventure, animation, children, comedy, fantasy]|
|2      |Jumanji                    |1995        |[adventure, children, fantasy]                   |
|3      |Grumpier Old Men           |1995        |[comedy, romance]                                |
|4      |Waiting to Exhale          |1995        |[comedy, drama, romance]                         |
|5      |Father of the Bride Part II|1995        |[comedy]                                         |
+-------+---------------------------+------------+-------------------------------------------------+
only showing top 5 rows



In [10]:
link_data = spark.read.csv("data/links.csv", header=True, inferSchema=True)


# добавим tmdbId
movie_data = movies.join(link_data, movies.movieId == link_data.movieId)\
                   .select(movies.movieId,
                           movies.title,
                           movies.release_date,
                           movies.genres,
                           link_data.tmdbId)

num_movies = movie_data.count()

print("Cleaned movie data with tmdbId links:")
movie_data.show(5, truncate=False)

Cleaned movie data with tmdbId links:
+-------+---------------------------+------------+-------------------------------------------------+------+
|movieId|title                      |release_date|genres                                           |tmdbId|
+-------+---------------------------+------------+-------------------------------------------------+------+
|1      |Toy Story                  |1995        |[adventure, animation, children, comedy, fantasy]|862   |
|2      |Jumanji                    |1995        |[adventure, children, fantasy]                   |8844  |
|3      |Grumpier Old Men           |1995        |[comedy, romance]                                |15602 |
|4      |Waiting to Exhale          |1995        |[comedy, drama, romance]                         |31357 |
|5      |Father of the Bride Part II|1995        |[comedy]                                         |11862 |
+-------+---------------------------+------------+------------------------------------------------

In [None]:
# пример получения картинки с сервиса https://www.themoviedb.org/
try:
    import tmdbsimple as tmdb
    import json
    from requests.exceptions import HTTPError
    # получите свой ключ
    tmdb.API_KEY = '3403d7129a0d23281bf49971d87b43e0'
    print("Successfully imported tmdbsimple!")
    
    # пример URL
    IMAGE_URL = 'https://image.tmdb.org/t/p/w500'
    movie_id = movie_data.first().tmdbId
    movie_info = tmdb.Movies(movie_id).info()
    movie_poster_url = IMAGE_URL + movie_info['poster_path']
    display(Image(movie_poster_url, width=200))
    
except ImportError:
    print("Cannot import tmdbsimple as it is not installed, no movie posters will be displayed!")
    
except HTTPError as e:
    if e.response.status_code == 401:
        j = json.loads(e.response.text)
        print("TMdb API call failed: {}".format(j['status_message']))

## Elasticsearch

In [11]:
from elasticsearch import Elasticsearch

# запускаем ElasticSearch
es = Elasticsearch(hosts='89.253.237.183:9200')
es.info(pretty=True)

{'name': 'vps-11009347-124249.host4g.ru',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'tZ8R80WeRm6V4yiMpzZP2A',
 'version': {'number': '7.12.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '78722783c38caa25a70982b5b042074cde5d3b3a',
  'build_date': '2021-03-18T06:17:15.410153305Z',
  'build_snapshot': False,
  'lucene_version': '8.8.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [None]:
# установим кол-во факторов для модели
VECTOR_DIM = 20

create_ratings = {
    # маппинг на таблицу рейтингов
    "mappings": {
        "properties": {
            "timestamp": {
                "type": "date"
            },
            "userId": {
                "type": "integer"
            },
            "movieId": {
                "type": "integer"
            },
            "rating": {
                "type": "double"
            }
        }  
    }
}

create_users = {
    # маппинг на таблицу пользователей
    "mappings": {
        "properties": {
            "userId": {
                "type": "integer"
            },
            # маппни на вектор обучения для модели
            "model_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_DIM
            },
            "model_version": {
                "type": "keyword"
            },
            "model_timestamp": {
                "type": "date"
            }
        }
    }
}

create_movies = {
    # маппинг на фильмы
    "mappings": {
        "properties": {
            "movieId": {
                "type": "integer"
            },
            "tmdbId": {
                "type": "keyword"
            },
            "genres": {
                "type": "keyword"
            },
            "release_date": {
                "type": "date",
                "format": "year"
            },
            # маппинг на вектор обучения для фильмов
            "model_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_DIM
            },
            "model_version": {
                "type": "keyword"
            },
            "model_timestamp": {
                "type": "date"
            }          
        }
    }
}

# создадим индексы для маппингов (таблицы)
res_ratings = es.indices.create(index="ratings", body=create_ratings)
res_users = es.indices.create(index="users", body=create_users)
res_movies = es.indices.create(index="movies", body=create_movies)

print("Created indices:")
print(res_ratings)
print(res_users)
print(res_movies)

In [30]:
# что мы создали?
create_ratings

{'mappings': {'properties': {'timestamp': {'type': 'date'},
   'userId': {'type': 'integer'},
   'movieId': {'type': 'integer'},
   'rating': {'type': 'double'}}}}

In [55]:
# сравним с одной строкой 
ratings.rdd.take(1)[0]

Row(userId=1, movieId=1, rating=4.0, timestamp=964982703000)

In [58]:
# пример данных
d = {'mappings': 
        {'properties': {'timestamp': ratings.rdd.take(1)[0].timestamp,
                         'userId': int(ratings.rdd.take(1)[0].userId),
                         'movieId': int(ratings.rdd.take(1)[0].movieId),
                         'rating': ratings.rdd.take(1)[0].rating}
        }
    }

In [59]:
d

{'mappings': {'properties': {'timestamp': 964982703000,
   'userId': 1,
   'movieId': 1,
   'rating': 4.0}}}

In [None]:
# запишим в ES
ratings.write.format("es")\
             .option("es.nodes",'89.253.237.183')\
             .save("ratings")

num_ratings_es = es.count(index="ratings")['count']
num_ratings_df = ratings.count()

print("Dataframe count: {}".format(num_ratings_df))
print("ES index count:  {}".format(num_ratings_es))

In [None]:
# тестируем
es.search(index="ratings", q="*", size=3)

In [None]:
es.count(index="ratings", q="timestamp:[2018-01-01 TO 2018-02-01]")

In [None]:
movie_data.write.format("es").option("es.mapping.id", "movieId").save("movies")
num_movies_df = movie_data.count()
num_movies_es = es.count(index="movies")['count']

In [None]:
es.search(index="movies", q="title:matrix", size=3)

In [None]:
ratings_from_es = spark.read.format("es").load("ratings")
ratings_from_es.show(5)

## ALS

In [91]:
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", regParam=0.02, rank=VECTOR_DIM, seed=54)
model = als.fit(ratings)
model.userFactors.show(5)
model.itemFactors.show(5)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.2570028, 0.48...|
| 20|[-0.4637753, -2.3...|
| 30|[-0.31054392, -0....|
| 40|[0.013998926, 0.5...|
| 50|[0.093884476, -0....|
+---+--------------------+
only showing top 5 rows

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.4070635, -0.12...|
| 20|[0.6117462, 0.196...|
| 30|[1.0076349, 0.849...|
| 40|[0.35743824, -0.2...|
| 50|[0.85448134, 0.06...|
+---+--------------------+
only showing top 5 rows



In [92]:
from pyspark.sql.functions import lit, current_timestamp, unix_timestamp

ver = model.uid
ts = unix_timestamp(current_timestamp())
movie_vectors = model.itemFactors.select("id",\
                                         col("features").alias("model_factor"),\
                                         lit(ver).alias("model_version"),\
                                         ts.alias("model_timestamp"))
movie_vectors.show(5)
user_vectors = model.userFactors.select("id",\
                                        col("features").alias("model_factor"),\
                                        lit(ver).alias("model_version"),\
                                        ts.alias("model_timestamp"))
user_vectors.show(5)

+---+--------------------+----------------+---------------+
| id|        model_factor|   model_version|model_timestamp|
+---+--------------------+----------------+---------------+
| 10|[0.4070635, -0.12...|ALS_653159374be1|     1617893924|
| 20|[0.6117462, 0.196...|ALS_653159374be1|     1617893924|
| 30|[1.0076349, 0.849...|ALS_653159374be1|     1617893924|
| 40|[0.35743824, -0.2...|ALS_653159374be1|     1617893924|
| 50|[0.85448134, 0.06...|ALS_653159374be1|     1617893924|
+---+--------------------+----------------+---------------+
only showing top 5 rows

+---+--------------------+----------------+---------------+
| id|        model_factor|   model_version|model_timestamp|
+---+--------------------+----------------+---------------+
| 10|[-0.2570028, 0.48...|ALS_653159374be1|     1617893924|
| 20|[-0.4637753, -2.3...|ALS_653159374be1|     1617893924|
| 30|[-0.31054392, -0....|ALS_653159374be1|     1617893924|
| 40|[0.013998926, 0.5...|ALS_653159374be1|     1617893924|
| 50|[0.0938844

In [None]:
# запишим данные в ES
movie_vectors.write.format("es") \
    .option("es.mapping.id", "id") \
    .option("es.write.operation", "update") \
    .save("movies", mode="append")

In [None]:
# запишим данные в ES
user_vectors.write.format("es") \
    .option("es.mapping.id", "id") \
    .option("es.write.operation", "index") \
    .save("users", mode="append")

In [None]:
# поиск определенного фильма в ES
es.search(index="movies", q="force awakens")['hits']['hits'][0]