In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from graphframes import GraphFrame

In [2]:
spark = (SparkSession.builder
         .appName("GF-test")
         .master("local[*]")
         .config("spark.jars", "/home/jovyan/work/GraphFrames_prueba/graphframes-0.8.4-spark3.5-s_2.12.jar")
         .config("spark.driver.memory", "4g")
         .getOrCreate())

In [3]:
v = spark.createDataFrame([(1,), (2,)], ["id"])
e = spark.createDataFrame([(1, 2)], ["src", "dst"])
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()




+---+
| id|
+---+
|  1|
|  2|
+---+

+---+---+
|src|dst|
+---+---+
|  1|  2|
+---+---+



In [3]:
ratings = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet").select("userId", "filmId", "rating")
users = spark.read.parquet("/home/jovyan/work/datasets/Transformados/users_mod.parquet")
movies = spark.read.parquet("/home/jovyan/work/datasets/Transformados/movies_mod.parquet")

v_users = users.select(F.col("userId").alias("id"), F.lit("user").alias("tipo"))
v_movies = movies.select(F.col("filmId").alias("id"), F.lit("movie").alias("tipo"))
vertices = v_users.unionByName(v_movies)

edges = ratings.select(
    F.col("userId").alias("src"),
    F.col("filmId").alias("dst"),
    F.col("rating").alias("weight")
)

g = GraphFrame(vertices, edges)




In [None]:
g.vertices.show(5, truncate=False)
g.edges.show(5, truncate=False)

+---+----+
|id |tipo|
+---+----+
|1  |user|
|2  |user|
|3  |user|
|4  |user|
|5  |user|
+---+----+
only showing top 5 rows

+---+----+------+
|src|dst |weight|
+---+----+------+
|1  |1193|5     |
|1  |661 |3     |
|1  |914 |3     |
|1  |3408|4     |
|1  |2355|5     |
+---+----+------+
only showing top 5 rows



In [None]:
g.vertices.printSchema()

root
 |-- id: long (nullable = true)
 |-- tipo: string (nullable = false)



In [None]:
g.edges.printSchema()

root
 |-- src: long (nullable = true)
 |-- dst: long (nullable = true)
 |-- weight: long (nullable = true)



In [None]:
g.inDegrees.filter(F.col("id") == 1).show()



+---+--------+
| id|inDegree|
+---+--------+
|  1|    2077|
+---+--------+



In [None]:
g.inDegrees.filter(F.col("id") == 4).show()

+---+--------+
| id|inDegree|
+---+--------+
|  4|     170|
+---+--------+



In [None]:
g.outDegrees.filter(F.col("id") == 10).show()

+---+---------+
| id|outDegree|
+---+---------+
| 10|      401|
+---+---------+



In [None]:
g.outDegrees.filter(F.col("id") == 12).show()

+---+---------+
| id|outDegree|
+---+---------+
| 12|       23|
+---+---------+



In [6]:
g.outDegrees.filter(F.col("id") == 12).show()



+---+---------+
| id|outDegree|
+---+---------+
| 12|       23|
+---+---------+



In [4]:
ratings = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet").select("userId", "filmId", "rating")
users   = spark.read.parquet("/home/jovyan/work/datasets/Transformados/users_mod.parquet")
movies  = spark.read.parquet("/home/jovyan/work/datasets/Transformados/movies_mod.parquet")

ratings_small = ratings.sample(False, 0.05, seed=42)

v_users = users.select(F.col("userId").alias("id"), F.lit("user").alias("tipo"))
v_movies = movies.select(F.col("filmId").alias("id"), F.lit("movie").alias("tipo"))
vertices = v_users.unionByName(v_movies)
edges = ratings_small.select(F.col("userId").alias("src"), F.col("filmId").alias("dst"))

g = GraphFrame(vertices, edges)

spark.sparkContext.setCheckpointDir("/tmp/graphframes_chkpt")  # necesario para PageRank/CC

pr = g.pageRank(resetProbability=0.15, maxIter=10)
top_movies = (pr.vertices
              .filter("tipo='movie'")
              .select("id", "pagerank")
              .orderBy(F.desc("pagerank")))

top_movies.show(10, truncate=False)




+----+------------------+
|id  |pagerank          |
+----+------------------+
|2858|28.28089275866387 |
|593 |28.1418219581058  |
|1210|27.750826631173602|
|3408|27.63589175063968 |
|260 |25.385816160424437|
|1580|24.550018798149985|
|1193|24.124174624895293|
|3578|23.999103804673535|
|2455|21.819364242138047|
|527 |21.757631527324694|
+----+------------------+
only showing top 10 rows



In [8]:
top_movies.join(movies.select("filmId", "film"),
                top_movies.id == movies.filmId, "left") \
          .select("id", "film", "pagerank") \
          .orderBy(F.desc("pagerank")) \
          .show(10, truncate=False)


+----+------------------------------------------+------------------+
|id  |film                                      |pagerank          |
+----+------------------------------------------+------------------+
|2858|American Beauty                           |28.28089275866387 |
|593 |Silence of the Lambs, The                 |28.1418219581058  |
|1210|Star Wars: Episode VI - Return of the Jedi|27.750826631173602|
|3408|Erin Brockovich                           |27.63589175063968 |
|260 |Star Wars: Episode IV - A New Hope        |25.385816160424437|
|1580|Men in Black                              |24.550018798149985|
|1193|One Flew Over the Cuckoo's Nest           |24.124174624895293|
|3578|Gladiator                                 |23.999103804673535|
|2455|Fly, The                                  |21.819364242138047|
|527 |Schindler's List                          |21.757631527324694|
+----+------------------------------------------+------------------+
only showing top 10 rows



In [2]:
spark = (SparkSession.builder
         .appName("GF")
         .master("local[*]")
         .config("spark.jars", "/home/jovyan/work/GraphFrames_prueba/graphframes-0.8.4-spark3.5-s_2.12.jar")
         .config("spark.driver.memory", "4g")
         .getOrCreate())


In [7]:
ratings = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet") \
                    .select("userId", "filmId", "rating")
users = spark.read.parquet("/home/jovyan/work/datasets/Transformados/users_mod.parquet")
movies = spark.read.parquet("/home/jovyan/work/datasets/Transformados/movies_mod.parquet")

ratings_small = ratings.sample(False, 0.05, seed=42)  # 5% de interacciones

v_users = users.select(F.col("userId").alias("id"), F.lit("user").alias("tipo"))
v_movies = movies.select(F.col("filmId").alias("id"), F.lit("movie").alias("tipo"))
vertices = v_users.unionByName(v_movies)

edges = ratings_small.select(
    F.col("userId").alias("src"),
    F.col("filmId").alias("dst")
)

g = GraphFrame(vertices, edges)

spark.sparkContext.setCheckpointDir("/tmp/graphframes_chkpt")

cc = g.connectedComponents()

In [None]:
from pyspark.sql import functions as F

sizes = cc.groupBy("component").count().orderBy(F.desc("count"))
sizes.show(10)

# componente principal (m√°s grande)
main_comp = sizes.first()["component"]


+---------+-----+
|component|count|
+---------+-----+
|        1| 9637|
|      399|    2|
|     2191|    2|
|     3636|    2|
|      730|    2|
|      600|    2|
|     3570|    2|
|     3779|    2|
|      703|    2|
|     1319|    2|
+---------+-----+
only showing top 10 rows



In [6]:
cc.groupBy("component", "tipo").count().orderBy("component").show()


+---------+-----+-----+
|component| tipo|count|
+---------+-----+-----+
|        1| user| 5824|
|        1|movie| 3813|
|      200| user|    1|
|      200|movie|    1|
|      269| user|    1|
|      269|movie|    1|
|      298| user|    1|
|      298|movie|    1|
|      311| user|    1|
|      311|movie|    1|
|      341| user|    1|
|      341|movie|    1|
|      397| user|    1|
|      397|movie|    1|
|      399| user|    1|
|      399|movie|    1|
|      503| user|    1|
|      503|movie|    1|
|      545| user|    1|
|      545|movie|    1|
+---------+-----+-----+
only showing top 20 rows

