- Christian David Cardenas orozco
- Miguel Angel Jimenez trochez
- Juan pablo castaño   


#Instalar Pyspark

In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
!tar xf spark-3.5.4-bin-hadoop3.tgz
!pip install pyspark
!pip install py4j
!pip install findspark

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)] [0m                                                                               Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jamm

# Crear ambiente y sesion de Spark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.4-bin-hadoop3"

In [None]:
import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession \
       .builder \
       .appName("Taller Semana 3 Spark") \
       .getOrCreate()

Comprobar la sesion

In [None]:
spark

Instalar dependencias necesarias de pyspark sql

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import to_date
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import col, regexp_extract, avg, split

Crear la estructura del schema

In [None]:
schema = StructType([
    StructField("show_id", StringType(), True),
    StructField("type", StringType(), True),
    StructField("title", StringType(), True),
    StructField("director", StringType(), True),
    StructField("cast", StringType(), True),
    StructField("country", StringType(), True),
    StructField("date_added", StringType(), True),  # Luego se puede convertir a DateType si es necesario
    StructField("release_year", IntegerType(), True),
    StructField("rating", StringType(), True),
    StructField("duration", StringType(), True),  # Se puede dividir en minutos o temporadas si es necesario
    StructField("listed_in", StringType(), True),
    StructField("description", StringType(), True)
])

Leer el dataset de netflix y aplicarle el schema

In [None]:
df = spark.read.csv('netflix_titles.csv', header=True, sep=",", inferSchema=True, multiLine=True, schema= schema)
df.show()

In [None]:
df.printSchema()

Leer el dataset de disney+ y aplicarle el schema

In [None]:
dfd = spark.read.csv('disney_plus_titles.csv', header=True, sep=",", inferSchema=True, multiLine=True, schema=schema)
dfd.show()

In [None]:
dfd.printSchema()

Aplicar el formaato correcto de fecha a los datasets

In [None]:
df = df.withColumn("date_added", to_date(df["date_added"], "MMMM d, yyyy"))
dfd = dfd.withColumn("date_added", to_date(dfd["date_added"], "MMMM d, yyyy"))

Eliminar filas con datos nulos

In [None]:
#dfd = dfd.dropna() esta tiene muy pocos valores para borrarlos
df = df.dropna()

Analisis previo de las columnas

In [None]:
df.summary().show()

In [None]:
dfd.summary().show()


1 ¿Cuál es la proporción entre películas y series en el catálogo de Disney+?


In [None]:
conteo_tipo = dfd.groupBy("type").count()

conteo_tipo.show()

total_movies = conteo_tipo.filter(col("type") == "Movie").select("count").collect()[0][0]
total_shows = conteo_tipo.filter(col("type") == "TV Show").select("count").collect()[0][0]

proporcion = total_movies / total_shows if total_shows != 0 else "No hay series en el dataset"
print(f"Proporción Películas/Series: {proporcion:.2f}")

#Respuesta: La proporcion entre peliculas y series esta en 1052 Peliculas y 398 series


+-------+-----+
|   type|count|
+-------+-----+
|TV Show|  398|
|  Movie| 1052|
+-------+-----+

Proporción Películas/Series: 2.64


2 ¿Qué países tienen el mayor número de producciones en Disney+?

In [None]:

disney_countries = dfd.withColumn("country", split(col("country"), ", ")) \
    .selectExpr("explode(country) as country") \
    .groupBy("country").count() \
    .orderBy(col("count").desc())
disney_countries.show(10)


#El mayor numero de producciones de disney lo tiene Estados Unidos

+--------------+-----+
|       country|count|
+--------------+-----+
| United States| 1182|
|United Kingdom|  101|
|        Canada|   77|
|     Australia|   23|
|        France|   22|
|   South Korea|   13|
|         China|   10|
|         Japan|   10|
|       Germany|    9|
|       Ireland|    8|
+--------------+-----+
only showing top 10 rows



¿Quiénes son los directores con más títulos en el catálogo de Disney+?

In [None]:

# 3.
disney_directors = dfd.filter(col("director").isNotNull()) \
    .groupBy("director").count() \
    .orderBy(col("count").desc())
disney_directors.show(10)





+----------------+-----+
|        director|count|
+----------------+-----+
|     Jack Hannah|   17|
|   John Lasseter|   16|
|       Paul Hoen|   16|
| Charles Nichols|   12|
|Robert Stevenson|   12|
|Vincent McEveety|   10|
|    Bob Peterson|   10|
|     James Algar|    9|
|    Kenny Ortega|    9|
| Wilfred Jackson|    9|
+----------------+-----+
only showing top 10 rows



¿Cómo ha cambiado el número de lanzamientos de contenido en Disney+ a lo largo de los años?

In [None]:
# 4.

disney_years = dfd.withColumn("release_year", col("release_year")) \
    .groupBy("release_year").count() \
    .orderBy(col("release_year"))
disney_years.show(99)


+------------+-----+
|release_year|count|
+------------+-----+
|        NULL|    3|
|        1928|    1|
|        1932|    3|
|        1933|    3|
|        1934|    4|
|        1935|    4|
|        1936|    6|
|        1937|    6|
|        1938|    5|
|        1939|    5|
|        1940|    7|
|        1941|    7|
|        1942|    6|
|        1943|    2|
|        1944|    1|
|        1945|    1|
|        1946|    2|
|        1947|    5|
|        1948|    5|
|        1949|    5|
|        1950|    6|
|        1951|    4|
|        1952|    8|
|        1953|    6|
|        1954|    4|
|        1955|    9|
|        1956|    4|
|        1957|    4|
|        1959|    5|
|        1960|    3|
|        1961|    5|
|        1962|    3|
|        1963|    3|
|        1964|    3|
|        1965|    3|
|        1966|    2|
|        1967|    2|
|        1968|    1|
|        1969|    4|
|        1970|    1|
|        1971|    3|
|        1972|    3|
|        1973|    2|
|        1974|    4|
|        1975

¿Cuál es la duración promedio de las películas en Disney+?

In [None]:

#5

disney_movies = dfd.filter(col("type") == "Movie")

disney_movies = disney_movies.filter(col("duration").isNotNull())

disney_movies = disney_movies.withColumn("duration", regexp_extract(col("duration"), r"(\d+)", 1).cast("int"))

disney_movies.select(avg("duration").alias("Promedio_Duracion_Minutos")).show()




NameError: name 'regexp_extract' is not defined

 7 ¿Cuáles son los géneros más comunes en el catálogo de Disney+?


In [None]:

#7


disney_genres = dfd.withColumn("listed_in", split(col("listed_in"), ", ")) \
    .selectExpr("explode(listed_in) as genre") \
    .groupBy("genre").count() \
    .orderBy(col("count").desc())
disney_genres.show(10)


+----------------+-----+
|           genre|count|
+----------------+-----+
|          Family|  631|
|       Animation|  539|
|          Comedy|  525|
|Action-Adventure|  451|
|Animals & Nature|  208|
|   Coming of Age|  205|
|         Fantasy|  191|
|     Documentary|  174|
|            Kids|  141|
|           Drama|  134|
+----------------+-----+
only showing top 10 rows



In [None]:
#8

disney_us = dfd.filter(col("country").contains("United States")).count()
disney_total = dfd.count()
print(f"Proporción de contenido internacional en Disney+: {1 - (disney_us / disney_total):.2%}")


Proporción de contenido internacional en Disney+: 18.48%


# Dataset Netflix

Proporcion peliculas y series


In [None]:
#9
conteo_tipo = df.groupBy("type").count()

conteo_tipo.show()

total_movies = conteo_tipo.filter(col("type") == "Movie").select("count").collect()[0][0]
total_shows = conteo_tipo.filter(col("type") == "TV Show").select("count").collect()[0][0]

proporcion = total_movies / total_shows if total_shows != 0 else "No hay series en el dataset"
print(f"Proporción Películas/Series: {proporcion:.2f}")

In [None]:
#10
netflix_countries = df.withColumn("country", split(col("country"), ", ")) \
    .selectExpr("explode(country) as country") \
    .groupBy("country").count() \
    .orderBy(col("count").desc())
netflix_countries.show(10)



In [None]:
#11
netflix_years = df.withColumn("release_year", col("release_year")) \
    .groupBy("release_year").count() \
    .orderBy(col("release_year"))
netflix_years.show(10)


In [None]:
#12
netflix_series = df.filter(col("type") == "TV Show")

netflix_series = netflix_series.filter(col("duration").isNotNull())

netflix_series = netflix_series.withColumn("duration", regexp_extract(col("duration"), r"(\d+)", 1).cast("int"))

netflix_series.select(avg("duration").alias("Promedio_Temporadas")).show()


+-------------------+
|Promedio_Temporadas|
+-------------------+
| 1.7700074794315632|
+-------------------+



In [None]:
#13
print(f"Proporción de contenido producido en EE.UU. en Disney+: {disney_us / disney_total:.2%}")


Proporción de contenido producido en EE.UU. en Disney+: 81.52%


In [None]:
#14
netflix_actors = df.withColumn("cast", split(col("cast"), ", ")) \
    .selectExpr("explode(cast) as actor") \
    .groupBy("actor").count() \
    .orderBy(col("count").desc())
netflix_actors.show(10)


+----------------+-----+
|           actor|count|
+----------------+-----+
|     Anupam Kher|   43|
|  Shah Rukh Khan|   35|
|   Julie Tejwani|   33|
|Takahiro Sakurai|   32|
|Naseeruddin Shah|   32|
|    Rupa Bhimani|   31|
|    Akshay Kumar|   30|
|         Om Puri|   30|
|       Yuki Kaji|   29|
|Amitabh Bachchan|   28|
+----------------+-----+
only showing top 10 rows

