In [89]:
from pyspark.sql.types import *
from pyspark.sql.functions import expr, col, desc, lower, instr

In [90]:
google_play_df = spark.read.csv('../data/googleplaystore.csv', header=True, inferSchema=True, nullValue='', sep=',')

In [91]:
schema = StructType( [
    StructField('App', StringType()),
    StructField('Translated Review', StringType()),
    StructField('Sentiment', StringType() ),
    StructField('Sentiment_Polarity', DoubleType()),
    StructField('Sentiment_Subjectivity', DoubleType())
] )
app_reviews_df = spark.read.csv('../data/googleplaystore_user_reviews.csv', sep=',', schema=schema, header=True)

In [92]:
google_play_df = google_play_df.withColumn('Price', expr('double(replace(Price,"\$",""))'))
google_play_df = google_play_df.withColumn('Installs', expr('double(replace(replace(Installs, "+", ""), ",", ""))') )
google_play_df=google_play_df.withColumn('Size', expr('double(replace(replace(Size,"M","000"),"K",""))'))
google_play_df=google_play_df.drop('Android Ver')
google_play_df=google_play_df.drop('Genres')
google_play_df=google_play_df.drop('Last Updated')
google_play_df=google_play_df.drop('Current Ver')

Ejercicio1. Extraer nombre y categoria de todas aquellas aplicaciones con media valoraciones mayor o igual 4.2.
Para ello se crea un dataframe (filtro) que contenga las valoraciones mayor o igual a 4.2, se imprime y luego se usa ese dataframe para crear otro (ejer1) para extraer solo los campos App y Category

In [93]:
filtro = google_play_df.filter( expr('Rating>4.2 OR Rating=4.2') )
filtro.show(5)
ejer1 = filtro.select('App', 'Category')
ejer1.show(5)

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|    8.7|5000000.0|Free|  0.0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|25000.0|    5.0E7|Free|  0.0|          Teen|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|    2.8| 100000.0|Free|  0.0|      Everyone|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    5.6|  50000.0|Free|  0.0|      Everyone|
|Garden Coloring Book|ART_AND_DESIGN|   4.4|  13791|33000.0|1000000.0|Free|  0.0|      Everyone|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
only showing top 5 rows

+--------------------+--------------+
|                 App|      Category|
+--------------------+----

Ejercicio2. Contar cuantas aplicaciones ocupan mas de 10 MB (aproximadamente 1000 Kilobytes).

In [94]:
ejer2 = google_play_df.filter( expr('Size>10000') )
ejer2.show(5)
print(ejer2.count())

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19000.0|  10000.0|Free|  0.0|      Everyone|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|14000.0| 500000.0|Free|  0.0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|25000.0|    5.0E7|Free|  0.0|          Teen|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|19000.0|  50000.0|Free|  0.0|      Everyone|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815|29000.0|1000000.0|Free|  0.0|      Everyone|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
only showing top 5 rows

5085


Ejercicio3. Dataframe con la valoracion media para cada Type, Categoria y para Content Rating, eliminando todos los valores null de las columnas Type y Category

In [95]:
eliminar = google_play_df.dropna('any', subset=['Type', 'Category'])

In [96]:
eliminar.groupBy(col('Type')).agg(expr('mean(Rating)') ).show(5)
eliminar.groupBy(col('Category')).agg(expr('mean(Rating)') ).show(5)
eliminar.groupBy(col('Content Rating')).agg(expr('mean(Rating)') ).show(5)
eliminar.groupBy(col('Category'), col('Type'), col("Content Rating")).agg(expr('mean(Rating)')).show(5)

+----+-----------------+
|Type|     mean(Rating)|
+----+-----------------+
|Free|4.186202546163562|
|Paid|4.266615146831529|
+----+-----------------+

+-------------+-----------------+
|     Category|     mean(Rating)|
+-------------+-----------------+
|       EVENTS|4.435555555555557|
|       COMICS|4.155172413793104|
|       SPORTS|4.223510971786835|
|      WEATHER|4.243999999999999|
|VIDEO_PLAYERS|4.063750000000001|
+-------------+-----------------+
only showing top 5 rows

+--------------+-----------------+
|Content Rating|     mean(Rating)|
+--------------+-----------------+
|       Unrated|              4.1|
|          Teen|4.233487084870853|
|    Mature 17+|4.123427331887204|
|  Everyone 10+|4.257178841309818|
|      Everyone|4.186374663072782|
+--------------+-----------------+
only showing top 5 rows

+-------------------+----+--------------+-----------------+
|           Category|Type|Content Rating|     mean(Rating)|
+-------------------+----+--------------+-----------------

Ejercicio4. Tabla de contingencia entre Content Rating y Type, eliminando todos los valores null de Type.

In [97]:
eliminar5 = google_play_df.dropna('any', subset=['Type'])
eliminar5.crosstab('Category', 'Type').show(n=100)


+-------------------+----+----+
|      Category_Type|Free|Paid|
+-------------------+----+----+
|BOOKS_AND_REFERENCE| 203|  28|
|               null|   1|   0|
|          EDUCATION| 152|   4|
|MAPS_AND_NAVIGATION| 132|   5|
|           SHOPPING| 258|   2|
|   TRAVEL_AND_LOCAL| 246|  12|
|     HOUSE_AND_HOME|  88|   0|
|        PHOTOGRAPHY| 313|  22|
|               GAME|1061|  83|
|      VIDEO_PLAYERS| 171|   4|
|              TOOLS| 765|  78|
| NEWS_AND_MAGAZINES| 281|   2|
|            FINANCE| 349|  17|
|             SOCIAL| 292|   3|
|            WEATHER|  74|   8|
|           BUSINESS| 446|  14|
|    PERSONALIZATION| 309|  83|
|          PARENTING|  58|   2|
|     ART_AND_DESIGN|  62|   3|
|     FOOD_AND_DRINK| 125|   2|
|             COMICS|  60|   0|
|      COMMUNICATION| 360|  27|
|  AUTO_AND_VEHICLES|  82|   3|
|             BEAUTY|  53|   0|
|             EVENTS|  63|   1|
|       PRODUCTIVITY| 396|  28|
|             SPORTS| 360|  24|
|            MEDICAL| 354| 109|
|       

Ejercicio 5.Calculamos la media y se guarda. Da error a la hora de ponerla en el filtro el nombre de media_5.

In [98]:
from pyspark.sql.functions import mean, stddev_samp, var_samp
media_5=google_play_df.select( mean(google_play_df.Rating).alias('mean'))
media_5.show()
ejer5 = google_play_df.filter( expr('Rating>4.1915') )
ejer5.show(5)

+------------------+
|              mean|
+------------------+
|4.1915127575531175|
+------------------+

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|    8.7|5000000.0|Free|  0.0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|25000.0|    5.0E7|Free|  0.0|          Teen|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|    2.8| 100000.0|Free|  0.0|      Everyone|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    5.6|  50000.0|Free|  0.0|      Everyone|
|Garden Coloring Book|ART_AND_DESIGN|   4.4|  13791|33000.0|1000000.0|Free|  0.0|      Everyone|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
only showing top 5 ro

Ejercicio6. Devuelve el nombre y categoría de las aplicaciones con más de 300 valoraciones que tengan una polaridad con un grado de positividad igual a 1.

In [99]:
union= app_reviews_df.join( google_play_df, app_reviews_df.App == google_play_df.App, how='inner' )
ejer6= union.filter(expr(("Reviews>500 AND Sentiment_Polarity=1")))
ejer6.show(5)


+--------------------+--------------------+---------+------------------+----------------------+--------------------+------------------+------+-------+----+--------+----+-----+--------------+
|                 App|   Translated Review|Sentiment|Sentiment_Polarity|Sentiment_Subjectivity|                 App|          Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|
+--------------------+--------------------+---------+------------------+----------------------+--------------------+------------------+------+-------+----+--------+----+-----+--------------+
|10 Best Foods for...|I like eat delici...| Positive|               1.0|    0.5333333333333333|10 Best Foods for...|HEALTH_AND_FITNESS|   4.0|   2490| 3.8|500000.0|Free|  0.0|  Everyone 10+|
|10 Best Foods for...|I like eat delici...| Positive|               1.0|    0.5333333333333333|10 Best Foods for...|HEALTH_AND_FITNESS|   4.0|   2490| 3.8|500000.0|Free|  0.0|  Everyone 10+|
|10 Best Foods for...|        Best idea us| P

Ejer7. Obtén el nombre único de todas las aplicaciones que contiene la subcadena great en alguna de sus opiniones que han sido traducidas al inglés. Para ello, emplea la función lower y instr para ese propósito (https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#modulepyspark.sql.functions)

8. Realiza un análisis de correlación entre Rating y Size. Se carga de la libreria pyspark.sql.functions la funcion corr y se selecciona esos campos.

In [100]:
from pyspark.sql.functions import corr
google_play_df.select( corr(google_play_df.Rating, google_play_df.Size) ).show()

+-------------------+
| corr(Rating, Size)|
+-------------------+
|0.07568995671515161|
+-------------------+



9. Obtén la polaridad de sentimiento media, y su desviación típica para cada posible valor de Content Rating.

In [101]:
from pyspark.sql.functions import mean, stddev_pop
union.groupBy(col('Content Rating')).agg(expr('mean(Sentiment_Polarity)'), expr('stddev_pop(Sentiment_Polarity)') ).show()

+---------------+------------------------+------------------------------+
| Content Rating|mean(Sentiment_Polarity)|stddev_pop(Sentiment_Polarity)|
+---------------+------------------------+------------------------------+
|           Teen|     0.12102082880135945|            0.3098101363014476|
|     Mature 17+|     0.16858084745627555|           0.36785313461677044|
|   Everyone 10+|     0.13200213901805052|           0.31093905548720285|
|       Everyone|     0.16477698031357257|            0.3345921181710349|
|Adults only 18+|     0.31233355379188715|            0.3301901398775387|
+---------------+------------------------+------------------------------+



10. Registra el Dataframe google_play_df como la tabla temporal GooglePlay.

In [102]:
google_play_df.createOrReplaceTempView('GooglePlay')


11. Registra el Dataframe app_reviews_df como la tabla temporal Reviews.

In [103]:
app_reviews_df.createOrReplaceTempView('Reviews')

12. Repite la consulta 1 con SQL.

In [104]:
ejer12=spark.sql('SELECT App, Category FROM GooglePlay WHERE Rating>4.2 OR Rating=4.2')
ejer12.show()

+--------------------+--------------+
|                 App|      Category|
+--------------------+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|
|Sketch - Draw & P...|ART_AND_DESIGN|
|Pixel Draw - Numb...|ART_AND_DESIGN|
|Paper flowers ins...|ART_AND_DESIGN|
|Garden Coloring Book|ART_AND_DESIGN|
|Kids Paint Free -...|ART_AND_DESIGN|
|Text on Photo - F...|ART_AND_DESIGN|
|Name Art Photo Ed...|ART_AND_DESIGN|
|Tattoo Name On My...|ART_AND_DESIGN|
|Mandala Coloring ...|ART_AND_DESIGN|
|3D Color Pixel by...|ART_AND_DESIGN|
|Photo Designer - ...|ART_AND_DESIGN|
|350 Diy Room Deco...|ART_AND_DESIGN|
|FlipaClip - Carto...|ART_AND_DESIGN|
|        ibis Paint X|ART_AND_DESIGN|
|Superheroes Wallp...|ART_AND_DESIGN|
|HD Mickey Minnie ...|ART_AND_DESIGN|
|Harley Quinn wall...|ART_AND_DESIGN|
|Colorfit - Drawin...|ART_AND_DESIGN|
|Pink Silver Bow K...|ART_AND_DESIGN|
+--------------------+--------------+
only showing top 20 rows



13. Repite la consulta 3 con SQL. Se eligen los campos sin valores vacios "". Me da error al cargar Content Rating, me imagino que es por el espacio...No me sale la media


In [105]:
ejer13=spark.sql('SELECT Type, Category, "Content Rating" FROM GooglePlay WHERE Type<>"" OR Category<>""')
ejer13.show()
ejer13.groupBy('Category').agg(expr('mean(Category)')).show()


+----+--------------+--------------+
|Type|      Category|Content Rating|
+----+--------------+--------------+
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
|Free|ART_AND_DESIGN|Content Rating|
+----+--------------+--------------+
only showing top 20 rows

+-------------------+--------------+
|           Category|mean(Category)|
+-----------

14. Repite la consulta 9 con SQL.