In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import expr, col, desc, lower, instr

In [2]:
google_play_df = spark.read.csv('../data/googleplaystore.csv', header=True, inferSchema=True, nullValue='', sep=',')

In [3]:
schema = StructType( [
    StructField('App', StringType()),
    StructField('Translated Review', StringType()),
    StructField('Sentiment', StringType() ),
    StructField('Sentiment_Polarity', DoubleType()),
    StructField('Sentiment_Subjectivity', DoubleType())
] )
app_reviews_df = spark.read.csv('../data/googleplaystore_user_reviews.csv', sep=',', schema=schema, header=True)

In [4]:
google_play_df = google_play_df.withColumn('Price', expr('double(replace(Price,"\$",""))'))
google_play_df = google_play_df.withColumn('Installs', expr('double(replace(replace(Installs, "+", ""), ",", ""))') )
google_play_df=google_play_df.withColumn('Size', expr('double(replace(replace(Size,"M","000"),"K",""))')) # Nota, sin en vez de K fuese k se evitarían valores nulos.
google_play_df=google_play_df.drop('Android Ver')
google_play_df=google_play_df.drop('Genres')
google_play_df=google_play_df.drop('Last Updated')
google_play_df=google_play_df.drop('Current Ver')

# 1. Extrae en un Dataframe el nombre y categoría de todas aquellas aplicaciones que tienen una media de valoraciones mayor o igual 4.2

In [5]:
aplicaciones_muy_valoradas_df = google_play_df.select( col('App').alias('Nombre'), col('Category').alias('Categoría'), col('Rating').alias('Valoracion')).filter(expr('Valoracion >= 4.2')) 
recuento = aplicaciones_muy_valoradas_df.count()
aplicaciones_muy_valoradas_df.show(recuento)
recuento

+----------------------------------+-------------------+----------+
|                            Nombre|          Categoría|Valoracion|
+----------------------------------+-------------------+----------+
|              U Launcher Lite –...|     ART_AND_DESIGN|       4.7|
|              Sketch - Draw & P...|     ART_AND_DESIGN|       4.5|
|              Pixel Draw - Numb...|     ART_AND_DESIGN|       4.3|
|              Paper flowers ins...|     ART_AND_DESIGN|       4.4|
|              Garden Coloring Book|     ART_AND_DESIGN|       4.4|
|              Kids Paint Free -...|     ART_AND_DESIGN|       4.7|
|              Text on Photo - F...|     ART_AND_DESIGN|       4.4|
|              Name Art Photo Ed...|     ART_AND_DESIGN|       4.4|
|              Tattoo Name On My...|     ART_AND_DESIGN|       4.2|
|              Mandala Coloring ...|     ART_AND_DESIGN|       4.6|
|              3D Color Pixel by...|     ART_AND_DESIGN|       4.4|
|              Photo Designer - ...|     ART_AND

6092

# 2. Cuenta cuántas aplicaciones ocupan más de 10 Megabytes (1 Megabyte es aproximadamente 1000 Kilobytes)

In [6]:
google_play_df.filter(col('Size') > 10000).count()

5085

# 3. Obtén un Dataframe con la valoración media para cada Type, para cada Category y para cada Content Rating, eliminando todos los valores null de las columnas Type y Category.

In [7]:
google_play_medias_df = google_play_df.dropna('any', subset=['Type', 'Category', 'Content Rating']).groupBy(col('Type'), col('Category'), col('Content Rating')).agg(expr('mean(Rating)')).orderBy('Type', 'Category', 'Content Rating')
recuento = google_play_medias_df.count()
google_play_medias_df.show(recuento)
recuento

+----+-------------------+---------------+------------------+
|Type|           Category| Content Rating|      mean(Rating)|
+----+-------------------+---------------+------------------+
|Free|     ART_AND_DESIGN|       Everyone| 4.325454545454545|
|Free|     ART_AND_DESIGN|   Everyone 10+|               4.7|
|Free|     ART_AND_DESIGN|           Teen| 4.466666666666666|
|Free|  AUTO_AND_VEHICLES|       Everyone| 4.182857142857143|
|Free|  AUTO_AND_VEHICLES|   Everyone 10+|               4.3|
|Free|  AUTO_AND_VEHICLES|           Teen|               4.2|
|Free|             BEAUTY|       Everyone| 4.287179487179485|
|Free|             BEAUTY|   Everyone 10+|              null|
|Free|             BEAUTY|     Mature 17+|               4.5|
|Free|             BEAUTY|           Teen|               4.0|
|Free|BOOKS_AND_REFERENCE|       Everyone| 4.355633802816902|
|Free|BOOKS_AND_REFERENCE|   Everyone 10+| 4.459999999999999|
|Free|BOOKS_AND_REFERENCE|     Mature 17+| 4.166666666666667|
|Free|BO

168

# 4. Crea una tabla de contingencia entre Content Rating y Type, eliminando los valores nulos de Type.

In [8]:
google_play_df.dropna('any', subset=['Type']).crosstab('Content Rating', 'Type').orderBy("Content Rating_Type").show()

+-------------------+----+----+
|Content Rating_Type|Free|Paid|
+-------------------+----+----+
|    Adults only 18+|   3|   0|
|           Everyone|8020| 695|
|       Everyone 10+| 380|  33|
|         Mature 17+| 479|  20|
|               Teen|1156|  52|
|            Unrated|   2|   0|
+-------------------+----+----+



# 5. Crea un Dataframe con el nombre de todas aquellas aplicaciones con una valoración por encima de la media de su categoría.

In [9]:
valoraciones_medias_df = google_play_df.groupBy(col('Category')).agg(expr('mean(Rating) as MediaDeCategoria'))
google_play_con_media_df = google_play_df.join(valoraciones_medias_df, google_play_df.Category == valoraciones_medias_df.Category, how='left').filter(expr('Rating >= MediaDeCategoria'))
recuento = google_play_con_media_df.count()
google_play_con_media_df.show(recuento)
#google_play_con_media_df.drop(valoraciones_medias_df.MediaDeCategoria).show(recuento) # Sin la columna de comprobación
#google_play_con_media_df.select(col('App')).dropDuplicates(subset=['App']) .sort('App').show(recuento) # Por si sólo se quieren los nombres

+----------------------------------+-------------------+------+--------+--------+---------+----+------+---------------+-------------------+------------------+
|                               App|           Category|Rating| Reviews|    Size| Installs|Type| Price| Content Rating|           Category|  MediaDeCategoria|
+----------------------------------+-------------------+------+--------+--------+---------+----+------+---------------+-------------------+------------------+
|              U Launcher Lite –...|     ART_AND_DESIGN|   4.7|   87510|     8.7|5000000.0|Free|   0.0|       Everyone|     ART_AND_DESIGN| 4.358064516129031|
|              Sketch - Draw & P...|     ART_AND_DESIGN|   4.5|  215644| 25000.0|    5.0E7|Free|   0.0|           Teen|     ART_AND_DESIGN| 4.358064516129031|
|              Paper flowers ins...|     ART_AND_DESIGN|   4.4|     167|     5.6|  50000.0|Free|   0.0|       Everyone|     ART_AND_DESIGN| 4.358064516129031|
|              Garden Coloring Book|     ART_A

# 6. Devuelve el nombre y categoría de las aplicaciones con más de 300 valoraciones que tengan una polaridad con un grado de positividad igual a 1.

In [10]:
aplicaciones_muy_apreciadas_df = google_play_df.join(app_reviews_df, (google_play_df.App == app_reviews_df.App) & (google_play_df.Reviews > 300)& (app_reviews_df.Sentiment_Polarity == 1)).drop(app_reviews_df.App).sort('Category','App')
recuento = aplicaciones_muy_apreciadas_df.select(col('App'), col('Category')).dropDuplicates(subset=['App', 'Category']).count()
aplicaciones_muy_apreciadas_df.select(col('App'), col('Category')).dropDuplicates(subset=['App', 'Category']).withColumnRenamed('App','Nombre').show(recuento) # Los muestro todos

+--------------------+-------------------+
|              Nombre|           Category|
+--------------------+-------------------+
|Anime Manga Color...|     ART_AND_DESIGN|
|Boys Photo Editor...|     ART_AND_DESIGN|
| Coloring book moana|     ART_AND_DESIGN|
|  Floor Plan Creator|     ART_AND_DESIGN|
|Garden Coloring Book|     ART_AND_DESIGN|
|AutoScout24 Switz...|  AUTO_AND_VEHICLES|
|     BEST CAR SOUNDS|  AUTO_AND_VEHICLES|
|DMV Permit Practi...|  AUTO_AND_VEHICLES|
|Fuelio: Gas log &...|  AUTO_AND_VEHICLES|
|BestCam Selfie-se...|             BEAUTY|
|Dresses Ideas & F...|             BEAUTY|
|  Filters for B Live|             BEAUTY|
|  Filters for Selfie|             BEAUTY|
|Hairstyles step b...|             BEAUTY|
|AlReader -any tex...|BOOKS_AND_REFERENCE|
|  All Maths Formulas|BOOKS_AND_REFERENCE|
|            Ancestry|BOOKS_AND_REFERENCE|
|               Bible|BOOKS_AND_REFERENCE|
|        Ebook Reader|BOOKS_AND_REFERENCE|
|English-Myanmar D...|BOOKS_AND_REFERENCE|
|FBReader: 

# 7. Obtén el nombre único de todas las aplicaciones que contiene la subcadena great en alguna de sus opiniones que han sido traducidas al inglés. 
Para ello, emplea la función lower y instr para ese propósito (https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#modulepyspark.sql.functions)

In [11]:
google_play_con_revisiones_df = google_play_df.join(app_reviews_df, (google_play_df.App == app_reviews_df.App)).drop(app_reviews_df.App)
opiniones_great_df = google_play_con_revisiones_df.filter(instr(lower(col('Translated Review')),'great')>0)
recuento = opiniones_great_df.select('App').dropDuplicates(subset=['App']).sort('App').count()
opiniones_great_df.select('App').dropDuplicates(subset=['App']).sort('App').show(recuento)

+--------------------+
|                 App|
+--------------------+
|10 Best Foods for...|
|1800 Contacts - L...|
|1LINE – One Line ...|
|2018Emoji Keyboar...|
|21-Day Meditation...|
|2Date Dating App,...|
|2GIS: directory &...|
|           2RedBeans|
|2ndLine - Second ...|
|30 Day Fitness Ch...|
|365Scores - Live ...|
|          4 in a Row|
|4K Wallpapers and...|
|7 Cups: Anxiety &...|
|7 Day Food Journa...|
|    7 Minute Workout|
|7 Weeks - Habit &...|
|         8 Ball Pool|
|850 Sports News D...|
|8fit Workouts & M...|
|A Call From Santa...|
|        A Word A Day|
|A&E - Watch Full ...|
|A+ Gallery - Phot...|
|           A+ Mobile|
|ABC Kids - Tracin...|
|ABC News - US & W...|
|  ABC Preschool Free|
|        ABCmouse.com|
|AC - Tips & News ...|
|           ACE Elite|
|AP Mobile - Break...|
|APE Weather ( Liv...|
|       ARY NEWS URDU|
|                ASOS|
|AT&T Navigator: M...|
|AVG Cleaner – Spe...|
|Abs Training-Burn...|
|Accounting App - ...|
|AccuWeather: Dail...|
|Acorn TV: 

# 8. Realiza un análisis de correlación entre Rating y Size.

In [12]:
coeficiente= google_play_df.select(expr('corr(Rating, Size)')).head()[0]
if(coeficiente > 0):
    print('La correlación es positiva.')
else:
    print('La correlacion es negativa.')

if(abs(coeficiente)<0.3): # Por poner un límite. Fuente: https://ice.unizar.es/sites/ice.unizar.es/files/users/leteo/materiales/01._documento_1_correlaciones.pdf
    print('Pero su valor es muy bajo: ', coeficiente,'. Casi despreciable. Se puede decir que no hay correlación.', sep='')
else:
    print('Tiene un valor de ', coeficiente)

La correlación es positiva.
Pero su valor es muy bajo: 0.07568995671515161. Casi despreciable. Se puede decir que no hay correlación.


# 9. Obtén la polaridad de sentimiento media, y su desviación típica para cada posible valor de Content Rating.

In [13]:
google_play_con_revisiones_df.groupBy(col('Content Rating')).agg(expr('mean(Sentiment_Polarity)'), expr('stddev_samp(Sentiment_Polarity)')).show()

+---------------+------------------------+-------------------------------+
| Content Rating|mean(Sentiment_Polarity)|stddev_samp(Sentiment_Polarity)|
+---------------+------------------------+-------------------------------+
|           Teen|     0.12102082880135905|            0.30982467421170223|
|     Mature 17+|      0.1685808474562754|             0.3678941965284481|
|   Everyone 10+|     0.13200213901805047|             0.3109907022038593|
|       Everyone|      0.1647769803135702|            0.33459519837815016|
|Adults only 18+|     0.31233355379188715|              0.334873921904287|
+---------------+------------------------+-------------------------------+



# 10. Registra el Dataframe google_play_df como la tabla temporal GooglePlay.

In [14]:
spark.sql('USE pydb')
google_play_df.withColumnRenamed('Content Rating', 'Content_Rating').createOrReplaceTempView('GooglePlay')
spark.sql('SELECT * FROM GooglePlay').show(3)

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content_Rating|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19000.0|  10000.0|Free|  0.0|      Everyone|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|14000.0| 500000.0|Free|  0.0|      Everyone|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|    8.7|5000000.0|Free|  0.0|      Everyone|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
only showing top 3 rows



# 11. Registra el Dataframe app_reviews_df como la tabla temporal Reviews.

In [15]:
app_reviews_df.withColumnRenamed('Translated Review', 'Translated_Review').createOrReplaceTempView('Reviews')
spark.sql('SELECT * FROM Reviews').show(3)

+--------------------+--------------------+---------+------------------+----------------------+
|                 App|   Translated_Review|Sentiment|Sentiment_Polarity|Sentiment_Subjectivity|
+--------------------+--------------------+---------+------------------+----------------------+
|10 Best Foods for...|I like eat delici...| Positive|               1.0|    0.5333333333333333|
|10 Best Foods for...|This help eating ...| Positive|              0.25|   0.28846153846153844|
|10 Best Foods for...|                null|     null|              null|                  null|
+--------------------+--------------------+---------+------------------+----------------------+
only showing top 3 rows



# 12. Repite la consulta 1 con SQL. 
(Extrae en un Dataframe el nombre y categoría de todas aquellas aplicaciones que tienen una media de valoraciones mayor o igual 4.2)

In [16]:
aplicaciones_muy_valoradas_sql_df = spark.sql('SELECT App as Nombre, Category as Categoria, Rating as Valoracion FROM GooglePlay WHERE Rating >= 4.2')
recuento = aplicaciones_muy_valoradas_sql_df.count()
aplicaciones_muy_valoradas_sql_df.show(recuento)
recuento # Como control para ver que devuelve el mismo número de resultados

+----------------------------------+-------------------+----------+
|                            Nombre|          Categoria|Valoracion|
+----------------------------------+-------------------+----------+
|              U Launcher Lite –...|     ART_AND_DESIGN|       4.7|
|              Sketch - Draw & P...|     ART_AND_DESIGN|       4.5|
|              Pixel Draw - Numb...|     ART_AND_DESIGN|       4.3|
|              Paper flowers ins...|     ART_AND_DESIGN|       4.4|
|              Garden Coloring Book|     ART_AND_DESIGN|       4.4|
|              Kids Paint Free -...|     ART_AND_DESIGN|       4.7|
|              Text on Photo - F...|     ART_AND_DESIGN|       4.4|
|              Name Art Photo Ed...|     ART_AND_DESIGN|       4.4|
|              Tattoo Name On My...|     ART_AND_DESIGN|       4.2|
|              Mandala Coloring ...|     ART_AND_DESIGN|       4.6|
|              3D Color Pixel by...|     ART_AND_DESIGN|       4.4|
|              Photo Designer - ...|     ART_AND

6092

# 13. Repite la consulta 3 con SQL. 
(Obtén un Dataframe con la valoración media para cada Type, para cada Category y para cada Content Rating, eliminando todos los valores null de las columnas Type y Category.)

In [17]:
google_play_sql_df = spark.sql('SELECT Type, Category, Content_Rating, Rating FROM GooglePlay')
google_play_medias_sql_df = google_play_sql_df.dropna('any', subset=['Type', 'Category', 'Content_Rating']).groupBy(col('Type'), col('Category'), col('Content_Rating')).agg(expr('mean(Rating)')).orderBy('Type', 'Category', 'Content_Rating')
recuento = google_play_medias_sql_df.count()
google_play_medias_sql_df.show(recuento)
recuento

+----+-------------------+---------------+------------------+
|Type|           Category| Content_Rating|      mean(Rating)|
+----+-------------------+---------------+------------------+
|Free|     ART_AND_DESIGN|       Everyone| 4.325454545454545|
|Free|     ART_AND_DESIGN|   Everyone 10+|               4.7|
|Free|     ART_AND_DESIGN|           Teen| 4.466666666666666|
|Free|  AUTO_AND_VEHICLES|       Everyone| 4.182857142857143|
|Free|  AUTO_AND_VEHICLES|   Everyone 10+|               4.3|
|Free|  AUTO_AND_VEHICLES|           Teen|               4.2|
|Free|             BEAUTY|       Everyone| 4.287179487179485|
|Free|             BEAUTY|   Everyone 10+|              null|
|Free|             BEAUTY|     Mature 17+|               4.5|
|Free|             BEAUTY|           Teen|               4.0|
|Free|BOOKS_AND_REFERENCE|       Everyone| 4.355633802816902|
|Free|BOOKS_AND_REFERENCE|   Everyone 10+| 4.459999999999999|
|Free|BOOKS_AND_REFERENCE|     Mature 17+| 4.166666666666667|
|Free|BO

168

# 14. Repite la consulta 9 con SQL.
(Obtén la polaridad de sentimiento media, y su desviación típica para cada posible valor de Content Rating.)

In [18]:
google_play_con_revisiones_sql_df = spark.sql('SELECT g.Content_Rating, r.Sentiment_Polarity FROM GooglePlay AS g INNER JOIN Reviews AS r ON g.App = r.App')
google_play_con_revisiones_df.groupBy(col('Content Rating')).agg(expr('mean(Sentiment_Polarity)'), expr('stddev_samp(Sentiment_Polarity)')).show()


+---------------+------------------------+-------------------------------+
| Content Rating|mean(Sentiment_Polarity)|stddev_samp(Sentiment_Polarity)|
+---------------+------------------------+-------------------------------+
|           Teen|     0.12102082880135905|            0.30982467421170223|
|     Mature 17+|      0.1685808474562754|             0.3678941965284481|
|   Everyone 10+|     0.13200213901805047|             0.3109907022038593|
|       Everyone|      0.1647769803135702|            0.33459519837815016|
|Adults only 18+|     0.31233355379188715|              0.334873921904287|
+---------------+------------------------+-------------------------------+

