In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import expr, col, desc, lower, instr

In [3]:
google_play_df = spark.read.csv('../data/googleplaystore.csv', header=True, inferSchema=True, nullValue='', sep=',')

In [4]:
schema = StructType( [
    StructField('App', StringType()),
    StructField('Translated Review', StringType()),
    StructField('Sentiment', StringType() ),
    StructField('Sentiment_Polarity', DoubleType()),
    StructField('Sentiment_Subjectivity', DoubleType())
] )
app_reviews_df = spark.read.csv('../data/googleplaystore_user_reviews.csv', sep=',', schema=schema, header=True)

In [5]:
google_play_df = google_play_df.withColumn('Price', expr('double(replace(Price,"\$",""))'))
google_play_df = google_play_df.withColumn('Installs', expr('double(replace(replace(Installs, "+", ""), ",", ""))') )
google_play_df=google_play_df.withColumn('Size', expr('double(replace(replace(Size,"M","000"),"K",""))'))
google_play_df=google_play_df.drop('Android Ver')
google_play_df=google_play_df.drop('Genres')
google_play_df=google_play_df.drop('Last Updated')
google_play_df=google_play_df.drop('Current Ver')

### Consulta 1

In [16]:
print("Una selección de aplicaciones junto a su categoría que cumplen tener un rating mayor o igual a 4.2:")
google_play_df.filter(expr('Rating>=4.2')).select('App', 'Category').show()

Una selección de aplicaciones junto a su categoría que cumplen tener un rating mayor o igual a 4.2:
+--------------------+--------------+
|                 App|      Category|
+--------------------+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|
|Sketch - Draw & P...|ART_AND_DESIGN|
|Pixel Draw - Numb...|ART_AND_DESIGN|
|Paper flowers ins...|ART_AND_DESIGN|
|Garden Coloring Book|ART_AND_DESIGN|
|Kids Paint Free -...|ART_AND_DESIGN|
|Text on Photo - F...|ART_AND_DESIGN|
|Name Art Photo Ed...|ART_AND_DESIGN|
|Tattoo Name On My...|ART_AND_DESIGN|
|Mandala Coloring ...|ART_AND_DESIGN|
|3D Color Pixel by...|ART_AND_DESIGN|
|Photo Designer - ...|ART_AND_DESIGN|
|350 Diy Room Deco...|ART_AND_DESIGN|
|FlipaClip - Carto...|ART_AND_DESIGN|
|        ibis Paint X|ART_AND_DESIGN|
|Superheroes Wallp...|ART_AND_DESIGN|
|HD Mickey Minnie ...|ART_AND_DESIGN|
|Harley Quinn wall...|ART_AND_DESIGN|
|Colorfit - Drawin...|ART_AND_DESIGN|
|Pink Silver Bow K...|ART_AND_DESIGN|
+--------------------+----

### Consulta 2

In [15]:
print("El número de aplicaciones que ocupan más de 10Mb (10000kb) es:")
google_play_df.filter(expr('Size>10000')).count()

El número de aplicaciones que ocupan más de 10Mb (10000kb) es:


5085

### Consulta 3

In [25]:
print("Obtenemos la valoración por Type: ")
google_play_df.dropna(subset=['Type']).groupBy(col('Type')).agg(expr('mean(Rating)')).show()
# guardamos el resultado en un nuevo df para usarlo más adelante
category_mean_rating = google_play_df.dropna(subset=['Category']).groupBy(col('Category')).agg(expr('mean(Rating)'))
print("Obtenemos la valoración por Category: ")
category_mean_rating.show()
print("Obtenemos la valoración por Content Rating: ")
google_play_df.groupBy(col('Content Rating')).agg(expr('mean(Rating)')).show()

Obtenemos la valoración por Type: 
+----+-----------------+
|Type|     mean(Rating)|
+----+-----------------+
|Free|4.185940366972488|
|Paid|4.266615146831529|
+----+-----------------+

Obtenemos la valoración por Category: 
+-------------------+------------------+
|           Category|      mean(Rating)|
+-------------------+------------------+
|             EVENTS| 4.435555555555557|
|             COMICS| 4.155172413793104|
|             SPORTS| 4.223510971786835|
|            WEATHER| 4.243999999999999|
|      VIDEO_PLAYERS| 4.063750000000001|
|  AUTO_AND_VEHICLES|  4.19041095890411|
|          PARENTING| 4.300000000000001|
|      ENTERTAINMENT| 4.126174496644294|
|    PERSONALIZATION| 4.335987261146501|
| HEALTH_AND_FITNESS|4.2771043771043775|
|   TRAVEL_AND_LOCAL|  4.10929203539823|
|BOOKS_AND_REFERENCE| 4.346067415730338|
|     FOOD_AND_DRINK|4.1669724770642205|
|        PHOTOGRAPHY| 4.192113564668767|
|           BUSINESS| 4.121452145214522|
|             FAMILY| 4.1922724670864

### Consulta 4

In [22]:
print("La tabla de contingencia entre las columnas Type y Content Rating es: ")
google_play_df.dropna(subset=['Type']).crosstab('Content Rating', 'Type').show()

La tabla de contingencia entre las columnas Type y Content Rating es: 
+-------------------+----+----+
|Content Rating_Type|Free|Paid|
+-------------------+----+----+
|            Unrated|   2|   0|
|         Mature 17+| 479|  20|
|       Everyone 10+| 380|  33|
|    Adults only 18+|   3|   0|
|           Everyone|8020| 695|
|               Teen|1156|  52|
+-------------------+----+----+



### Consulta 5

In [42]:
# Utilizamos el df category_mean_rating que hemos creado previamente
print("Una muestra de las aplicaciones que superan su valoración por encima de la media de su categoría son:")
app_up_mean = google_play_df.join(category_mean_rating, on=['Category']) \
                            .filter(col('Rating')>col('mean(Rating)')) \
                            .select('App').distinct()
app_up_mean.show(truncate=False)
print("El número de aplicaciones que superan su valoración por encima de la media de su categoría es:")
app_up_mean.count()

Una muestra de las aplicaciones que superan su valoración por encima de la media de su categoría son:
+------------------------------------------+
|App                                       |
+------------------------------------------+
|Google Chrome: Fast & Secure              |
|free video calls and chat                 |
|Toddler Learning Games - Little Kids Games|
|MyChart                                   |
|Davis's Drug Guide for Nurses             |
|Mercari: The Selling App                  |
|Nigeria News NAIJ.com                     |
|Basketball Stars                          |
|C Examples                                |
|Al-Quran Al-Muallim                       |
|THE KING OF FIGHTERS-A 2012(F)            |
|A-Z Punjabi Songs & Music Videos 2018     |
|BD Hospital's                             |
|Photo Editor - BPhoto                     |
|Volvo CE Insider                          |
|Casa CF                                   |
|Chrome Canary (Unstable)                  

4902

In [58]:
# Observamos que hay duplicados, por eso justificamos el distinct de la consulta anterior
google_play_df.join(category_mean_rating, on=['Category']) \
              .filter(col('Rating')>col('mean(Rating)')).groupBy(['App', 'Category']).count().orderBy('App').filter(col('count')>=3).show(10, False)

+-------------------------------------------------+------------------+-----+
|App                                              |Category          |count|
+-------------------------------------------------+------------------+-----+
|8 Ball Pool                                      |GAME              |6    |
|Adobe Acrobat Reader                             |PRODUCTIVITY      |3    |
|AliExpress - Smarter Shopping, Better Living     |SHOPPING          |4    |
|Amazon Shopping                                  |SHOPPING          |3    |
|Angry Birds Classic                              |GAME              |5    |
|Angry Birds Rio                                  |GAME              |4    |
|Any.do: To-do list, Calendar, Reminders & Planner|PRODUCTIVITY      |3    |
|B612 - Beauty & Filter Camera                    |PHOTOGRAPHY       |3    |
|BBC News                                         |NEWS_AND_MAGAZINES|3    |
|BBW Dating & Plus Size Chat                      |DATING            |3    |

### Consulta 6

In [81]:
print("Una muestra de aplicaciones con más de 300 reviews y una polaridad de sentimiento igual a 1 (positiva): ")
google_play_df.filter(col('Reviews')>300).select('App','Category').distinct() \
    .join(app_reviews_df.select('App', 'Sentiment_Polarity') \
                        .filter(col('Sentiment_Polarity') == 1) \
                        .distinct(),
          on='App') \
    .drop('Sentiment_Polarity') \
    .show()

Una muestra de aplicaciones con más de 300 reviews y una polaridad de sentimiento igual a 1 (positiva): 
+--------------------+-------------------+
|                 App|           Category|
+--------------------+-------------------+
|Calm - Meditate, ...| HEALTH_AND_FITNESS|
|GMAT Math Flashcards|          EDUCATION|
|             Hotstar|      ENTERTAINMENT|
|Binaural Beats Me...| LIBRARIES_AND_DEMO|
|BuzzFeed: News, T...| NEWS_AND_MAGAZINES|
|       File Explorer|       PRODUCTIVITY|
|          4 in a Row|               GAME|
|Badoo - Free Chat...|             SOCIAL|
|Cameringo Lite. F...|        PHOTOGRAPHY|
|Foursquare Swarm:...|   TRAVEL_AND_LOCAL|
|        ABCmouse.com|             FAMILY|
|          Bloglovin'|             SOCIAL|
|  All Maths Formulas|BOOKS_AND_REFERENCE|
|     Baby Panda Care|             FAMILY|
|      Comedy Central|      ENTERTAINMENT|
|          Flashlight|              TOOLS|
|Candy Camera - se...|        PHOTOGRAPHY|
|Extreme Racing 2 ...|             

### Consulta 7

In [100]:
print('Una muestra de aquellas aplicaciones que tienen la cadena "great" en alguno de sus comentarios: ')
app_reviews_df.withColumn('Translated Review', lower(col('Translated Review'))) \
              .select('*',instr('Translated Review', 'great').alias('great')) \
              .filter(col('great') != 0).select('App').distinct().show(truncate=False)

Una muestra de aquellas aplicaciones que tienen la cadena "great" en alguno de sus comentarios: 
+---------------------------------------------+
|App                                          |
+---------------------------------------------+
|BaBe+ - Berita Indonesia                     |
|Basketball Stars                             |
|Davis's Drug Guide for Nurses                |
|Find&Save - Local Shopping                   |
|Floor Plan Creator                           |
|Candy Pop Story                              |
|Cricbuzz - Live Cricket Scores & News        |
|Google Earth                                 |
|Homework Planner                             |
|Epocrates Plus                               |
|Arrow.io                                     |
|CALCU™ Stylish Calculator Free               |
|FilterGrid - Cam&Photo Editor                |
|Golf GPS Rangefinder: Golf Pad               |
|BELONG Beating Cancer Together               |
|Bubble Shooter                        

### Consulta 8







In [101]:
print('La correlación entre las variables Size y Rating es: ')
google_play_df.corr('Size', 'Rating')

La correlación entre las variables Size y Rating es: 


0.10658246952806993

### Consulta 9

In [106]:
print("Se muestra la polaridad media y la desviación estándar de cada Content Rating")
google_play_df.join(app_reviews_df, on='App') \
              .distinct() \
              .select('Content Rating','Sentiment_polarity') \
              .groupBy(col('Content Rating')) \
              .agg(expr('mean(Sentiment_polarity)').alias('mean'),
                   expr('stddev_pop(Sentiment_polarity)').alias('std_pop')).show()

+---------------+-------------------+-------------------+
| Content Rating|               mean|            std_pop|
+---------------+-------------------+-------------------+
|           Teen|0.12740170930175543| 0.3208047738198846|
|     Mature 17+|0.17390891504000114| 0.3627460396030333|
|   Everyone 10+|0.11851165277729031|0.29442744445998625|
|       Everyone| 0.1773661129718786|0.34753494478603897|
|Adults only 18+| 0.2984002267573696| 0.3242708588364504|
+---------------+-------------------+-------------------+



### Consulta 10

In [107]:
spark.sql('CREATE DATABASE pydb')
spark.sql('USE pydb')

DataFrame[]

In [110]:
google_play_df.withColumnRenamed('Content Rating', 'Content_Rating').createOrReplaceTempView('GooglePlay')

### Consulta 11

In [111]:
app_reviews_df.withColumnRenamed('Translated Review', 'Translated_Review').createOrReplaceTempView('Reviews')

### Consulta 12

In [114]:
spark.sql('SELECT App, Category FROM GooglePlay WHERE Rating>=4.2').show()

+--------------------+--------------+
|                 App|      Category|
+--------------------+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|
|Sketch - Draw & P...|ART_AND_DESIGN|
|Pixel Draw - Numb...|ART_AND_DESIGN|
|Paper flowers ins...|ART_AND_DESIGN|
|Garden Coloring Book|ART_AND_DESIGN|
|Kids Paint Free -...|ART_AND_DESIGN|
|Text on Photo - F...|ART_AND_DESIGN|
|Name Art Photo Ed...|ART_AND_DESIGN|
|Tattoo Name On My...|ART_AND_DESIGN|
|Mandala Coloring ...|ART_AND_DESIGN|
|3D Color Pixel by...|ART_AND_DESIGN|
|Photo Designer - ...|ART_AND_DESIGN|
|350 Diy Room Deco...|ART_AND_DESIGN|
|FlipaClip - Carto...|ART_AND_DESIGN|
|        ibis Paint X|ART_AND_DESIGN|
|Superheroes Wallp...|ART_AND_DESIGN|
|HD Mickey Minnie ...|ART_AND_DESIGN|
|Harley Quinn wall...|ART_AND_DESIGN|
|Colorfit - Drawin...|ART_AND_DESIGN|
|Pink Silver Bow K...|ART_AND_DESIGN|
+--------------------+--------------+
only showing top 20 rows



### Consulta 13

In [115]:
spark.sql('SELECT Type, AVG(Rating) FROM GooglePlay WHERE Type IS NOT NULL GROUP BY Type').show()

+----+-----------------+
|Type|      avg(Rating)|
+----+-----------------+
|Free|4.185940366972488|
|Paid|4.266615146831529|
+----+-----------------+



In [117]:
spark.sql('SELECT Category, AVG(Rating) FROM GooglePlay WHERE Category IS NOT NULL GROUP BY Category').show()

+-------------------+------------------+
|           Category|       avg(Rating)|
+-------------------+------------------+
|             EVENTS| 4.435555555555557|
|             COMICS| 4.155172413793104|
|             SPORTS| 4.223510971786835|
|            WEATHER| 4.243999999999999|
|      VIDEO_PLAYERS| 4.063750000000001|
|  AUTO_AND_VEHICLES|  4.19041095890411|
|          PARENTING| 4.300000000000001|
|      ENTERTAINMENT| 4.126174496644294|
|    PERSONALIZATION| 4.335987261146501|
| HEALTH_AND_FITNESS|4.2771043771043775|
|   TRAVEL_AND_LOCAL|  4.10929203539823|
|BOOKS_AND_REFERENCE| 4.346067415730338|
|     FOOD_AND_DRINK|4.1669724770642205|
|        PHOTOGRAPHY| 4.192113564668767|
|           BUSINESS| 4.121452145214522|
|             FAMILY| 4.192272467086437|
|           SHOPPING| 4.259663865546221|
|     HOUSE_AND_HOME| 4.197368421052633|
|               GAME|4.2863263445761195|
|          EDUCATION| 4.389032258064517|
+-------------------+------------------+
only showing top

In [119]:
spark.sql('SELECT Content_Rating, AVG(Rating) FROM GooglePlay GROUP BY Content_Rating').show()

+---------------+------------------+
| Content_Rating|       avg(Rating)|
+---------------+------------------+
|        Unrated|               4.1|
|           Teen| 4.233487084870853|
|     Mature 17+| 4.123427331887204|
|   Everyone 10+| 4.257178841309818|
|       Everyone|4.1860665678480045|
|Adults only 18+|               4.3|
+---------------+------------------+



### Consulta 14

In [125]:
spark.sql('SELECT g.Content_Rating, MEAN(r.Sentiment_polarity), STD(r.Sentiment_polarity) \
          FROM GooglePlay as g \
          INNER JOIN Reviews as r ON g.App=r.App \
          GROUP BY Content_Rating').show()

+---------------+-----------------------+-------------------------------+
| Content_Rating|avg(Sentiment_polarity)|stddev_samp(Sentiment_polarity)|
+---------------+-----------------------+-------------------------------+
|           Teen|    0.12102082880135905|            0.30982467421170223|
|     Mature 17+|     0.1685808474562754|             0.3678941965284481|
|   Everyone 10+|    0.13200213901805047|             0.3109907022038593|
|       Everyone|     0.1647769803135702|            0.33459519837815016|
|Adults only 18+|    0.31233355379188715|              0.334873921904287|
+---------------+-----------------------+-------------------------------+

