In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import expr, col, desc, lower, instr
from pyspark.sql.functions import mean, stddev_samp, var_samp, avg
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import concat_ws, concat
from pyspark.sql import Row
from pyspark.sql.functions import corr

In [2]:
google_play_df = spark.read.csv('../data/googleplaystore.csv', header=True, inferSchema=True, nullValue='', sep=',')

In [3]:
schema = StructType( [
    StructField('App', StringType()),
    StructField('Translated Review', StringType()),
    StructField('Sentiment', StringType() ),
    StructField('Sentiment_Polarity', DoubleType()),
    StructField('Sentiment_Subjectivity', DoubleType())
] )
app_reviews_df = spark.read.csv('../data/googleplaystore_user_reviews.csv', sep=',', schema=schema, header=True)

In [4]:
google_play_df = google_play_df.withColumn('Price', expr('double(replace(Price,"\$",""))'))
google_play_df = google_play_df.withColumn('Installs', expr('double(replace(replace(Installs, "+", ""), ",", ""))') )
google_play_df=google_play_df.withColumn('Size', expr('double(replace(replace(Size,"M","000"),"K",""))'))
google_play_df=google_play_df.drop('Android Ver')
google_play_df=google_play_df.drop('Genres')
google_play_df=google_play_df.drop('Last Updated')
google_play_df=google_play_df.drop('Current Ver')

In [5]:
print(google_play_df)

DataFrame[App: string, Category: string, Rating: double, Reviews: int, Size: double, Installs: double, Type: string, Price: double, Content Rating: string]


In [6]:
google_play_df.show(n=10)

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19000.0|  10000.0|Free|  0.0|      Everyone|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|14000.0| 500000.0|Free|  0.0|      Everyone|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|    8.7|5000000.0|Free|  0.0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|25000.0|    5.0E7|Free|  0.0|          Teen|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|    2.8| 100000.0|Free|  0.0|      Everyone|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    5.6|  50000.0|Free|  0.0|      Everyone|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|19000.0|  50000.0|Free|  0.0|      Everyone|
|    Infinite Painter|ART_AND_

In [7]:
new_df = google_play_df.select( expr('*'), (col('Rating')>3).alias('New_column') )

new_df.show(5)

+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+----------+
|                 App|      Category|Rating|Reviews|   Size| Installs|Type|Price|Content Rating|New_column|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+----------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19000.0|  10000.0|Free|  0.0|      Everyone|      true|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|14000.0| 500000.0|Free|  0.0|      Everyone|      true|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|    8.7|5000000.0|Free|  0.0|      Everyone|      true|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|25000.0|    5.0E7|Free|  0.0|          Teen|      true|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|    2.8| 100000.0|Free|  0.0|      Everyone|      true|
+--------------------+--------------+------+-------+-------+---------+----+-----+--------------+----------+
only showing top 5 rows



In [25]:
#Nos quedamos solo con las filas con valoracion mayor a 4.2
more_than_42_df = google_play_df.filter( expr('Rating>=4.2') )
#Elegimos solo las columnas pedidas
new_df1 = more_than_42_df.select('App', 'Category')
new_df1.show(5)

+--------------------+--------------+
|                 App|      Category|
+--------------------+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|
|Sketch - Draw & P...|ART_AND_DESIGN|
|Pixel Draw - Numb...|ART_AND_DESIGN|
|Paper flowers ins...|ART_AND_DESIGN|
|Garden Coloring Book|ART_AND_DESIGN|
+--------------------+--------------+
only showing top 5 rows



In [9]:
#Filtramos las filas con el tamaño pedido
size_df=google_play_df.filter( expr('Size>10000') )
#Contamos el numero de filas
numero_apps=size_df.count() 
numero_apps

5085

In [10]:
#Eliminamos las filas convalores nulos de las columnas Type y Category
no_null_df = google_play_df.dropna('any', subset=['Type', 'Category'])
#Nos quedamos solo con las columnas Type, Category, Content Rating y Rating
new_df2 = no_null_df.select('Type', 'Category','Content Rating','Rating')
#Obtenemos la media para Type
new_df2.groupBy(col('Type')).agg(expr('mean(Rating)')).show()
#Obtenemos la media para Category
new_df2.groupBy(col('Category')).agg(expr('mean(Rating)')).show()
#Obtenemos la media para Content Rating
new_df2.groupBy(col('Content Rating')).agg(expr('mean(Rating)')).show()

+----+-----------------+
|Type|     mean(Rating)|
+----+-----------------+
|Free|4.186202546163562|
|Paid|4.266615146831529|
+----+-----------------+

+-------------------+------------------+
|           Category|      mean(Rating)|
+-------------------+------------------+
|             EVENTS| 4.435555555555557|
|             COMICS| 4.155172413793104|
|             SPORTS| 4.223510971786835|
|            WEATHER| 4.243999999999999|
|      VIDEO_PLAYERS| 4.063750000000001|
|  AUTO_AND_VEHICLES|  4.19041095890411|
|          PARENTING| 4.300000000000001|
|      ENTERTAINMENT| 4.126174496644294|
|    PERSONALIZATION| 4.335987261146501|
| HEALTH_AND_FITNESS|4.2771043771043775|
|   TRAVEL_AND_LOCAL|  4.10929203539823|
|BOOKS_AND_REFERENCE| 4.346067415730338|
|     FOOD_AND_DRINK|4.1669724770642205|
|        PHOTOGRAPHY| 4.192113564668767|
|           BUSINESS| 4.121452145214522|
|             FAMILY| 4.192272467086437|
|           SHOPPING| 4.259663865546221|
|     HOUSE_AND_HOME| 4.19736

In [11]:
#Eliminamos valores nulos de Type
no_null_df = google_play_df.dropna('any', subset=['Type'])
#Creamos la tabla de contingencia
no_null_df.crosstab('Content Rating', 'Type').show(n=100)

+-------------------+----+----+
|Content Rating_Type|Free|Paid|
+-------------------+----+----+
|            Unrated|   2|   0|
|         Mature 17+| 479|  20|
|       Everyone 10+| 380|  33|
|    Adults only 18+|   3|   0|
|           Everyone|8020| 695|
|               Teen|1156|  52|
+-------------------+----+----+



In [12]:
#Eliminamos las filas convalores nulos de las columnas Type y Category
no_null_df = google_play_df.dropna('any', subset=['Type', 'Category'])
#Medias para cada categoría
medias_df=no_null_df.groupBy(col('Category')).agg(expr('mean(Rating)'))
#Unimos dos datafreams para obtener uno del tipo App, Categoty, Rating, Category, mean(category)
#Es decir,tenemos en el mismo dataframe cada aplicación con su categoría y su valoracion y la valoracion media 
#de la categoría
no_null_df=no_null_df.select('App', 'Category','Rating')
Rating_with_category_df = no_null_df.join( medias_df, no_null_df.Category == medias_df.Category, how='inner' )
Rating_with_category_df.show()
# Eliminamos las filas que no cumplan la condicion
Condicion_df=Rating_with_category_df.filter(col('Rating')>col('mean(Rating)'))
#Nos quedamos solo con el nombre de las Apps
Best_app_df=Condicion_df.select('App').distinct()
Best_app_df.show()

+--------------------+--------------+------+--------------+-----------------+
|                 App|      Category|Rating|      Category|     mean(Rating)|
+--------------------+--------------+------+--------------+-----------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|ART_AND_DESIGN|4.358064516129031|
| Coloring book moana|ART_AND_DESIGN|   3.9|ART_AND_DESIGN|4.358064516129031|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|ART_AND_DESIGN|4.358064516129031|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5|ART_AND_DESIGN|4.358064516129031|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|ART_AND_DESIGN|4.358064516129031|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|ART_AND_DESIGN|4.358064516129031|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|ART_AND_DESIGN|4.358064516129031|
|    Infinite Painter|ART_AND_DESIGN|   4.1|ART_AND_DESIGN|4.358064516129031|
|Garden Coloring Book|ART_AND_DESIGN|   4.4|ART_AND_DESIGN|4.358064516129031|
|Kids Paint Free -...|ART_AND_DESIGN|   4.7|ART_AND_DESIGN|4.358

In [13]:
#Nos quedamos con las columnas que necesitamos y unimos
App_reviews_df=google_play_df.select('App','Reviews','Category')
App_sentiment_df=app_reviews_df.select('App','Sentiment_Polarity','Translated Review')
Reviews_sentiment_df = App_reviews_df.join( App_sentiment_df, App_reviews_df.App == App_sentiment_df.App, how='inner' ).distinct()
#Imponemos las condiciones
final_df=Reviews_sentiment_df.filter((col('Reviews')>300) & (col('Sentiment_Polarity')==1))
#Seleccionamos las columnas pedidas y mostramos
final_df.drop(google_play_df.App).select('App','Category').show()


+--------------------+-------------------+
|                 App|           Category|
+--------------------+-------------------+
|  All Maths Formulas|BOOKS_AND_REFERENCE|
|           DC Comics|             COMICS|
|Freeme Launcher—S...|    PERSONALIZATION|
|Gold Butterfly Ke...|     HOUSE_AND_HOME|
|Cougar Dating Lif...|             DATING|
|GasBuddy: Find Ch...|   TRAVEL_AND_LOCAL|
|     Golden Launcher|    PERSONALIZATION|
|Facebook Ads Manager|           BUSINESS|
|Full Screen Calle...|      COMMUNICATION|
|AlReader -any tex...|BOOKS_AND_REFERENCE|
|AutoCAD - DWG Vie...|       PRODUCTIVITY|
|    Cookbook Recipes|     FOOD_AND_DRINK|
|Cymera Camera- Ph...|        PHOTOGRAPHY|
|APUS Launcher - T...|    PERSONALIZATION|
|     Amazon Shopping|           SHOPPING|
|Current debit car...|            FINANCE|
|Cymera Camera- Ph...|        PHOTOGRAPHY|
|          DreamTrips|   TRAVEL_AND_LOCAL|
|          BBVA Spain|            FINANCE|
|EyeEm - Camera & ...|        PHOTOGRAPHY|
+----------

In [14]:
#Usamos lower para poner la columna Translated Review en minusculas
lower_df=app_reviews_df.withColumn('Translated Review', lower(col('Translated Review')))
#Usamos inner para detectar coincidencias y añadimos la columna al dataframe
match_df=lower_df.select('*',instr(col('Translated Review'), 'great').alias('Match'))
#Filtramos con la condición de que el indicador de la coincidencia sea mayor que 0
filer_match_df=match_df.filter(col('Match')>0)
#Nos quedamos solo con la columna del nombre de la App y eleminamos duplicados
filer_match_df.select('App').distinct().show()



+--------------------+
|                 App|
+--------------------+
|BaBe+ - Berita In...|
|    Basketball Stars|
|Davis's Drug Guid...|
|Find&Save - Local...|
|  Floor Plan Creator|
|     Candy Pop Story|
|Cricbuzz - Live C...|
|        Google Earth|
|    Homework Planner|
|      Epocrates Plus|
|            Arrow.io|
|CALCU™ Stylish Ca...|
|FilterGrid - Cam&...|
|Golf GPS Rangefin...|
|BELONG Beating Ca...|
|      Bubble Shooter|
|Football Live Scores|
|     Amazon Shopping|
|CM Launcher 3D - ...|
|Dream League Socc...|
+--------------------+
only showing top 20 rows



In [15]:
#Usamos corr para obtener la correlacion
print('La correlación es ')
print(google_play_df.corr('Size', 'Rating'))

La correlación es 
0.10658246952806993


In [16]:
#Unimos los dos dataframes por los nombres de las app
polarity_category_df = google_play_df.join( app_reviews_df, google_play_df.App == app_reviews_df.App, how='inner').distinct()
#Nos quedamos solo con las columnas necesarias
final_df=polarity_category_df.select('Content Rating','Sentiment_polarity')
#Calculamos la media
final_df.groupBy(col('Content Rating')).agg(expr('mean(Sentiment_polarity)')).show()
#Calculamos desviación
final_df.groupBy(col('Content Rating')).agg(expr('var_pop(Sentiment_polarity)')).show()

+---------------+------------------------+
| Content Rating|mean(Sentiment_polarity)|
+---------------+------------------------+
|           Teen|     0.12740170930175537|
|     Mature 17+|     0.17390891504000114|
|   Everyone 10+|      0.1185116527772904|
|       Everyone|     0.17736611297187863|
|Adults only 18+|      0.2984002267573696|
+---------------+------------------------+

+---------------+---------------------------+
| Content Rating|var_pop(Sentiment_polarity)|
+---------------+---------------------------+
|           Teen|        0.10291570290562735|
|     Mature 17+|         0.1315846892476854|
|   Everyone 10+|        0.08668752005123831|
|       Everyone|         0.1207805378474352|
|Adults only 18+|        0.10515158989052915|
+---------------+---------------------------+



In [40]:
spark.sql('CREATE DATABASE pydb')
spark.sql('USE pydb')

AnalysisException: 'org.apache.hadoop.hive.metastore.api.AlreadyExistsException: Database pydb already exists;'

In [22]:
#Cambiamos el nombre de la columna para que no de problemas y guardamos
google_play_df.withColumnRenamed('Content Rating', 'Content_Rating').write.saveAsTable('google')

In [23]:
#Lo mismo con este otro dataframe
app_reviews_df.withColumnRenamed('Translated Review', 'Translated_Review').write.saveAsTable('App_reviews')

In [30]:
#Realizamos la consulta con las columnas y condicion pedida y la mostramos por pantalla
google_df=spark.sql('SELECT App, Category FROM google WHERE Rating>=4.2').show()

+--------------------+--------------+
|                 App|      Category|
+--------------------+--------------+
|U Launcher Lite –...|ART_AND_DESIGN|
|Sketch - Draw & P...|ART_AND_DESIGN|
|Pixel Draw - Numb...|ART_AND_DESIGN|
|Paper flowers ins...|ART_AND_DESIGN|
|Garden Coloring Book|ART_AND_DESIGN|
|Kids Paint Free -...|ART_AND_DESIGN|
|Text on Photo - F...|ART_AND_DESIGN|
|Name Art Photo Ed...|ART_AND_DESIGN|
|Tattoo Name On My...|ART_AND_DESIGN|
|Mandala Coloring ...|ART_AND_DESIGN|
|3D Color Pixel by...|ART_AND_DESIGN|
|Photo Designer - ...|ART_AND_DESIGN|
|350 Diy Room Deco...|ART_AND_DESIGN|
|FlipaClip - Carto...|ART_AND_DESIGN|
|        ibis Paint X|ART_AND_DESIGN|
|Superheroes Wallp...|ART_AND_DESIGN|
|HD Mickey Minnie ...|ART_AND_DESIGN|
|Harley Quinn wall...|ART_AND_DESIGN|
|Colorfit - Drawin...|ART_AND_DESIGN|
|Pink Silver Bow K...|ART_AND_DESIGN|
+--------------------+--------------+
only showing top 20 rows



In [42]:
google_df=spark.sql('SELECT App, Category FROM google WHERE Rating>=4.2')
no_null_df = spark.sql('SELECT Type, Category, Content_Rating, Rating FROM google').dropna('any', subset=['Type', 'Category']).show()
#Nos quedamos solo con las columnas Type, Category, Content Rating y Rating
new_df2 = no_null_df.select('Type', 'Category','Content_Rating','Rating')
#Obtenemos la media para Type
new_df2.groupBy(col('Type')).agg(expr('mean(Rating)')).show()
#Obtenemos la media para Category
new_df2.groupBy(col('Category')).agg(expr('mean(Rating)')).show()
#Obtenemos la media para Content Rating
new_df2.groupBy(col('Content Rating')).agg(expr('mean(Rating)')).show()

+----+--------------+--------------+------+
|Type|      Category|Content_Rating|Rating|
+----+--------------+--------------+------+
|Free|ART_AND_DESIGN|      Everyone|   4.1|
|Free|ART_AND_DESIGN|      Everyone|   3.9|
|Free|ART_AND_DESIGN|      Everyone|   4.7|
|Free|ART_AND_DESIGN|          Teen|   4.5|
|Free|ART_AND_DESIGN|      Everyone|   4.3|
|Free|ART_AND_DESIGN|      Everyone|   4.4|
|Free|ART_AND_DESIGN|      Everyone|   3.8|
|Free|ART_AND_DESIGN|      Everyone|   4.1|
|Free|ART_AND_DESIGN|      Everyone|   4.4|
|Free|ART_AND_DESIGN|      Everyone|   4.7|
|Free|ART_AND_DESIGN|      Everyone|   4.4|
|Free|ART_AND_DESIGN|      Everyone|   4.4|
|Free|ART_AND_DESIGN|          Teen|   4.2|
|Free|ART_AND_DESIGN|      Everyone|   4.6|
|Free|ART_AND_DESIGN|      Everyone|   4.4|
|Free|ART_AND_DESIGN|      Everyone|   3.2|
|Free|ART_AND_DESIGN|      Everyone|   4.7|
|Free|ART_AND_DESIGN|      Everyone|   4.5|
|Free|ART_AND_DESIGN|      Everyone|   4.3|
|Free|ART_AND_DESIGN|      Every

AttributeError: 'NoneType' object has no attribute 'select'

In [None]:
#Unimos los dos dataframes por los nombres de las app
polarity_category_df = google_play_df.join( app_reviews_df, google_play_df.App == app_reviews_df.App, how='inner').distinct()
#Nos quedamos solo con las columnas necesarias
final_df=polarity_category_df.select('Content Rating','Sentiment_polarity')
#Calculamos la media
final_df.groupBy(col('Content Rating')).agg(expr('mean(Sentiment_polarity)')).show()
#Calculamos desviación
final_df.groupBy(col('Content Rating')).agg(expr('var_pop(Sentiment_polarity)')).show(