In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Matplotlib rc overrides shared by every figure drawn after this cell.
PLOT_RC = {
    # Font sizes for titles, axis labels, tick labels and legends.
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 12,
    # Default figure size (width, height).
    "figure.figsize": (12, 6),
    # Dashed, slightly transparent grid lines.
    "axes.grid": True,
    "grid.linestyle": "--",
    "grid.alpha": 0.7,
}

# Apply the seaborn theme together with the overrides above.
sns.set_theme(style="whitegrid", palette="deep", font="sans-serif", rc=PLOT_RC)

In [3]:
# Load the two raw CSV files (paths are relative to the working directory):
# first the per-app details table, then the per-review sentiment table.
googleplaystore_df, googleplaystore_reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'db/googleplaystore.csv',
        'db/googleplaystore_user_reviews.csv',
    )
)

## Entendendo a Base de Dados

### googleplaystore_df

In [4]:
# (rows, columns) of the raw app-details table.
googleplaystore_df.shape

(10841, 13)

In [5]:
# Column dtypes and non-null counts of the raw app-details table.
googleplaystore_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [6]:
# Number of missing values in each column of the app-details table.
googleplaystore_df.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [7]:
# Preview the first rows of the app-details table.
googleplaystore_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [8]:
# Summary statistics for every column (include='all' covers the non-numeric ones too).
googleplaystore_df.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,9367.0,10841.0,10841,10841,10840,10841.0,10840,10841,10841,10833,10838
unique,9660,34,,6002.0,462,22,3,93.0,6,120,1378,2832,33
top,ROBLOX,FAMILY,,0.0,Varies with device,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,,596.0,1695,1579,10039,10040.0,8714,842,326,1459,2451
mean,,,4.193338,,,,,,,,,,
std,,,0.537431,,,,,,,,,,
min,,,1.0,,,,,,,,,,
25%,,,4.0,,,,,,,,,,
50%,,,4.3,,,,,,,,,,
75%,,,4.5,,,,,,,,,,


### googleplaystore_user_reviews

In [9]:
# (rows, columns) of the raw user-reviews table.
googleplaystore_reviews_df.shape

(64295, 5)

In [10]:
# Column dtypes and non-null counts of the raw user-reviews table.
googleplaystore_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [11]:
# Number of missing values in each column of the user-reviews table.
googleplaystore_reviews_df.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [12]:
# Preview the first rows of the user-reviews table.
googleplaystore_reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [13]:
# Summary statistics for every column (include='all' covers the non-numeric ones too).
googleplaystore_reviews_df.describe(include='all')

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
count,64295,37427,37432,37432.0,37432.0
unique,1074,27994,3,,
top,Bowmasters,Good,Positive,,
freq,320,247,23998,,
mean,,,,0.182146,0.492704
std,,,,0.351301,0.259949
min,,,,-1.0,0.0
25%,,,,0.0,0.357143
50%,,,,0.15,0.514286
75%,,,,0.4,0.65


## Limpeza e Transformação

In [14]:
# Drop the few rows that lack 'Type', 'Content Rating' or 'Android Ver' first, so
# that incomplete records do not influence the imputation mean computed below.
googleplaystore_df = googleplaystore_df.dropna(subset=['Type', 'Content Rating', 'Android Ver'])

# Impute missing ratings with the mean rating of the rows that are kept.
# 'Rating' is already float64, so no dtype inference is needed after fillna.
mean_rating = googleplaystore_df['Rating'].mean()
googleplaystore_df = googleplaystore_df.assign(
    Rating=googleplaystore_df['Rating'].fillna(mean_rating)
)

In [15]:
# Convert the text-encoded numeric columns to real numeric dtypes.
#
# - `regex=False` forces literal replacement: "+" and "$" are regex metacharacters,
#   and the default of `regex` differs between pandas versions.
# - `.astype(str)` keeps the cell re-runnable: after the first run 'Installs' and
#   'Price' are already numeric and would no longer expose the `.str` accessor.
googleplaystore_df['Reviews'] = googleplaystore_df['Reviews'].astype(int)
googleplaystore_df['Installs'] = (
    googleplaystore_df['Installs']
    .astype(str)
    .str.replace(",", "", regex=False)   # thousands separators, e.g. "10,000+"
    .str.replace("+", "", regex=False)   # trailing "or more" marker
    .astype(int)
)
googleplaystore_df['Price'] = (
    googleplaystore_df['Price']
    .astype(str)
    .str.replace("$", "", regex=False)   # currency symbol
    .astype(float)
)

In [16]:
def size_to_mb(size):
    """Convert an app size label to a number of megabytes.

    Parameters
    ----------
    size : str or number
        A label ending in "M" (megabytes, e.g. "19M") or "k" (kilobytes,
        e.g. "512k"). Values that are already numeric are passed through
        unchanged as floats, so the conversion can safely be applied twice.

    Returns
    -------
    float
        The size in megabytes (kilobytes are divided by 1024), or NaN when
        the value is missing or not a recognisable size label (for example
        "Varies with device").
    """
    if not isinstance(size, str):
        # Missing value or an already-converted number: never use `in` on it.
        try:
            return float(size)
        except (TypeError, ValueError):
            return np.nan

    label = size.strip()
    try:
        if label.endswith("M"):
            return float(label[:-1])
        if label.endswith("k"):
            return float(label[:-1]) / 1024
    except ValueError:
        # Has a unit suffix but the numeric part is not parseable.
        return np.nan
    return np.nan

# Replace the textual 'Size' labels with their value in megabytes, element by element.
googleplaystore_df['Size'] = googleplaystore_df['Size'].map(size_to_mb)

In [17]:
# Parse the 'Last Updated' strings into datetimes. No explicit format is given, so
# pandas infers it (the previewed rows look like "January 7, 2018").
googleplaystore_df['Last Updated'] = pd.to_datetime(googleplaystore_df['Last Updated'])

In [18]:
# Keep only rows that contain review text, then give any remaining missing
# sentiment fields a neutral default. Written as one non-mutating chain (no
# `inplace=True`) so the cell simply rebinds the name and is safe to re-run.
googleplaystore_reviews_df = (
    googleplaystore_reviews_df
    .dropna(subset=['Translated_Review'])
    .assign(
        Sentiment=lambda df: df["Sentiment"].fillna("Neutral"),
        Sentiment_Polarity=lambda df: df["Sentiment_Polarity"].fillna(0),
        Sentiment_Subjectivity=lambda df: df["Sentiment_Subjectivity"].fillna(0),
    )
)

## Respondendo Perguntas - Base de dados 1: Detalhes dos Aplicativos

### Qual é a distribuição média de avaliações (rating) por categoria?

### Quais são os aplicativos gratuitos mais bem avaliados?


### Quais gêneros possuem os maiores números de instalações?


### Qual é a distribuição de avaliações por faixa etária (Content Rating)?

### Quais são os aplicativos pagos mais caros e suas categorias?

### Qual é a relação entre o número de avaliações e o rating por categoria?

### Quais são as versões mínimas do Android mais exigidas pelos aplicativos mais populares?

## Respondendo Perguntas - Base de dados 2: Análise de Sentimento

### 1. Qual é a distribuição de sentimentos (positivo, neutro, negativo) por aplicativo?

### 2. Aplicativos com maior proporção de reviews positivas entre os mais baixados

### 3. Palavras mais frequentes em reviews negativas e positivas

### 4. Variação da subjetividade média entre aplicativos

### 5. Distribuição de Sentimentos nos Aplicativos com Maiores Ratings

### Apps com maior polaridade positiva e negativa

## Respondendo Perguntas - Ambas as Bases de Dados

### 1. Os aplicativos mais populares (por número de instalações) possuem reviews predominantemente positivos?

### 2. Quais aplicativos gratuitos possuem maior polaridade positiva nos reviews?

### 3. Existe alguma relação entre o tamanho do aplicativo e os sentimentos dos reviews?

### 4. Quais categorias possuem maior número de reviews subjetivas?

### 5. Os aplicativos pagos têm sentimentos mais positivos do que os gratuitos?