## Анализ данных из Google Play Store

**Описание данных**

- `App` — название приложения

- `Category` — категория, к которой относится приложение

- `Rating` — рейтинг пользователей

- `Reviews` — количество отзывов пользователей о приложении

- `Size` — размер приложения

- `Installs` — количество загрузок/установок приложения пользователями

- `Type` — платное или бесплатное приложение

- `Price` — цена приложения

- `Content Rating` — возрастная группа, на которую ориентировано приложение

- `Genres` — принадлежность приложения к нескольким жанрам

- `Last Updated` — дата последнего обновления приложения в Play Store

- `Current Ver` — текущая версия приложения в Play Store

- `Android Ver` — минимальная требуемая версия Android

In [257]:
import pandas as pd

In [258]:
# Load the Play Store dataset; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('playstore.csv', index_col=0)

In [259]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Исследуем и подготавливаем данные

In [260]:
# переименуем колонки для удобства: заменим пробелы на подчёркивания и приведём названия к нижнему регистру

In [261]:
# Normalise column labels: spaces become underscores and everything is
# lower-cased, so columns can be reached as attributes (e.g. df.content_rating).
df.columns = [label.replace(' ', '_').lower() for label in df.columns]

In [262]:
# Show the first two rows to confirm the new column labels.
df.head(2)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [263]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, 0 to 10839
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app             10840 non-null  object 
 1   category        10840 non-null  object 
 2   rating          9366 non-null   float64
 3   reviews         10840 non-null  int64  
 4   size            10840 non-null  object 
 5   installs        10840 non-null  object 
 6   type            10839 non-null  object 
 7   price           10840 non-null  object 
 8   content_rating  10840 non-null  object 
 9   genres          10840 non-null  object 
 10  last_updated    10840 non-null  object 
 11  current_ver     10832 non-null  object 
 12  android_ver     10838 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.2+ MB


In [264]:
# Check for missing values: count the NaN entries in each column.
df.isna().sum()

app                  0
category             0
rating            1474
reviews              0
size                 0
installs             0
type                 1
price                0
content_rating       0
genres               0
last_updated         0
current_ver          8
android_ver          2
dtype: int64

In [265]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10840, 13)

In [266]:
# Inspect column dtypes before any type conversion is applied.
df.dtypes

app                object
category           object
rating            float64
reviews             int64
size               object
installs           object
type               object
price              object
content_rating     object
genres             object
last_updated       object
current_ver        object
android_ver        object
dtype: object

In [267]:
# приведем даты к формату datetime

In [268]:
# Parse the textual update dates (e.g. "January 7, 2018") into datetime values.
df['last_updated'] = pd.to_datetime(df['last_updated'])

In [269]:
# Convert the price column to float after stripping the dollar sign.
# regex=False makes "$" an explicit literal: interpreted as a regular
# expression it is the end-of-string anchor and would remove nothing, and the
# default value of `regex` differs between pandas versions.
df.price = df.price.str.replace("$", "", regex=False).astype(float)

In [270]:
# Frequency of each distinct price value.
df['price'].value_counts()

price
0.00     10040
0.99       148
2.99       129
1.99        73
4.99        72
         ...  
19.90        1
1.75         1
14.00        1
4.85         1
1.04         1
Name: count, Length: 92, dtype: int64

In [271]:

# List the distinct install-count labels to see which characters must be removed before casting.
df.installs.unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0'], dtype=object)

In [272]:
# Convert the installs column to int: drop the trailing "+" and the thousands
# separators, then cast. regex=False keeps both patterns literal; "+" on its
# own is not a valid regular expression, and the default value of `regex`
# differs between pandas versions.
df.installs = (
    df.installs
    .str.replace("+", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [273]:
# Re-check column dtypes after the conversions.
df.dtypes

app                       object
category                  object
rating                   float64
reviews                    int64
size                      object
installs                   int64
type                      object
price                    float64
content_rating            object
genres                    object
last_updated      datetime64[ns]
current_ver               object
android_ver               object
dtype: object

In [274]:
# Descriptive statistics for the numeric and datetime columns.
df.describe()

Unnamed: 0,rating,reviews,installs,price,last_updated
count,9366.0,10840.0,10840.0,10840.0,10840
mean,4.191757,444152.9,15464340.0,1.027368,2017-11-21 06:43:02.435424256
min,1.0,0.0,0.0,0.0,2010-05-21 00:00:00
25%,4.0,38.0,1000.0,0.0,2017-09-20 00:00:00
50%,4.3,2094.0,100000.0,0.0,2018-05-24 00:00:00
75%,4.5,54775.5,5000000.0,0.0,2018-07-20 00:00:00
max,5.0,78158310.0,1000000000.0,400.0,2018-08-08 00:00:00
std,0.515219,2927761.0,85029360.0,15.949703,


In [275]:
# Number of distinct values per column; fewer distinct app names than rows
# means some applications appear more than once.
df.nunique()

app               9659
category            33
rating              39
reviews           6001
size               461
installs            20
type                 2
price               92
content_rating       6
genres             119
last_updated      1377
current_ver       2831
android_ver         33
dtype: int64

In [276]:
# Remove duplicate applications: keep the first row for each app name and
# renumber the rows. drop=True discards the old index directly instead of
# turning it into an 'index' column that then has to be dropped by name.
df_unique = df.drop_duplicates(subset=['app']).reset_index(drop=True)

In [277]:
# Preview the de-duplicated frame.
df_unique.head()

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


### Доля бесплатных приложений

In [278]:
# Share of free apps: relative frequency of a zero price among unique apps.
price_shares = df_unique['price'].value_counts(normalize=True)
price_shares.loc[0.00]

0.9217310280567347

### Образовательные проекты с количеством отзывов > 1000 

In [279]:
# Education apps with more than 1000 reviews, renumbered from zero.
# reset_index(drop=True) discards the old index directly instead of creating
# an 'index' column and dropping it afterwards.
is_education = df_unique['category'] == 'EDUCATION'
has_many_reviews = df_unique['reviews'] > 1000
edu_app = df_unique[is_education & has_many_reviews].reset_index(drop=True)

In [280]:
# Preview the filtered education apps.
edu_app.head()

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Duolingo: Learn Languages Free,EDUCATION,4.7,6289924,Varies with device,100000000,Free,0.0,Everyone,Education;Education,2018-08-01,Varies with device,Varies with device
1,TED,EDUCATION,4.6,181893,18M,10000000,Free,0.0,Everyone 10+,Education,2018-07-27,3.2.5,4.1 and up
2,English Communication - Learn English for Chin...,EDUCATION,4.7,2544,18M,100000,Free,0.0,Everyone,Education,2017-12-29,3.1,4.0 and up
3,Khan Academy,EDUCATION,4.6,85375,21M,5000000,Free,0.0,Everyone,Education,2018-07-27,5.0.0,4.1 and up
4,Learn English with Wlingua,EDUCATION,4.7,314299,3.3M,10000000,Free,0.0,Everyone,Education,2018-05-02,1.94.9,4.0 and up


### Средний рейтинг, количество отзывов и стоимость в зависимости от категории

In [283]:
# Mean price, rating and review count for every (category, type) pair.
metrics = ['price', 'rating', 'reviews']
category_means = pd.pivot_table(
    df_unique,
    values=metrics,
    index=['category', 'type'],
    aggfunc=dict.fromkeys(metrics, 'mean'),
)
# Prefix the columns so it is clear they hold averages.
df_by_category = category_means.rename(
    columns={
        'price': 'mean_price',
        'rating': 'mean_rating',
        'reviews': 'mean_reviews',
    }
)

In [284]:
# Round the summary for presentation: price and review means to two
# decimals, the rating mean to one.
decimals_by_column = {'mean_price': 2, 'mean_rating': 1, 'mean_reviews': 2}
for column, decimals in decimals_by_column.items():
    df_by_category[column] = df_by_category[column].round(decimals)

In [253]:
# Export the per-category summary to a comma-separated file.
# NOTE(review): this cell's execution counter (253) is older than the cells
# that build and round df_by_category (283-284) - re-run it so task12.csv
# reflects the current frame.
df_by_category.to_csv('task12.csv', sep=',')

In [285]:
# Preview the per-category summary.
df_by_category.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_price,mean_rating,mean_reviews
category,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ART_AND_DESIGN,Free,0.0,4.3,23230.11
ART_AND_DESIGN,Paid,1.99,4.7,722.0
AUTO_AND_VEHICLES,Free,0.0,4.2,14140.28
AUTO_AND_VEHICLES,Paid,4.49,4.6,1387.67
BEAUTY,Free,0.0,4.3,7476.23
