In [40]:
from lib import check_libs_installed, get_processing_folder_path

# Run the project's dependency check imported from `lib`; in the recorded run
# it printed "All Required Installed".
check_libs_installed()

All Required Installed


In [41]:
import pandas as pd
import numpy as np
from const import KB,MB,GB

In [42]:
# Read `processed_na.csv` (located via the project's path helper) into the
# working frame used by the rest of the notebook.
processed_na_path = get_processing_folder_path('processed_na.csv')
df = pd.read_csv(processed_na_path)

In [43]:
# Add an all-NA placeholder column; `size_to_number` later writes the numeric
# size into it row by row.
df['Size Approximate'] = pd.NA
# Preview the first three rows to confirm the new column is present.
df.head(3)

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,Currency,Size,Released,Last Updated,Content Rating,Ad Supported,In App Purchases,Editors Choice,Size Approximate
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0,10+,10,15,True,0.0,USD,10M,2020-02-26,2020-02-26,Everyone,False,False,False,
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64,"5,000+",5000,7662,True,0.0,USD,2.9M,2020-05-21,2021-05-06,Everyone,True,False,False,
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0,50+,50,58,True,0.0,USD,3.7M,2019-08-09,2019-08-19,Everyone,False,False,False,


In [44]:
# Inspect the distinct raw size strings to see which formats must be parsed.
df.loc[:, 'Size'].unique()

array(['10M', '2.9M', '3.7M', ..., '405M', '3.2k', '512M'], dtype=object)

In [45]:
# Normalise the unit suffix / "Varies with device" text to lower case so the
# parser only has to recognise one spelling.
lowercase_size = df['Size'].str.lower()
df.loc[:, 'Size'] = lowercase_size

In [46]:
def size_to_number(s:pd.Series) -> pd.Series:
    """Fill ``'Size Approximate'`` for one row from its textual ``'Size'``.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    s : pd.Series
        One row containing a ``'Size'`` entry. Recognised values are the text
        ``'varies with device'`` or a number followed by a one-letter unit
        (``k``, ``m`` or ``g``), optionally with thousands separators, e.g.
        ``'3.2k'``, ``'10M'``, ``'1,024k'``. Case and surrounding whitespace
        are ignored.

    Returns
    -------
    pd.Series
        A copy of ``s`` (the input row is never modified) in which
        ``'Size Approximate'`` is:

        * ``0`` when the size varies with the device (numeric sentinel);
        * ``int(unit * number)`` with ``unit`` being ``KB``, ``MB`` or ``GB``
          for a recognised suffix (the product is truncated, not rounded);
        * left as it was when ``'Size'`` is not a string or has no recognised
          unit suffix.

    Raises
    ------
    ValueError
        If a unit suffix is present but the part before it is not a number.
    """
    # pandas does not support UDFs that mutate the object handed to them by
    # `apply`, so all writes go to a private copy of the row.
    row = s.copy()

    raw = row['Size']
    if not isinstance(raw, str):
        # Missing (NaN/NA) or otherwise non-text size: nothing to parse.
        return row

    # Normalise locally instead of relying on an earlier cell having done it.
    text = raw.strip().lower()

    if text.startswith('varies with device'):
        row['Size Approximate'] = 0 # for consistence
        return row

    if text.endswith('k'):
        unit = KB
    elif text.endswith('m'):
        unit = MB
    elif text.endswith('g'):
        unit = GB
    else:
        # Unknown format: leave the placeholder value untouched.
        return row

    # Drop the unit letter and any thousands separators before converting.
    digits = text[:-1].replace(',', '')
    row['Size Approximate'] = int(unit * float(digits))
    return row

In [47]:
# Parse every row's size. Row-wise `apply` calls the Python function once per
# row, so this is the slow step of the notebook on the full dataset.
df_size_to_float = df.apply(size_to_number, axis=1)

In [48]:
# Keep only the raw and parsed size columns for sanity checks.
size_columns = ['Size', 'Size Approximate']
test = df_size_to_float[size_columns]
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241663 entries, 0 to 2241662
Data columns (total 2 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   Size              object
 1   Size Approximate  int64 
dtypes: int64(1), object(1)
memory usage: 34.2+ MB


In [49]:
# Spot-check the distinct parsed sizes.
test.loc[:, 'Size Approximate'].unique()

array([ 10485760,   3040870,   3879731, ..., 424673280,      3276,
       536870912], dtype=int64)

In [50]:
# Rows whose size could not be parsed (still NA); expected to be empty.
test.loc[test['Size Approximate'].isna()]

Unnamed: 0,Size,Size Approximate


In [51]:
# Largest parsed size; used to decide where the top bucket should start.
test.loc[:, 'Size Approximate'].max()

1610612736

In [52]:
# Number of rows carrying the 0 sentinel written for "varies with device".
test.loc[test['Size Approximate'].eq(0)].count()

Size                51237
Size Approximate    51237
dtype: int64

In [53]:
# Bucket edges for `pd.cut`, which builds right-closed intervals by default:
#   (-1, 0]        -> the 0 sentinel written for "varies with device"
#   (0, 1MB], (1MB, 5MB], (5MB, 25MB], (25MB, 50MB], (50MB, 100MB],
#   (100MB, 200MB], (200MB, 500MB], (500MB, 1GB]
#   (1GB, inf]     -> everything larger
bins = [
    -1,
    0,
    1*MB,
    5*MB,
    25*MB,
    50*MB,
    100*MB,
    200*MB,
    500*MB,
    1*GB,
    np.inf,  # open-ended top bucket instead of an arbitrary huge number
    ]

# One label per interval above, in the same order.
labels = [
    'Varies with device',
    '<= 1MB',
    '<= 5MB',
    '<= 25MB',
    '<= 50MB',
    '<= 100MB',
    '<= 200MB',
    '<= 500MB',
    '<= 1GB', 
    '> 1GB'
    ]

# `pd.cut` needs exactly one more edge than labels; fail here rather than
# inside the cut call if the two lists drift apart.
assert len(bins) == len(labels) + 1, 'bins must have one more entry than labels'

print(len(bins))
print(len(labels))

11
10


In [54]:
# Assign every parsed size to its bucket and name the resulting categorical
# series so it becomes the 'Size Bin' column.
size_in_bin = pd.cut(test['Size Approximate'], bins=bins, labels=labels).rename('Size Bin')

# Append the bucket column to the parsed frame (rows align on the shared index).
df_size_bind = pd.concat([df_size_to_float, size_in_bin], axis=1)

In [55]:
# List the distinct store categories present in the data.
df.loc[:, 'Category'].unique()

array(['Adventure', 'Tools', 'Productivity', 'Communication', 'Social',
       'Libraries & Demo', 'Lifestyle', 'Personalization', 'Racing',
       'Maps & Navigation', 'Travel & Local', 'Food & Drink',
       'Books & Reference', 'Medical', 'Puzzle', 'Entertainment',
       'Arcade', 'Auto & Vehicles', 'Photography', 'Health & Fitness',
       'Education', 'Shopping', 'Board', 'Music & Audio', 'Sports',
       'Beauty', 'Business', 'Educational', 'Finance', 'News & Magazines',
       'Casual', 'Art & Design', 'House & Home', 'Card', 'Events',
       'Trivia', 'Weather', 'Strategy', 'Word', 'Video Players & Editors',
       'Action', 'Simulation', 'Music', 'Dating', 'Role Playing',
       'Casino', 'Comics', 'Parenting'], dtype=object)

Draft category groupings (not exhaustive):
- Game: Adventure, Racing, Puzzle, Arcade, Strategy, Action, Simulation
- Tools: Tools, Productivity

In [56]:
# Names of the apps in the 'Sports' category. An exact comparison is used on
# purpose: `str.match` treats its argument as a regular expression anchored
# only at the start, so it would also select any category that merely begins
# with "Sports". `.loc` selects rows and column in a single step.
df.loc[df['Category'] == 'Sports', 'App Name']

48                       Bowling Paradise Pro FREE
49                             ACtrainingLanzarote
91                               Pocket Bowling 3D
129                                        VfL1910
135                        Clube Atlético Ypiranga
                            ...                   
2241541    Dunk Basket : Basketball Dunk Hoop Game
2241551                    Spotlight Dance Academy
2241559                           H.K Golf Manager
2241562                                  La Molina
2241647                      Gear Ratio Calculator
Name: App Name, Length: 45811, dtype: object

In [57]:
# How many apps fall into each size bucket, most common first.
df_size_bind.loc[:, 'Size Bin'].value_counts()

<= 25MB               1095445
<= 5MB                 527873
<= 50MB                338465
<= 100MB               163949
Varies with device      51237
<= 1MB                  37539
<= 200MB                25519
<= 500MB                 1509
<= 1GB                    117
> 1GB                      10
Name: Size Bin, dtype: int64

In [58]:
# Display the combined frame (pandas truncates the repr to the first and last
# rows) to eyeball the new 'Size Approximate' and 'Size Bin' columns.
df_size_bind

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,Currency,Size,Released,Last Updated,Content Rating,Ad Supported,In App Purchases,Editors Choice,Size Approximate,Size Bin
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0,10+,10,15,True,0.0,USD,10m,2020-02-26,2020-02-26,Everyone,False,False,False,10485760,<= 25MB
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64,"5,000+",5000,7662,True,0.0,USD,2.9m,2020-05-21,2021-05-06,Everyone,True,False,False,3040870,<= 5MB
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0,50+,50,58,True,0.0,USD,3.7m,2019-08-09,2019-08-19,Everyone,False,False,False,3879731,<= 5MB
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5,10+,10,19,True,0.0,USD,1.8m,2018-09-10,2018-10-13,Everyone,True,False,False,1887436,<= 5MB
4,GROW.me,com.horodyski.grower,Tools,0.0,0,100+,100,478,True,0.0,USD,6.2m,2020-02-21,2018-11-12,Everyone,False,False,False,6501171,<= 25MB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2241658,Lero TOEFL Recorder + Timer,com.toefltimer,Education,3.4,17,"1,000+",1000,1980,True,0.0,USD,10m,2018-05-22,2018-12-14,Everyone,True,False,False,10485760,<= 25MB
2241659,ORU Online,com.threedream.oruonline,Education,0.0,0,100+,100,430,True,0.0,USD,44m,2018-01-17,2018-02-02,Everyone,False,False,False,46137344,<= 50MB
2241660,Data Structure,datastructure.appoworld.datastucture,Education,0.0,0,100+,100,202,True,0.0,USD,29m,2018-08-19,2018-08-19,Everyone,False,False,False,30408704,<= 50MB
2241661,Devi Suktam,ishan.devi.suktam,Music & Audio,3.5,8,"1,000+",1000,2635,True,0.0,USD,10m,2016-08-01,2021-05-05,Everyone,True,False,False,10485760,<= 25MB


In [59]:
# Column dtypes and memory footprint of the combined frame.
df_size_bind.info()
# NOTE: `dropna()` returns a new frame and the result is not assigned, so
# `df_size_bind` itself is left unchanged; this line only displays the
# NA-free view of it.
df_size_bind.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241663 entries, 0 to 2241662
Data columns (total 20 columns):
 #   Column            Dtype   
---  ------            -----   
 0   App Name          object  
 1   App Id            object  
 2   Category          object  
 3   Rating            float64 
 4   Rating Count      int64   
 5   Installs          object  
 6   Minimum Installs  int64   
 7   Maximum Installs  int64   
 8   Free              bool    
 9   Price             float64 
 10  Currency          object  
 11  Size              object  
 12  Released          object  
 13  Last Updated      object  
 14  Content Rating    object  
 15  Ad Supported      bool    
 16  In App Purchases  bool    
 17  Editors Choice    bool    
 18  Size Approximate  int64   
 19  Size Bin          category
dtypes: bool(4), category(1), float64(2), int64(4), object(9)
memory usage: 267.2+ MB


Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,Currency,Size,Released,Last Updated,Content Rating,Ad Supported,In App Purchases,Editors Choice,Size Approximate,Size Bin
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0,10+,10,15,True,0.0,USD,10m,2020-02-26,2020-02-26,Everyone,False,False,False,10485760,<= 25MB
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64,"5,000+",5000,7662,True,0.0,USD,2.9m,2020-05-21,2021-05-06,Everyone,True,False,False,3040870,<= 5MB
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0,50+,50,58,True,0.0,USD,3.7m,2019-08-09,2019-08-19,Everyone,False,False,False,3879731,<= 5MB
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5,10+,10,19,True,0.0,USD,1.8m,2018-09-10,2018-10-13,Everyone,True,False,False,1887436,<= 5MB
4,GROW.me,com.horodyski.grower,Tools,0.0,0,100+,100,478,True,0.0,USD,6.2m,2020-02-21,2018-11-12,Everyone,False,False,False,6501171,<= 25MB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2241658,Lero TOEFL Recorder + Timer,com.toefltimer,Education,3.4,17,"1,000+",1000,1980,True,0.0,USD,10m,2018-05-22,2018-12-14,Everyone,True,False,False,10485760,<= 25MB
2241659,ORU Online,com.threedream.oruonline,Education,0.0,0,100+,100,430,True,0.0,USD,44m,2018-01-17,2018-02-02,Everyone,False,False,False,46137344,<= 50MB
2241660,Data Structure,datastructure.appoworld.datastucture,Education,0.0,0,100+,100,202,True,0.0,USD,29m,2018-08-19,2018-08-19,Everyone,False,False,False,30408704,<= 50MB
2241661,Devi Suktam,ishan.devi.suktam,Music & Audio,3.5,8,"1,000+",1000,2635,True,0.0,USD,10m,2016-08-01,2021-05-05,Everyone,True,False,False,10485760,<= 25MB


In [60]:
# Persist the frame with the parsed size and bucket columns; the index is not
# written to the file.
processed_temp_path = get_processing_folder_path('processed_temp.csv')
df_size_bind.to_csv(processed_temp_path, index=False)

In [61]:
# Per-column count of missing values in the combined frame.
df_size_bind.isnull().sum()

App Name            0
App Id              0
Category            0
Rating              0
Rating Count        0
Installs            0
Minimum Installs    0
Maximum Installs    0
Free                0
Price               0
Currency            0
Size                0
Released            0
Last Updated        0
Content Rating      0
Ad Supported        0
In App Purchases    0
Editors Choice      0
Size Approximate    0
Size Bin            0
dtype: int64