# Profitable apps for the Google Play and App Store markets
Our goal for this project is to analyze data to help developers understand what kinds of apps are likely to attract more users, since our revenue consists of in-app adds.

In [1]:
#opening files
from csv import reader

#opening android dataset
opened_file = open('googleplaystore.csv', encoding='iso-8859-1')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]

#opening app store dataset
opened_file = open('AppleStore.csv',encoding='iso-8859-1')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
    if rows_and_columns:
        print('Number of rows:',len(dataset))
        print('Number of columns:',len(dataset[0]))

In [3]:
print(android_header)
print('\n')
explore_data(android,0,3,True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite â\x80\x93 FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [4]:
print(ios_header)
print('\n')
explore_data(ios,0,3,True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


In [5]:
del android[10472]

Let's see if there are any duplicate apps in the googleplaystore.csv.
To do this we first create 2 empty lists named `unique_apps` and `duplicate_apps`. Then using a `for` loop, we asign the name of an app to a variable `name` and check if the name occurs in `unique_apps`. If it does, then we `append` the app to the `duplicate_apps` list. If it doesn't, we `append` it to `unique_apps`. Lastly, we check the length of the list with duplicate data.

In [6]:
unique_apps = []
duplicate_apps = []

for app in android:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))

Number of duplicate apps: 1181


As we can see, there are 1181 cases of duplicate data. Let's take Instagram for example.

In [7]:
for app in android:
    if app[0] == 'Instagram':
        print(app)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


If we examine the rows printed above, we can see, that the difference happens in the 4th row corresponding to 'ratings'. It's probably because the data was collected in different time. We can make it a criteria for removing rows. Instead of removing them randomly, we can keep the row with the highest number of reviews, because the higher the number, the later the data was collected. 

In [8]:
reviews_max = {}

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In a previous cell, there were 1181 duplicate apps. The number of unique apps should be the difference between all aps and duplicates.

In [9]:
print('expected length:',len(android)-1181) #length of all aps - duplicates = number of unique apps
print('actual lenght:',len(reviews_max)) #number of unique apps

expected length: 9659
actual lenght: 9659


Now, I use the `reviews_max` dictionary to make a list of unique android apps. I'll only keep the records with the highest number of reviews.

In [10]:
android_clean = []
already_added = []

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.append(name)

Now let's confirm, there are 9659 records.

In [11]:
explore_data(android_clean, 0, 3, True) 

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite â\x80\x93 FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


# Removing non-english apps

If we explore the data sets, we can see, there are apps not directed towards english-speaking audience. Below are some examples:

In [12]:
print(ios[813][1])
print(ios[6731][1])

print(android_clean[4412][0])
print(android_clean[7940][0])

ç±å¥èºPPS -ãæ¬¢ä¹é¢2ãçµè§å§ç­æ­
ãè±åºã²ã¼ã ãçµ¶å¯¾ã«æå¾ã¾ã§ãã¬ã¤ããªãã§ ãè¬è§£ãï¼ãã­ãã¯ããºã«ã
ä¸­å½èª AQãªã¹ãã³ã°
ÙØ¹Ø¨Ø© ØªÙØ¯Ø± ØªØ±Ø¨Ø­ DZ


We are not interested in this kind of apps, so we'll remove them. All characters used in English are encoded using the ASCII standard. Each ASCII character has a corresponding number between 0 and 127. We can build a function using a build-in function `ord()` to check if the app name consists of only ASCII characters.

In [13]:
def is_english(word):
    
    for letter in word:
        if ord(letter) > 127:
            return False
    return True
print(is_english('Instachat'))
print(is_english(ios[813][1]))

True
False


The function seems to work fine, but some of English apps names contain emojis and other symbols that fall outside ASCII range.

In [14]:
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

print(ord('™'))
print(ord('😜'))

False
False
8482
128540


To avoid removing that apps, I'll make a condition, that app is only removed, when its name contains more than 3 non ASCII characters.

In [15]:
def is_english(word):
    count = 0
    
    for letter in word:
        if ord(letter) > 127:
            count += 1
    
    if count > 3:    
        return False
    else:
        return True
print(is_english('Instachat 😜'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))

True
True
False


The condition is not perfect, but only few non-english apps can get through that filter, which should be sufficient for further analysis.

Below we use the `is_english()` function to filter out the non-English apps from both data sets.

In [16]:
android_english = []
ios_english = []

for app in android_clean:
    name = app[0]
    if is_english(name):
        android_english.append(app)
        
for app in ios:
    name = app[1]
    if is_english(name):
        ios_english.append(app)

print('number of English and non-English android apps:',len(android_clean),'\n'
'number of English and non-English ios apps:',len(ios))
print('number of Enlish android apps:',len(android_english))
print('number of English ios apps:',len(ios_english))        

number of English and non-English android apps: 9659 
number of English and non-English ios apps: 7197
number of Enlish android apps: 9500
number of English ios apps: 6100


# Isolating free apps

As mentioned in the introduction, we only develop apps, that are free to download and install, and our main source of revenue consists of in-app ads. Our data sets contain both free and non-free aps and we'll need to isolate the free aps for our analysis.

In [17]:
android_final = []
ios_final = []

for app in android_english:
    price = app[7]
    if price == '0':
        android_final.append(app)

for app in ios_english:
    price = app[4]
    if price == '0.0':
        ios_final.append(app)

print(len(android_final))
print(len(ios_final))

8760
3169


We are left with 8760 android and 3169 ios apps.

# Most common apps  by genre



def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

display_table(ios_final, -5)

We'll start by creating 2 functions. One creating a frequency table and second one displaying the frequency table in descending order.

In [18]:
def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

This is how the frequency table looks like for App Store `prime_genre` column

In [19]:
display_table(ios_final,-5)

Games : 58.53581571473651
Entertainment : 7.82581255916693
Photo & Video : 5.0489113284947935
Education : 3.72357210476491
Social Networking : 3.2817923635216157
Shopping : 2.5244556642473968
Utilities : 2.398232881035027
Sports : 2.1773430104133795
Music : 2.0511202272010096
Health & Fitness : 1.9880088355948247
Productivity : 1.7040075733669928
Lifestyle : 1.5462290943515304
News : 1.3253392237298833
Travel : 1.1360050489113285
Finance : 1.1044493531082362
Weather : 0.8520037866834964
Food & Drink : 0.8204480908804039
Reference : 0.5364468286525718
Business : 0.5364468286525718
Book : 0.3786683496371095
Navigation : 0.18933417481855475
Medical : 0.18933417481855475
Catalogs : 0.12622278321236985


We can see that among the free English apps, more than a half (58.16%) are games. Entertainment apps are close to 8%, followed by photo and video apps, which are close to 5%. Only 3.66% of the apps are designed for education, followed by social networking apps which amount for 3.29% of the apps in our data set.

Let's continue by examining the Genres and Category columns of the Google Play data set (two columns which seem to be related).

In [20]:
display_table(android_final, 1) # Category

FAMILY : 18.938356164383563
GAME : 9.657534246575343
TOOLS : 8.481735159817351
BUSINESS : 4.646118721461187
PRODUCTIVITY : 3.9383561643835616
LIFESTYLE : 3.9155251141552516
FINANCE : 3.721461187214612
MEDICAL : 3.550228310502283
SPORTS : 3.3333333333333335
PERSONALIZATION : 3.287671232876712
COMMUNICATION : 3.2534246575342465
HEALTH_AND_FITNESS : 3.093607305936073
PHOTOGRAPHY : 2.9794520547945202
NEWS_AND_MAGAZINES : 2.808219178082192
SOCIAL : 2.6484018264840183
TRAVEL_AND_LOCAL : 2.3401826484018264
SHOPPING : 2.2488584474885847
BOOKS_AND_REFERENCE : 2.146118721461187
DATING : 1.860730593607306
VIDEO_PLAYERS : 1.8036529680365299
MAPS_AND_NAVIGATION : 1.3812785388127853
FOOD_AND_DRINK : 1.2328767123287672
EDUCATION : 1.1757990867579908
ENTERTAINMENT : 0.9589041095890412
AUTO_AND_VEHICLES : 0.9246575342465754
LIBRARIES_AND_DEMO : 0.9018264840182649
WEATHER : 0.7876712328767124
HOUSE_AND_HOME : 0.7876712328767124
EVENTS : 0.7191780821917808
ART_AND_DESIGN : 0.6506849315068494
PARENTING : 

The results seem to be different on Google Play. The most popular category is family (18.94%), followed by game (9.66%), tools (8.48%) and business (4.64). Practical apps have much higher representation on Google Play compared to App store. This picture is also confirmed in the `genres` column

In [21]:
display_table(android_final, -4) # Genre

Tools : 8.470319634703197
Entertainment : 6.084474885844749
Education : 5.3881278538812785
Business : 4.646118721461187
Productivity : 3.9383561643835616
Lifestyle : 3.904109589041096
Finance : 3.721461187214612
Medical : 3.550228310502283
Sports : 3.4018264840182644
Personalization : 3.287671232876712
Communication : 3.2534246575342465
Action : 3.105022831050228
Health & Fitness : 3.093607305936073
Photography : 2.9794520547945202
News & Magazines : 2.808219178082192
Social : 2.6484018264840183
Travel & Local : 2.328767123287671
Shopping : 2.2488584474885847
Books & Reference : 2.146118721461187
Simulation : 2.054794520547945
Dating : 1.860730593607306
Arcade : 1.82648401826484
Video Players & Editors : 1.7808219178082192
Casual : 1.7351598173515983
Maps & Navigation : 1.3812785388127853
Food & Drink : 1.2328767123287672
Puzzle : 1.141552511415525
Racing : 1.004566210045662
Role Playing : 0.9474885844748858
Strategy : 0.9246575342465754
Auto & Vehicles : 0.9246575342465754
Libraries &

For further analysis I'll use only the *category* column, because it has less categories and gives a bigger picture.

One way to find out which genre is the most popular is to calculate the average number of installs per app in category. In Google Play dataset we can find it in *Installs* column. For the App Store dataset, this column is missing, so I'll use the number of ratings as an evaluation.

# Most popular apps by genre on App Store

Below, we calculate the average number of user ratings per app genre on the App Store:

In [22]:
genres_ios = freq_table(ios_final,-5)

for genre in genres_ios:
    len_genre = 0
    total = 0
    for app in ios_final:
        genre_app = app[-5]
        if genre_app == genre:
            total += float(app[5])
            len_genre += 1
    avg_ratings = total / len_genre
    print(genre,':',avg_ratings)

Social Networking : 72916.54807692308
Photo & Video : 28441.54375
Games : 22985.211320754715
Music : 58205.03076923077
Reference : 79350.4705882353
Health & Fitness : 24037.634920634922
Weather : 54215.2962962963
Utilities : 19900.473684210527
Travel : 31358.5
Shopping : 27816.2
News : 21750.071428571428
Navigation : 86090.33333333333
Lifestyle : 16739.34693877551
Entertainment : 14364.774193548386
Food & Drink : 33333.92307692308
Sports : 23008.898550724636
Book : 46384.916666666664
Finance : 32367.02857142857
Education : 7003.983050847458
Productivity : 21799.14814814815
Business : 7491.117647058823
Catalogs : 4004.0
Medical : 612.0


# Most popular apps by genre on Google Play

For the Google Play dataset we got actual information about number of installs, but the values are open ended and don't seem to be precise

In [23]:
display_table(android_final, 5)

1,000,000+ : 15.74200913242009
100,000+ : 11.518264840182649
10,000,000+ : 10.60502283105023
10,000+ : 10.205479452054794
1,000+ : 8.367579908675799
100+ : 6.952054794520548
5,000,000+ : 6.872146118721462
500,000+ : 5.5479452054794525
50,000+ : 4.7716894977168955
5,000+ : 4.486301369863014
10+ : 3.515981735159817
500+ : 3.2077625570776256
50,000,000+ : 2.28310502283105
100,000,000+ : 2.134703196347032
50+ : 1.9292237442922375
5+ : 0.7876712328767124
1+ : 0.5136986301369862
500,000,000+ : 0.273972602739726
1,000,000,000+ : 0.228310502283105
0+ : 0.045662100456621
0 : 0.01141552511415525


We're going to leave the numbers as they are, which means that we'll consider that an app with 100,000+ installs has 100,000 installs, and an app with 1,000,000+ installs has 1,000,000 installs, and so on.
To do the computations we'll need to convert each number to float.

In [24]:
categories_android = freq_table(android_final,1)

for category in categories_android:
    total = 0
    len_category = 0
    for app in android_final:
        category_app = app[1]
        if category_app == category:
            len_category += 1
            installs = app[5]
            installs = installs.replace("+","")
            installs = installs.replace(",","")
            total += float(installs)
    avg_installs = total / len_category
    print(category, ':', avg_installs)
            

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 654074.8271604938
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8329168.936170213
BUSINESS : 1712290.1474201474
COMICS : 859042.1568627451
COMMUNICATION : 38550548.03859649
DATING : 861409.5521472392
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11767380.952380951
EVENTS : 253542.22222222222
FINANCE : 1365500.4049079753
FOOD_AND_DRINK : 1951283.8055555555
HEALTH_AND_FITNESS : 4219697.055350553
HOUSE_AND_HOME : 1385541.463768116
LIBRARIES_AND_DEMO : 649314.0506329114
LIFESTYLE : 1447458.976676385
GAME : 15571586.690307328
FAMILY : 3716053.755274262
MEDICAL : 121161.87781350482
SOCIAL : 23628689.23275862
SHOPPING : 7103190.78680203
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3750580.6438356163
TRAVEL_AND_LOCAL : 14120454.07804878
TOOLS : 10902378.834454913
PERSONALIZATION : 5240358.986111111
PRODUCTIVITY : 16787331.344927534
PARENTING : 552875.1785714285
WEATHER : 5212877.101449275
VIDEO_PLAYERS : 24878048.860759493
NEWS_AND_MAGAZI

On average, communication apps have the most installs: 38,456,119. This number is inflated by a few apps that have over one billion installs (WhatsApp, Facebook Messenger, Skype, Google Chrome, Gmail, and Hangouts), and a few others with over 100 and 500 million installs:

In [25]:
for app in android_final:
    if app[1] == 'COMMUNICATION' and (app[5] == '1,000,000,000+'
                                      or app[5] == '500,000,000+'
                                      or app[5] == '100,000,000+'):
        print(app[0], ':', app[5])

WhatsApp Messenger : 1,000,000,000+
imo beta free calls and text : 100,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Messenger â Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Skype - free IM & video calls : 1,000,000,000+
Who : 100,000,000+
GO SMS Pro - Messenger, Free Themes, Emoji : 100,000,000+
LINE: Free Calls & Messages : 500,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Firefox Browser fast & private : 100,000,000+
UC Browser - Fast Download Private & Secure : 500,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Messenger Lite: Free Calls & Messages : 100,000,000+
Kik : 100,000,000+
KakaoTalk: Free Calls & Text : 100,000,000+
Opera Mini - fast web browser : 100,000,000+
Opera Browser: Fast and Secure : 100,000,000+
Telegram : 100,000,000+
Truecaller: Caller ID, SMS spam blocking & Dialer : 100,000,000+
UC Browser Mini -Tiny Fast Private & Secure : 100,000,000+
Viber Me

If we remove all the communication apps that have over 100 million installs, the average is reduced

In [26]:
under_100_m = []

for app in android_final:
    n_installs = app[5]
    n_installs = n_installs.replace(',', '')
    n_installs = n_installs.replace('+', '')
    if (app[1] == 'COMMUNICATION') and (float(n_installs) < 100000000):
        under_100_m.append(float(n_installs))
        
sum(under_100_m) / len(under_100_m)

3437620.895348837