# Profitable App Profiles for the App Store and Google Play Markets
The goal of this project is to identify mobile app profiles that are profitable on both the App Store and Google Play markets.

In [None]:
import pandas as pd
from csv import reader

# Slicing and Exploring Data

In [2]:
## Load both CSV files into lists of rows and split header from data.
# `with` guarantees each file handle is closed once it has been read.
# The encoding is given explicitly because the app names contain non-ASCII
# characters, and newline="" is what the csv module expects for file objects.

with open("AppleStore.csv", encoding="utf8", newline="") as data1:
    applereader=reader(data1)
    applelist=list(applereader)

with open("googleplaystore.csv", encoding="utf8", newline="") as data2:
    googlereader=reader(data2)
    googlelist=list(googlereader)

# Row 0 of each file is the header; everything after it is app data.
appleheader=applelist[0]
androidheader=googlelist[0]
ios=applelist[1:]
android=googlelist[1:]

In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if len(dataset) else 0)

In [4]:
# Preview the first three rows of the raw App Store list. `applelist` still
# contains the header row, so the reported row count includes it.
explore_data(applelist,0,3,True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


In [5]:
# Preview rows 2-99 of the Google Play data (header already removed) and
# print the dataset's dimensions.
explore_data(android,2,100,True)

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


['Smoke Effect Photo Maker - Smoke Editor', 'ART_AND_DESIGN', '3.8', '178', '19M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'April 26, 2018', '1.1', '4.0.3 and up']


['Infinite Painter', 'ART_AND_DESIGN', '4.1', '36815', '29M', '1,000,000+', 'Free', '0', 'Everyone', 

# DATA CLEANING STEP 1: get rid of errors and duplicate values

In [9]:
# Row 10472 has a malformed number-of-reviews value that makes the later
# string-to-float conversion fail, so it is dropped here.
# NOTE(review): deleting by position is not idempotent -- re-running this
# cell removes a different row each time. Run it exactly once per data load.
del android[10472]

In [12]:
# Display the Google Play column names, used to look up column indices.
androidheader

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [13]:
#### Finding duplicate values
# An app name counts as a duplicate each time it reappears after its first
# occurrence. Membership is tested against a set (O(1)) rather than by
# scanning the growing list of unique names (O(n) per row).

## FOR ANDROID APPS (name is column 0):
duplicate_apps = []
unique_apps = []
seen_android_names = set()

for app in android:
    name = app[0]
    if name in seen_android_names:
        duplicate_apps.append(name)
    else:
        seen_android_names.add(name)
        unique_apps.append(name)

print('Number of duplicate android apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])


print("\n")

## FOR IOS APPS (name is column 1):

ios_duplicate_apps = []
ios_unique_apps = []
seen_ios_names = set()

for app in ios:
    name = app[1]
    if name in seen_ios_names:
        ios_duplicate_apps.append(name)
    else:
        seen_ios_names.add(name)
        ios_unique_apps.append(name)

print('Number of duplicate ios apps:', len(ios_duplicate_apps))
print('\n')
print('Examples of duplicate apps:', ios_duplicate_apps[:15])


Number of duplicate android apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


Number of duplicate ios apps: 2


Examples of duplicate apps: ['Mannequin Challenge', 'VR Roller Coaster']


In [14]:
## Validating data cleaning
# The expected sizes are derived from the duplicate lists computed earlier
# rather than hard-coded, so they stay correct if the input data changes.
print("Total number of android apps",len(android))
print("Number of duplicate android apps:",len(duplicate_apps))
print("Expected number of android apps after removing duplicated",len(android)-len(duplicate_apps))

print("\n")


print("Total number of ios apps",len(ios))
print("Number of duplicate ios apps:",len(ios_duplicate_apps))
print("Expected number of ios apps after removing duplicated",len(ios)-len(ios_duplicate_apps))





Total number of android apps 10840
Number of duplicate android apps: 1181
Expected number of android apps after removing duplicated 9659


Total number of ios apps 7197
Number of duplicate ios apps: 2
Expected number of ios apps after removing duplicated 7195


Duplicate entries for the same app can have different numbers of reviews. For each app we keep only the entry with the highest number of reviews and discard the duplicates that have fewer (or the same number of) reviews:

In [15]:
## Android: for every app name (column 0), record the largest review count
## (column 3) found among its rows.
reviews_max = {}

for row in android:
    app_name, review_count = row[0], float(row[3])

    # Store the count for a new name, or when it beats the stored maximum.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

In [16]:
### iOS: for every app name (column 1), record the largest total rating
### count (column 5) found among its rows.
reviews_ios_max = {}

for row in ios:
    title = row[1]
    count = float(row[5])
    # A name not seen before falls back to its own count.
    reviews_ios_max[title] = max(reviews_ios_max.get(title, count), count)


In [17]:
# Number of distinct iOS app names; should equal the expected count after
# removing duplicates.
len(reviews_ios_max)

7195

In [18]:
# Build the de-duplicated Android dataset: for each app name keep only the
# first row whose review count equals that app's maximum.
android_clean_data=[]
already_added=[]
added_names=set()  # mirrors already_added for O(1) membership checks

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # The name check is needed because several duplicate rows can share the
    # same (maximum) review count.
    if (reviews_max[name] == n_reviews) and (name not in added_names):
        android_clean_data.append(app)
        already_added.append(name)
        added_names.add(name)

In [19]:
# Sanity check: should match the expected Android count after duplicate
# removal.
len(android_clean_data)

9659

In [20]:
# Build the de-duplicated iOS dataset: for each app name keep only the first
# row whose total rating count equals that app's maximum.
ios_clean_data=[]
ios_already_added=[]
ios_added_names=set()  # mirrors ios_already_added for O(1) membership checks

for app in ios:
    name = app[1]
    n_reviews = float(app[5])

    # The name check is needed because several duplicate rows can share the
    # same (maximum) rating count.
    if (reviews_ios_max[name] == n_reviews) and (name not in ios_added_names):
        ios_clean_data.append(app)
        ios_already_added.append(name)
        ios_added_names.add(name)

In [21]:
# Sanity check: should match the expected iOS count after duplicate removal.
len(ios_clean_data)
### The count matches the expectation, so the duplicate removal looks correct.


7195

# DATA CLEANING STEP 2: Keeping English-only apps



This can be done with the built-in `ord` function, which returns the numeric code of a single character.
Characters commonly used in English text all fall in the ASCII range (0 to 127), so we can detect non-English names by checking the code of each character in the name,
i.e. a character with a code higher than 127 is outside the ASCII range, which suggests the name is not English.

Some English app names contain a few non-ASCII characters (emoji, or symbols such as ™), so a name is allowed up to 3 non-ASCII characters. A name with more than 3 non-ASCII characters is classified as non-English.


In [22]:
def is_english(string, max_non_ascii=3):
    """Return True if *string* is likely to be an English app name.

    A character counts as non-ASCII when its code point is above 127.
    A few such characters (emoji, trademark signs, ...) are tolerated.

    Args:
        string: The app name to check.
        max_non_ascii: Largest number of non-ASCII characters a name may
            contain and still be treated as English. Defaults to 3.

    Returns:
        False when the name has more than ``max_non_ascii`` non-ASCII
        characters, True otherwise.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii
################################################
# Partition the de-duplicated Android apps by whether the app name
# (column 0) passes the is_english check; rejected rows are kept in deldata.
english_clean_android_data=[]
deldata=[]

for row in android_clean_data:
    bucket = english_clean_android_data if is_english(row[0]) else deldata
    bucket.append(row)

print(len(english_clean_android_data))

9614


In [23]:
def is_english(string, max_non_ascii=3):
    """Decide whether an app name should be treated as English.

    Characters whose code point exceeds 127 are counted as non-ASCII; a
    name is accepted as long as it has at most ``max_non_ascii`` of them.

    Args:
        string: The app name to check.
        max_non_ascii: Number of non-ASCII characters tolerated in a name.
            Defaults to 3.

    Returns:
        True if the name has at most ``max_non_ascii`` non-ASCII
        characters, False otherwise.
    """
    non_ascii = 0

    for character in string:
        if ord(character) > 127:
            non_ascii += 1

    return non_ascii <= max_non_ascii
################################################
# Partition the de-duplicated iOS apps by whether the app name (column 1)
# passes the is_english check; rejected rows are kept in deliosdata.
english_clean_ios_data=[]
deliosdata=[]

for row in ios_clean_data:
    if not is_english(row[1]):
        deliosdata.append(row)
        continue
    english_clean_ios_data.append(row)

print(len(english_clean_ios_data))

6181


# DATA CLEANING STEP 3: Isolating the free apps for the analysis

In [24]:
# Split the English Android apps into free and paid using the 'Type'
# column (index 6): "Free" goes to the free list, anything else to paid.
final_android_free=[]
final_android_paid=[]
for row in english_clean_android_data:
    if row[6] != "Free":
        final_android_paid.append(row)
        continue
    final_android_free.append(row)

In [25]:
# Split the English iOS apps into free and paid using the 'price' column
# (index 4).
final_ios_free=[]
final_ios_paid=[]
for app in english_clean_ios_data:
    # Compare numerically so that other spellings of zero ("0", "0.00")
    # are also recognised as free, not only the exact string "0.0".
    # NOTE(review): assumes every price value parses as a number -- confirm.
    price=float(app[4])

    if price == 0.0:
        final_ios_free.append(app)
    else:
        final_ios_paid.append(app)

In [26]:
# Report how many free, English, de-duplicated apps remain for each store.
print("Total number of clean android apps:",len(final_android_free))

print("Total number of clean ios apps:    ",len(final_ios_free))

Total number of clean android apps: 8863
Total number of clean ios apps:     3220


# DATA ANALYSIS

# Most Common Apps by Genre


In [31]:
def freqtable(dataset,index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: Iterable of rows.
        index: Position of the column to tabulate within each row.

    Returns:
        A dict mapping each distinct column value to the percentage of rows
        that have it, in order of first appearance.
    """
    counts = {}
    n_rows = 0

    for row in dataset:
        n_rows += 1
        key = row[index]
        counts[key] = counts.get(key, 0) + 1

    # Convert the absolute counts into percentages of all rows.
    return {key: (count / n_rows) * 100 for key, count in counts.items()}
        
        
        
def display_table(dataset,index):
    """Print the percentage frequency table of one column, largest first.

    Args:
        dataset: Iterable of rows.
        index: Position of the column to tabulate within each row.
    """
    shares = freqtable(dataset,index)

    # Sort on (percentage, value) pairs so the ordering is by percentage.
    ranked = sorted(((pct, label) for label, pct in shares.items()), reverse=True)

    for pct, label in ranked:
        print(label,": ",pct)
    
            
        
    
    
    

In [34]:
## Percentage frequency table of the free Android apps by the 'Genres'
## column (index 9).
display_table(final_android_free,9)

Tools :  8.450863138892023
Entertainment :  6.070179397495204
Education :  5.348076272142616
Business :  4.592124562789123
Productivity :  3.8925871601038025
Lifestyle :  3.8925871601038025
Finance :  3.7007785174320205
Medical :  3.5315355974275078
Sports :  3.463838429425702
Personalization :  3.317161232088458
Communication :  3.2381812027530184
Action :  3.102786866749408
Health & Fitness :  3.0802211440821394
Photography :  2.944826808078529
News & Magazines :  2.798149610741284
Social :  2.6627552747376737
Travel & Local :  2.324269434728647
Shopping :  2.245289405393208
Books & Reference :  2.1437436533904997
Simulation :  2.042197901387792
Dating :  1.8616721200496444
Arcade :  1.8503892587160102
Video Players & Editors :  1.771409229380571
Casual :  1.7601263680469368
Maps & Navigation :  1.399074805370642
Food & Drink :  1.241114746699763
Puzzle :  1.128286133363421
Racing :  0.9928917973598104
Role Playing :  0.9364774906916393
Libraries & Demo :  0.9364774906916393
Auto & V

In [35]:
## Percentage frequency table of the free Android apps by the 'Category'
## column (index 1).

display_table(final_android_free,1)

FAMILY :  18.898792733837304
GAME :  9.725826469592688
TOOLS :  8.462146000225657
BUSINESS :  4.592124562789123
LIFESTYLE :  3.9038700214374367
PRODUCTIVITY :  3.8925871601038025
FINANCE :  3.7007785174320205
MEDICAL :  3.5315355974275078
SPORTS :  3.396141261423897
PERSONALIZATION :  3.317161232088458
COMMUNICATION :  3.2381812027530184
HEALTH_AND_FITNESS :  3.0802211440821394
PHOTOGRAPHY :  2.944826808078529
NEWS_AND_MAGAZINES :  2.798149610741284
SOCIAL :  2.6627552747376737
TRAVEL_AND_LOCAL :  2.335552296062281
SHOPPING :  2.245289405393208
BOOKS_AND_REFERENCE :  2.1437436533904997
DATING :  1.8616721200496444
VIDEO_PLAYERS :  1.7939749520478394
MAPS_AND_NAVIGATION :  1.399074805370642
FOOD_AND_DRINK :  1.241114746699763
EDUCATION :  1.1621347173643235
ENTERTAINMENT :  0.9590432133589079
LIBRARIES_AND_DEMO :  0.9364774906916393
AUTO_AND_VEHICLES :  0.9251946293580051
HOUSE_AND_HOME :  0.8236488773552973
WEATHER :  0.8010831546880289
EVENTS :  0.7108202640189552
PARENTING :  0.65440

# iOS DATA

In [37]:
## Percentage frequency table of the free iOS apps by the 'prime_genre'
## column (index 11).

display_table(final_ios_free,11)

Games :  58.13664596273293
Entertainment :  7.888198757763975
Photo & Video :  4.968944099378882
Education :  3.6645962732919255
Social Networking :  3.291925465838509
Shopping :  2.608695652173913
Utilities :  2.515527950310559
Sports :  2.142857142857143
Music :  2.049689440993789
Health & Fitness :  2.018633540372671
Productivity :  1.7391304347826086
Lifestyle :  1.5838509316770186
News :  1.3354037267080745
Travel :  1.2422360248447204
Finance :  1.1180124223602486
Weather :  0.8695652173913043
Food & Drink :  0.8074534161490683
Reference :  0.5590062111801243
Business :  0.5279503105590062
Book :  0.43478260869565216
Navigation :  0.18633540372670807
Medical :  0.18633540372670807
Catalogs :  0.12422360248447205


Calculating the average popularity of each App Store genre. The App Store data has no install counts, so the total number of user ratings is used as a proxy for the number of installs.

In [48]:

# Average number of user ratings per genre for the free iOS apps.
# average_ios accumulates the rating totals and freq_ios the number of apps
# per genre. The counts are gathered in the same pass because the previously
# called helper `freqtabletemp` is not defined in this notebook, and
# `freqtable` returns percentages, which cannot be used as a divisor here.
average_ios={}
freq_ios={}
for app in final_ios_free:
    genre=app[-5]  # 'prime_genre' column (index 11 of 16)
    ratings=float(app[5])  # 'rating_count_tot' column
    average_ios[genre] = average_ios.get(genre, 0.0) + ratings
    freq_ios[genre] = freq_ios.get(genre, 0) + 1

for app in average_ios:
    print(app,":",average_ios[app]/freq_ios[app])

Utilities : 18684.456790123455
News : 21248.023255813954
Medical : 612.0
Shopping : 26919.690476190477
Photo & Video : 28441.54375
Travel : 28243.8
Productivity : 21028.410714285714
Finance : 31467.944444444445
Food & Drink : 33333.92307692308
Navigation : 86090.33333333333
Education : 7003.983050847458
Business : 7491.117647058823
Sports : 23008.898550724636
Book : 39758.5
Entertainment : 14029.830708661417
Health & Fitness : 23298.015384615384
Catalogs : 4004.0
Reference : 74942.11111111111
Lifestyle : 16485.764705882353
Social Networking : 71548.34905660378
Weather : 52279.892857142855
Games : 22812.92467948718
Music : 57326.530303030304


On average, navigation apps have the highest number of user reviews, but this figure is heavily influenced by Waze and Google Maps, which have close to half a million user reviews together:



In [61]:
# List every free iOS app in the Navigation genre with its total number of
# user ratings.
navigation_rows = (row for row in final_ios_free if row[-5] == 'Navigation')
for row in navigation_rows:
    print(row[1], ':', row[5])

Waze - GPS Navigation, Maps & Real-time Traffic : 345046
Google Maps - Navigation & Transit : 154911
Geocaching® : 12811
CoPilot GPS – Car Navigation & Offline Maps : 3582
ImmobilienScout24: Real Estate Search in Germany : 187
Railway Route Search : 5


Calculating the average number of installs for each category of Google Play apps, based on the Installs column

# ANDROID DATA

In [46]:
# Average number of installs per category for the free Android apps.
# average_android accumulates the install totals and freq_android the number
# of apps per category. The counts are gathered in the same pass because the
# previously called helper `freqtabletemp` is not defined in this notebook,
# and `freqtable` returns percentages, which cannot be used as a divisor.
average_android={}
freq_android={}
import re  # not used in this cell; kept in case other cells rely on it
for app in final_android_free:
    genre=app[1]  # 'Category' column

    # Installs look like '5,000,000+': strip the separators and the plus
    # sign before converting to a number.
    installs=float(app[5].replace(',','').replace('+',''))

    average_android[genre] = average_android.get(genre, 0.0) + installs
    freq_android[genre] = freq_android.get(genre, 0) + 1

for app in average_android:
    print(app,":",average_android[app]/freq_android[app])

EDUCATION : 1833495.145631068
BOOKS_AND_REFERENCE : 8767811.894736841
COMICS : 817657.2727272727
MAPS_AND_NAVIGATION : 4056941.7741935486
NEWS_AND_MAGAZINES : 9549178.467741935
LIBRARIES_AND_DEMO : 638503.734939759
TOOLS : 10801391.298666667
VIDEO_PLAYERS : 24727872.452830188
ART_AND_DESIGN : 1986335.0877192982
PRODUCTIVITY : 16787331.344927534
TRAVEL_AND_LOCAL : 13984077.710144928
BUSINESS : 1712290.1474201474
ENTERTAINMENT : 11640705.88235294
GAME : 15588015.603248259
LIFESTYLE : 1437816.2687861272
PARENTING : 542603.6206896552
PERSONALIZATION : 5201482.6122448975
MEDICAL : 120550.61980830671
EVENTS : 253542.22222222222
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
HEALTH_AND_FITNESS : 4188821.9853479853
SHOPPING : 7036877.311557789
DATING : 854028.8303030303
HOUSE_AND_HOME : 1331540.5616438356
WEATHER : 5074486.197183099
FAMILY : 3697848.1731343283
COMMUNICATION : 38456119.167247385
SOCIAL : 23253652.127118643
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.736363

On average, communication apps have the most installs: 38,456,119. This number is heavily skewed upward by a few apps that have over one billion installs (WhatsApp, Facebook Messenger, Skype, Google Chrome, Gmail, and Hangouts), and by a few others with over 100 million or over 500 million installs:



In [57]:
# Show the free COMMUNICATION apps that fall in the three largest install
# brackets.
top_brackets = ('1,000,000,000+', '500,000,000+', '100,000,000+')
for row in final_android_free:
    if row[1] != 'COMMUNICATION':
        continue
    if row[5] in top_brackets:
        print(row[0], ':', row[5])

WhatsApp Messenger : 1,000,000,000+
imo beta free calls and text : 100,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Messenger – Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Skype - free IM & video calls : 1,000,000,000+
Who : 100,000,000+
GO SMS Pro - Messenger, Free Themes, Emoji : 100,000,000+
LINE: Free Calls & Messages : 500,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Firefox Browser fast & private : 100,000,000+
UC Browser - Fast Download Private & Secure : 500,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Messenger Lite: Free Calls & Messages : 100,000,000+
Kik : 100,000,000+
KakaoTalk: Free Calls & Text : 100,000,000+
Opera Mini - fast web browser : 100,000,000+
Opera Browser: Fast and Secure : 100,000,000+
Telegram : 100,000,000+
Truecaller: Caller ID, SMS spam blocking & Dialer : 100,000,000+
UC Browser Mini -Tiny Fast Private & Secure : 100,000,000+
Viber Mess


If we removed all the communication apps that have over 100 million installs, the average would be reduced roughly ten times:




In [59]:
# Average installs of the free COMMUNICATION apps that have fewer than
# 100 million installs.
under_100_m = []

for row in final_android_free:
    if row[1] != 'COMMUNICATION':
        continue
    # Installs look like '5,000,000+': strip separators and the plus sign.
    install_count = float(row[5].replace(',', '').replace('+', ''))
    if install_count < 100000000:
        under_100_m.append(install_count)

sum(under_100_m) / len(under_100_m)

3603485.3884615386

# Conclusions
In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.

