# First Project: Porfitable App Profiles for the App Store and Google Play Markets

In this mini project we look at data from the GooglePlay store and the Apple store. First, we have to clean the data and then

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [2]:
from csv import reader

In [3]:
AppleStore_open = open('AppleStore.csv')
GooglePlay_open = open('googleplaystore.csv')
AppleStore_all = list(reader(AppleStore_open))
GooglePlay_all = list(reader(GooglePlay_open))
AppleStore_header = AppleStore_all[0]
AppleStore_data = AppleStore_all[1:]
GooglePlay_header = GooglePlay_all[0]
GooglePlay_data = GooglePlay_all[1:]

In [4]:
AppleStore_header

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [5]:
GooglePlay_header

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [6]:
explore_data(AppleStore_data,1,3, True)

['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


In [7]:
explore_data(GooglePlay_data,1,3,True)

['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


## data cleaning:

It is said in the discussion section that row 10472 of the GooglePlay data has an error:

In [8]:
GooglePlay_data[10472]

['Life Made WI-Fi Touchscreen Photo Frame',
 '1.9',
 '19',
 '3.0M',
 '1,000+',
 'Free',
 '0',
 'Everyone',
 '',
 'February 11, 2018',
 '1.0.19',
 '4.0 and up']

It looks like the first entry is missing and all entries shifted. We will delete this row:

In [9]:
del GooglePlay_data[10472]

The next step is to delete duplicates:

In [10]:
GooglePlay_no_dupl = []
GooglePlay_dupl = []
for row in GooglePlay_data:
    name = row[1]
    if name in GooglePlay_no_dupl:
        GooglePlay_dupl.append(row)
    else:
        GooglePlay_no_dupl.append(row)

The next task is to create a dictionary: the keys are the unique app names and the values are the  highest number of reviews of that app. This is a different way to remove duplicates:

In [11]:
reviews_max = {}
for row in GooglePlay_data:
    name = row[0]
    n_reviews = float(row[3])
    if (name in reviews_max) and (reviews_max[name] < n_reviews):
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In [12]:
android_clean = []
already_added = []
for row in GooglePlay_data:
    name = row[0]
    n_reviews = float(row[3])
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(row)
        already_added.append(name)

In [13]:
explore_data(android_clean,1,3, True)

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


The analysis is done on free apps available in English. Hence, rows with characters that are hint at a different language than English will be deleted. These characters will be found using the ascii code funtion `ord()`:

In [14]:
(ord('z'),chr(123),ord('A'),chr(64),ord('0'),ord('9'),chr(47), chr(58), ord(' '))

(122, '{', 65, '@', 48, 57, '/', ':', 32)

It seems best to discard any row of which the second character has an ascii code above 127:

In [15]:
AppleStore_data_English = []
row_list = []
for num, row in enumerate(AppleStore_data):
    name = row[1]
    for letter in range(1,2):
        if  ord(name[letter]) > 127  :
            #print(num)
            #print(name[letter],ord(name[letter]))
            row_list.append(num)
        else:
            AppleStore_data_English.append(row)
          
        
GooglePlay_data_English = []
row_list2 = []
for num, row in enumerate(android_clean):
    name = row[1]
    for letter in range(1,2):
        if  ord(name[letter]) > 127  :
            #print(num)
            #print(name[letter],ord(name[letter]))
            row_list2.append(num)
        else:
            GooglePlay_data_English.append(row)
          

In [16]:
GooglePlay_data_English[:1]

[['Photo Editor & Candy Camera & Grid & ScrapBook',
  'ART_AND_DESIGN',
  '4.1',
  '159',
  '19M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'January 7, 2018',
  '1.0.0',
  '4.0.3 and up']]

The next step is to isolate free apps in a separate list:

In [17]:
android_free = []
Apple_free = []
for row in GooglePlay_data_English:
    price = row[7]
    if price == '0':
        android_free.append(row)

for row in AppleStore_data_English:
    price = row[4]
    if price == '0.0':
        Apple_free.append(row)

In [18]:
#AppleStore_data_English[3]

In [19]:
(len(android_free), len(Apple_free))

(8905, 3302)

Now we calculate a frequency table. We are interested how many apps are attributed to each genre. We define two general functions to create a frequency table and to display it:

In [20]:
def freq_table(dataset, index):
    freq_dict = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in freq_dict:
            freq_dict[value] += 1
        else:
            freq_dict[value] = 1
    
    freq_dict_percentages = {}
    for key in freq_dict:
        percentage = (freq_dict[key]/total)*100
        freq_dict_percentages[key] = percentage 
    
    return freq_dict_percentages


def display_table(dataset, index):
    freq_dict = freq_table(dataset, index)
    freq_display = []
    for key in freq_dict:
        key_val_as_tuple = (freq_dict[key], key)
        freq_display.append(key_val_as_tuple)
        
    freq_sorted = sorted(freq_display, reverse = True)
    for entry in freq_sorted:
        print(entry[1], ':', entry[0])

We now examine the dataset `android_free` and `Apple_free` using the definitions from above:

In [21]:
display_table(Apple_free, -5)

Games : 57.60145366444579
Entertainment : 7.995154451847365
Photo & Video : 4.875832828588734
Education : 3.573591762568141
Social Networking : 3.3615990308903694
Utilities : 2.695336159903089
Shopping : 2.543912780133253
Sports : 2.0896426408237434
Health & Fitness : 2.059357964869776
Music : 2.0290732889158085
Productivity : 1.756511205330103
Lifestyle : 1.6656571774682012
News : 1.362810417928528
Finance : 1.2113870381586918
Travel : 1.1811023622047243
Food & Drink : 0.8782556026650515
Weather : 0.8479709267110842
Book : 0.5754088431253785
Reference : 0.514839491217444
Business : 0.514839491217444
Navigation : 0.27256208358570566
Medical : 0.21199273167777105
Catalogs : 0.18170805572380377


In [22]:
display_table(android_free, 1)

FAMILY : 18.97810218978102
GAME : 9.70241437394722
TOOLS : 8.433464345873105
BUSINESS : 4.581695676586187
LIFESTYLE : 3.9303761931499155
PRODUCTIVITY : 3.885457608085345
FINANCE : 3.6833239752947784
MEDICAL : 3.5148792813026386
SPORTS : 3.3801235261089273
PERSONALIZATION : 3.312745648512072
COMMUNICATION : 3.2341381246490735
HEALTH_AND_FITNESS : 3.065693430656934
PHOTOGRAPHY : 2.9421673217293653
NEWS_AND_MAGAZINES : 2.829870859067939
SOCIAL : 2.6501965188096577
TRAVEL_AND_LOCAL : 2.3245367770915215
SHOPPING : 2.2459292532285233
BOOKS_AND_REFERENCE : 2.1785513756316677
DATING : 1.8528916339135317
VIDEO_PLAYERS : 1.7967434025828188
MAPS_AND_NAVIGATION : 1.4149354295339696
FOOD_AND_DRINK : 1.235261089275688
EDUCATION : 1.167883211678832
ENTERTAINMENT : 0.9545199326221224
LIBRARIES_AND_DEMO : 0.9320606400898372
AUTO_AND_VEHICLES : 0.9208309938236946
HOUSE_AND_HOME : 0.8197641774284109
WEATHER : 0.7973048848961257
EVENTS : 0.7074677147669848
PARENTING : 0.6513194834362718
ART_AND_DESIGN : 0

We see that the most apps from the Apple store are apps that belong to the genre "Games", wheres most apps from GooglePlay belong to the genre "Family" followed by "Games". However, we see that iOS and android have different genres and therefore the comparison is not optimal.

New we play examine the data for the most popular genre:

In [23]:
prime_genre = freq_table(Apple_free, -5)

for genre in prime_genre:
    total = 0
    len_genre = 0
    for item in Apple_free:
        genre_app = item[-5]
        if genre_app == genre:            
            n_ratings = float(item[5])
            total += n_ratings
            len_genre += 1
    average_ratings = total / len_genre
    print(genre, ':', average_ratings)

Medical : 525.4285714285714
Finance : 28322.4
Food & Drink : 29886.931034482757
Music : 56482.02985074627
News : 20303.666666666668
Education : 7003.983050847458
Weather : 52279.03571428572
Catalogs : 2669.3333333333335
Health & Fitness : 22278.352941176472
Social Networking : 68341.3063063063
Reference : 79350.4705882353
Games : 22453.729758149315
Travel : 28959.5641025641
Photo & Video : 28264.888198757762
Productivity : 20303.310344827587
Business : 7491.117647058823
Utilities : 17058.719101123595
Lifestyle : 15296.236363636364
Sports : 23003.985507246376
Shopping : 26902.52380952381
Navigation : 57393.555555555555
Entertainment : 13498.549242424242
Book : 29310.736842105263


It seemd "Reference" followed by "Social Networking" are the most popular ones.

For GooglePlay we look at "category" instead of "genre":

In [24]:
categories_android = freq_table(android_free, 1)

for category in categories_android:
    total = 0
    len_category = 0
    for item in android_free:
        category_app = item[1]
        if category_app == category:            
            installs = item[5]
            installs = installs.replace(',', '')
            installs = installs.replace('+', '')
            total += float(installs)
            len_category += 1
    average_installs = total / len_category
    print(category, ':', average_installs)

ENTERTAINMENT : 11640705.88235294
PARENTING : 542603.6206896552
TOOLS : 10787009.952063914
MAPS_AND_NAVIGATION : 3993339.603174603
FINANCE : 1387692.475609756
LIBRARIES_AND_DEMO : 638503.734939759
EVENTS : 253542.22222222222
BOOKS_AND_REFERENCE : 8587351.855670104
GAME : 15551995.891203703
COMMUNICATION : 38322625.697916664
SPORTS : 3638640.1428571427
WEATHER : 5074486.197183099
SOCIAL : 23253652.127118643
PHOTOGRAPHY : 17772018.759541985
VIDEO_PLAYERS : 24573948.25
TRAVEL_AND_LOCAL : 13984077.710144928
ART_AND_DESIGN : 1952105.1724137932
BEAUTY : 513151.88679245283
AUTO_AND_VEHICLES : 647317.8170731707
FOOD_AND_DRINK : 1924897.7363636363
FAMILY : 3668870.823076923
EDUCATION : 1825480.7692307692
HEALTH_AND_FITNESS : 4188821.9853479853
PERSONALIZATION : 5183850.806779661
COMICS : 803234.8214285715
BUSINESS : 1708215.906862745
MEDICAL : 120550.61980830671
SHOPPING : 7001693.425
HOUSE_AND_HOME : 1331540.5616438356
PRODUCTIVITY : 16738957.554913295
NEWS_AND_MAGAZINES : 9401635.952380951
LI

"Communication" is doubtlessly leading in numbers of downloads.

In summary, we looked at data from the GooglePlay store and the Apple store. First we cleaned the data by deleting duplicates and discarding apps that are not English. Finally, we did some data analysis by looking at the popularity of the different genres and categories.