In [12]:
## Packages needed in this project
from csv import reader

# 1) Opening and Exploring the Data

### Writing a function to explore the datasets

In [13]:
def explore_data(dataset, start, end, rows_and_columns=False, empty_lines=False):
    """
    Print a slice of a dataset and, optionally, the dimensions of the whole dataset.

    Arguments:
        - dataset (list): list of lists of all data contained in a dataset.
        - start (int): start index for the "dataset" list.
        - end (int): end index (exclusive) for the "dataset" list.
        - rows_and_columns (boolean): prints number of rows and columns if true (default: false).
        - empty_lines (boolean): prints extra blank lines after each row if true (default: false).

    Returns:
        - None. The rows dataset[start:end] are printed, not returned.
    """

    for row in dataset[start:end]:
        print(row)
        if empty_lines:
            print("\n")  # visually separates consecutive rows

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of failing on dataset[0].
        print('Number of columns:', len(dataset[0]) if dataset else 0)

### Opening both datasets and saving them as list of lists

In [14]:
# Read each CSV inside a `with` block so the file handle is closed as soon as
# all rows are in memory. newline="" is the setting the csv module documents
# for files handed to csv.reader.
with open("AppleStore.csv", encoding="utf8", newline="") as apple_store_file:
    apple_store_reader = reader(apple_store_file)
    apple_store_list = list(apple_store_reader)

with open("googleplaystore.csv", encoding="utf8", newline="") as google_store_file:
    google_store_reader = reader(google_store_file)
    google_store_list = list(google_store_reader)

apple_store_header = apple_store_list[0]  # header row of dataset
apple_store = apple_store_list[1:]  # dataset without header row (new list)
google_store_header = google_store_list[0]  # header row of dataset
google_store = google_store_list[1:]  # dataset without header row (new list)

### Explore both datasets using the explore_data() function



In [15]:
# Preview the first three rows of each store, followed by its dimensions.
print("Apple store:")
explore_data(apple_store, start=0, end=3, rows_and_columns=True)

print("-----------------------------")

print("Google play store:")
explore_data(google_store, start=0, end=3, rows_and_columns=True)

Apple store:
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']
['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']
['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']
Number of rows: 7197
Number of columns: 16
-----------------------------
Google play store:
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', '

### Print the column names, and try to identify the columns that could help us with our analysis.

In [16]:
# Show the column names of both datasets, App Store first.
for header_row in (apple_store_header, google_store_header):
    print(header_row)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Documentation for the description of the dataset columns: [Apple Store Dataset](https://www.kaggle.com/datasets/ramamet4/app-store-apple-data-set-10k-apps), [Google Play Store Dataset](https://www.kaggle.com/datasets/lava18/google-play-store-apps)


Useful columns for our analysis in the apple store dataset: 
* **track_name**
* **currency**
* **price**
* **rating_count_tot**
* **user_rating**
* **prime_genre**
* **cont_rating**

Useful columns for our analysis in the google play store dataset: 
* **App** (Application name)
* **Category** (Category the app belongs to)
* **Rating** (Overall user rating of the app (as when scraped))
* **Reviews** (Number of user reviews for the app (as when scraped))
* **Installs** (Number of user downloads/installs for the app (as when scraped))
* **Type** (Paid or Free)
* **Price** (Price of the app (as when scraped))
* **Genres** (An app can belong to multiple genres (apart from its main category). For eg, a musical family game will belong to Music, Game, Family genres.)


# 2) Deleting Wrong Data


[One of the discussions](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015) on the Google Play Store dataset on Kaggle outlines an error in row 10472. Therefore, we will write a function to check whether any rows are incorrect (e.g. rows with missing values).

In [17]:
def check_data(store, header):
    """
    This function checks if the length of each row is identical with the length of the header of the dataset.

    Arguments:
        - store (list): list of lists of all data contained in a dataset.
        - header (list): header list of the dataset.

    Returns:
        - a list with the index (position in "store") of every row whose
          length differs from the length of the header.
    """
    expected_length = len(header)
    # enumerate() yields each row's own position. Looking the row up by value
    # would return the first equal row, which is wrong when a malformed row
    # occurs more than once.
    return [index for index, row in enumerate(store) if len(row) != expected_length]


# Collect the positions of the malformed rows in each dataset.
error_rows_apple = check_data(store=apple_store, header=apple_store_header)
error_rows_google = check_data(store=google_store, header=google_store_header)

print(error_rows_apple, error_rows_google)

[] [10472]


Removing all incorrect rows. 

In [18]:
def remove_rows(store, error_rows, store_name):
    """
    This function deletes wrong rows from a dataset, in place.

    Arguments:
        - store (list): list of lists of all data contained in a dataset.
          The list itself is modified.
        - error_rows (list): indices (positions in "store") of the incorrect rows.
        - store_name (string): name of the store dataset.

    Returns:
        - None. The number of removed rows is printed.
    """
    num_removed_rows = 0
    # Delete from the highest index down: removing a row shifts every later
    # row one position to the left, so deleting in ascending order would hit
    # the wrong rows. The set drops repeated indices so no index is used twice.
    for row_index in sorted(set(error_rows), reverse=True):
        del store[row_index]
        num_removed_rows += 1

    print(f"Number of removed rows from the {store_name}: {num_removed_rows}")

# remove_rows() edits the lists in place: run this cell once per kernel
# session, otherwise the same positions are deleted a second time.
remove_rows(store=google_store, error_rows=error_rows_google, store_name="Google Play Store")
remove_rows(store=apple_store, error_rows=error_rows_apple, store_name="Apple Store")

Number of removed rows from the Google Play Store: 1
Number of removed rows from the Apple Store: 0


# 3) Removing Duplicate Entries

Some apps have duplicate entries in the dataset. Here is an example:

In [19]:
# Print every Google Play row whose app name (column 0) is "Instagram".
instagram_rows = [entry for entry in google_store if entry[0] == "Instagram"]
for entry in instagram_rows:
    print(entry)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


Writing a function which finds the number of duplicate entries in each store. 

In [20]:
def duplicate_apps(store, store_name, name_index=0):
    """
    This function finds and prints the number of duplicate entries in a store dataset.

    An entry counts as a duplicate when the value in its "name_index" column
    has already appeared in an earlier row.

    Arguments:
        - store (list): list of lists of all data contained in a dataset.
        - store_name (string): name of the store dataset.
        - name_index (int): column that identifies an app (default: 0).

    Returns:
        - None. The number of duplicate entries is printed.
    """
    # A set gives constant-time membership tests; a list would rescan all
    # names seen so far for every row.
    seen_names = set()
    num_of_duplicates = 0
    for app in store:
        name = app[name_index]
        if name in seen_names:
            num_of_duplicates += 1
        else:
            seen_names.add(name)

    print(f"Number of duplicate entries in the {store_name} dataset: {num_of_duplicates}")

# Count repeated entries in each store, Google Play first.
duplicate_apps(store=google_store, store_name="Google Play Store")
duplicate_apps(store=apple_store, store_name="Apple Store")

Number of duplicate entries in the Google Play Store dataset: 1181
Number of duplicate entries in the Apple Store dataset: 0


The Apple Store dataset has no duplicates, therefore we will only handle the duplicates in the Google Play Store dataset. 

We could remove the duplicate entries randomly, but it is better to use another criterion. As we can see in the four "Instagram" entries above, the main difference is in the fourth position of each row, which corresponds to the number of reviews. So, we will keep the entry with the highest number of reviews and remove the rest. 

In [43]:
def remove_duplicates(store, store_name, name_index=0, reviews_index=3):
    """
    This function finds the duplicate entries in a store dataset and removes them.

    For every app name only one row is kept: the first row (in dataset order)
    that carries the highest number of reviews recorded for that name.
    The input list is not modified.

    Arguments:
        - store (list): list of lists of all data contained in a dataset.
        - store_name (string): name of the store dataset.
        - name_index (int): column holding the app name (default: 0).
        - reviews_index (int): column holding the number of reviews (default: 3).

    Returns:
        - clean_dataset (list): new list of rows with one entry per app name.
    """
    # First pass: highest review count seen for each app name.
    reviews_max = dict()
    for app in store:
        name = app[name_index]
        reviews = float(app[reviews_index])
        if name not in reviews_max or reviews > reviews_max[name]:
            reviews_max[name] = reviews

    # Every row beyond the first one for a given name is a duplicate entry.
    num_duplicates = len(store) - len(reviews_max)
    print("Before removing duplicate entries:")
    print(f"Number of duplicate entries in the {store_name} dataset: {num_duplicates} (total entries: {len(store)})")

    # Second pass: keep the first row that reaches the maximum for its name.
    # The set guards against rows that tie on the maximum review count.
    clean_dataset = list()
    already_added = set()
    for row in store:
        name = row[name_index]
        if name not in already_added and float(row[reviews_index]) == reviews_max[name]:
            clean_dataset.append(row)
            already_added.add(name)

    print("After removing duplicate entries:")
    print(f"Number of entries in the {store_name} dataset after removing duplicate entries: {len(clean_dataset)}")

    return clean_dataset

# Build the de-duplicated Google Play dataset; google_store itself is left as is.
google_store_clean = remove_duplicates(store=google_store, store_name="Google Play Store")

Before removing duplicate entries:
Number of duplicate entries in the Google Play Store dataset: 1181 (total entries: 10840)
After removing duplicate entries:
Number of entries in the Google Play Store dataset after removing duplicate entries: 9659


Let's explore the new (clean) dataset using the explore_data function. 

In [42]:
# First three rows of the de-duplicated dataset plus its dimensions.
explore_data(google_store_clean, start=0, end=3, rows_and_columns=True, empty_lines=False)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']
Number of rows: 9659
Number of columns: 13


# 4) Removing Non-English Apps

In our analysis we are only interested in apps designed for an English-speaking audience. If we explore the data long enough, we'll find that both datasets have apps with names that suggest they are not designed for an English-speaking audience.

In [50]:
# Hand-picked rows whose names are not written in English.
for row_number in (813, 6731):
    print(apple_store[row_number][1])  # column 1 holds the App Store name
print("\n")
for row_number in (4412, 7940):
    print(google_store_clean[row_number][0])  # column 0 holds the Google Play name

爱奇艺PPS -《欢乐颂2》电视剧热播
【脱出ゲーム】絶対に最後までプレイしないで 〜謎解き＆ブロックパズル〜


中国語 AQリスニング
لعبة تقدر تربح DZ


As we are targeting only an English-speaking audience, we are not interested in these apps, so we will remove them from the datasets. We will write a function which checks whether every character of a string (the name of an app) belongs to the set of common English characters. 

In [69]:
def is_eng(string):
    """
    Tell whether a string is made up exclusively of ASCII characters.

    Args:
        - string (str): the text to inspect.

    Returns:
        - boolean: False as soon as one character has a code point above 127, True otherwise
          (an empty string therefore yields True).
    """

    # Code points 0-127 are the ASCII range, used here as a stand-in for
    # "English characters"; anything above it (other scripts, emojis,
    # symbols such as the trademark sign) disqualifies the string.
    return all(ord(symbol) <= 127 for symbol in string)

# Try the strict check on one plain name and three names with non-ASCII characters.
for sample_name in (
    "Instagram",
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(is_eng(sample_name))

True
False
False
False


If we use the function we've created, we'll lose useful data since many English apps will be incorrectly labeled as non-English, as there are many emojis and characters that fall outside the ASCII range of 0 to 127. 

In [68]:
# Code points of two characters that commonly appear in English app names.
for symbol in ("😜", "™"):
    print(ord(symbol))

128540
8482


To minimize the impact of data loss, we'll only remove an app if its name has more than three characters with corresponding numbers falling outside the ASCII range.

In [73]:
def is_eng_modified(string, max_non_ascii=3):
    """
    A function that checks if a given string consists mostly of English (ASCII) characters.

    Args:
        - string (str): A string to be checked for English characters.
        - max_non_ascii (int): highest number of characters with a code point above 127
          that is still tolerated (default: 3).

    Returns:
        - boolean: False if the string contains more than "max_non_ascii" non-ASCII characters,
          True otherwise.
    """

    # Only the count matters, so the characters are tallied instead of collected.
    non_eng_count = sum(1 for char in string if ord(char) > 127)
    return non_eng_count <= max_non_ascii

# Same four sample names as before, now with the tolerant check.
for sample_name in (
    "Instagram",
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(is_eng_modified(sample_name))

True
False
True
True


We will now use the new function to filter out non-English apps from both datasets.

In [84]:
def reomve_non_eng(store, name_index=None):
    """
    A function that removes non-English apps.

    An app is kept when is_eng_modified() accepts its name.

    Args:
        - store (list): list of lists of all data contained in the dataset.
        - name_index (int or None): column holding the app name. When omitted, the column is
          chosen by comparing "store" with the notebook variables google_store_clean
          (name in column 0) and apple_store (name in column 1).

    Returns:
        - clean_store (list): list of lists of all English apps contained in the dataset.

    Raises:
        - ValueError: if "name_index" is omitted and "store" is a non-empty dataset that matches
          neither notebook variable (this case used to return an empty list silently).
    """

    if not store:
        return list()

    if name_index is None:
        # Backward-compatible fallback for calls that pass only the dataset.
        # .get() keeps this safe when a notebook variable is not defined yet.
        notebook_vars = globals()
        if store == notebook_vars.get("google_store_clean"):
            name_index = 0
        elif store == notebook_vars.get("apple_store"):
            name_index = 1
        else:
            raise ValueError(
                "Unknown dataset: pass name_index to tell which column holds the app name."
            )

    return [row for row in store if is_eng_modified(row[name_index])]


# Filter both stores, then compare the row counts before and after the filter.
google_store_eng = reomve_non_eng(store=google_store_clean)
apple_store_eng = reomve_non_eng(store=apple_store)

print(
    f"Number of apps in the Google Play Store dataset (incl. non-english apps): {len(google_store_clean)}"
)
print(
    f"Number of apps in the Google Play Store dataset (only english apps): {len(google_store_eng)}"
)
print("-" * 70)
print(
    f"Number of apps in the Apple Store dataset (incl. non-english apps): {len(apple_store)}"
)
print(
    f"Number of apps in the Apple Store dataset (only english apps): {len(apple_store_eng)}"
)

Number of apps in the Google Play Store dataset (incl. non-english apps): 9659
Number of apps in the Google Play Store dataset (only english apps): 9614
----------------------------------------------------------------------
Number of apps in the Apple Store dataset (incl. non-english apps): 7197
Number of apps in the Apple Store dataset (only english apps): 6183


# 5) Isolating the Free Apps

We only build apps that are free to download and install, and our main source of revenue consists of in-app ads. Our datasets contain both free and non-free apps; we'll need to isolate only the free apps for our analysis.

In [99]:
def remove_paid(store, is_free=None):
    """
    A function that keeps only the free apps of a dataset.

    Args:
        - store (list): list of lists of all data contained in the dataset.
        - is_free (callable or None): function that receives one row and returns True when the
          app is free. When omitted, the rule is chosen by comparing "store" with the notebook
          variables google_store_eng (column 6 equals "Free") and apple_store_eng
          (column 4, converted to float, equals 0.0).

    Returns:
        - clean_store (list): list of lists of all free apps contained in the dataset.

    Raises:
        - ValueError: if "is_free" is omitted and "store" is a non-empty dataset that matches
          neither notebook variable (this case used to return an empty list silently).
    """
    if not store:
        return list()

    if is_free is None:
        # Backward-compatible fallback for calls that pass only the dataset.
        # .get() keeps this safe when a notebook variable is not defined yet.
        notebook_vars = globals()
        if store == notebook_vars.get("google_store_eng"):
            def is_free(row):
                return row[6] == "Free"
        elif store == notebook_vars.get("apple_store_eng"):
            def is_free(row):
                return float(row[4]) == 0.0
        else:
            raise ValueError(
                "Unknown dataset: pass is_free to tell how a free app is recognised."
            )

    return [row for row in store if is_free(row)]

# Keep the free apps of each store, then compare the row counts before and after.
google_store_final = remove_paid(store=google_store_eng)
apple_store_final = remove_paid(store=apple_store_eng)

print(
    f"Number of apps in the Google Play Store dataset (incl. paid apps): {len(google_store_eng)}"
)
print(
    f"Number of apps in the Google Play Store dataset (only free apps): {len(google_store_final)}"
)
print("-" * 70)
print(
    f"Number of apps in the Apple Store dataset (incl. paid apps): {len(apple_store_eng)}"
)
print(
    f"Number of apps in the Apple Store dataset (only free apps): {len(apple_store_final)}"
)

Number of apps in the Google Play Store dataset (incl. paid apps): 9614
Number of apps in the Google Play Store dataset (only free apps): 8863
----------------------------------------------------------------------
Number of apps in the Apple Store dataset (incl. paid apps): 6183
Number of apps in the Apple Store dataset (only free apps): 3222


# 6) Most Common Apps by Genre

Our goal is to determine the kinds of apps that are likely to attract more users because the number of people using our apps affects our revenue. 
To minimize risks and overhead, our validation strategy for an app idea has three steps:
* Build a minimal Android version of the app, and add it to Google Play.
* If the app has a good response from users, we develop it further.
* If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

We want now to know which columns in both datasets can be used to determine the most common apps by genres. 

In [140]:
# Column names again, Google Play first, to pick the genre-like columns.
for header_row in (google_store_header, apple_store_header):
    print(header_row)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


From the headers of both datasets we can conclude that the columns "Category" (Google Play Store dataset) and "prime_genre" (Apple Store dataset) can be used for this purpose. 

We'll build two functions we can use to analyze the frequency tables:

* One function to generate frequency tables that show percentages
* Another function we can use to display the percentages in a descending order

In [193]:
def freq_table(dataset, index):
    """
    Build a frequency table, in percent, for one column of a dataset.

    Arguments:
        - dataset (list): list of lists of all data contained in a dataset.
        - index (int): position of the column to summarise.

    Returns:
        - a dict that maps every distinct value of the column to its share of the rows,
          as a percentage rounded to two decimals. An empty dataset gives an empty dict.
    """
    counts = {}
    row_count = 0
    for row_count, row in enumerate(dataset, start=1):
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    return {
        value: round((count / row_count) * 100, 2)
        for value, count in counts.items()
    }

def display_table(dataset, index):
    """
    Print the frequency table of one column, largest share first.

    Arguments:
        - dataset (list): list of lists of all data contained in a dataset.
        - index (int): position of the column to summarise.

    Returns:
        - a list of (percentage, value) tuples in descending order, as computed
          by freq_table(); the same entries are printed one per line.
    """
    percentages = freq_table(dataset, index)
    # Percentage goes first in each tuple so that sorting orders by share;
    # equal shares are ordered by the value itself.
    table_sorted = [(share, label) for label, share in percentages.items()]
    table_sorted.sort(reverse=True)

    for share, label in table_sorted:
        print(label, ':', share, "%")
    return table_sorted


Now we will analyze the frequency table we generated for the prime_genre column of the App Store dataset. We will answer the following questions. 
* What is the most common genre? What is the next most common?
* What is the general impression — are most of the apps designed for practical purposes (education, shopping, utilities, productivity, lifestyle) or more for entertainment (games, photo and video, social networking, sports, music)?
* Can you recommend an app profile for the App Store market based on this frequency table alone? If there's a large number of apps for a particular genre, does that also imply that apps of that genre generally have a large number of users?


In [197]:
# Share of each prime_genre among the free English App Store apps.
apple_prime_genre = display_table(
    apple_store_final,
    index=apple_store_header.index("prime_genre"),
)

Games : 58.16 %
Entertainment : 7.88 %
Photo & Video : 4.97 %
Education : 3.66 %
Social Networking : 3.29 %
Shopping : 2.61 %
Utilities : 2.51 %
Sports : 2.14 %
Music : 2.05 %
Health & Fitness : 2.02 %
Productivity : 1.74 %
Lifestyle : 1.58 %
News : 1.33 %
Travel : 1.24 %
Finance : 1.12 %
Weather : 0.87 %
Food & Drink : 0.81 %
Reference : 0.56 %
Business : 0.53 %
Book : 0.43 %
Navigation : 0.19 %
Medical : 0.19 %
Catalogs : 0.12 %


In [205]:
# What is the most common genre? What is the next most common?
print(f"The most common genre in the Apple Store dataset: {apple_prime_genre[0][1]} ({apple_prime_genre[0][0]}% of the free english apps)")
print(f"The next most common genre in the Apple Store dataset: {apple_prime_genre[1][1]} ({apple_prime_genre[1][0]}% of the free english apps)")

The most common genre in the Apple Store dataset: Games (58.16% of the free english apps)
The next most common genre in the Apple Store dataset: Entertainment (7.88% of the free english apps)



* What is the most common genre? What is the next most common? - **The 'Games' genre is the most common genre in the Apple Store. The next common genre is the 'Entertainment' genre.**
* What is the general impression — are most of the apps designed for practical purposes (education, shopping, utilities, productivity, lifestyle) or more for entertainment (games, photo and video, social networking, sports, music)? - **The most common kind of apps are the apps designed for fun (games, entertainment, photo and video, social networking, sports, music, etc.).**
* Can you recommend an app profile for the App Store market based on this frequency table alone? If there's a large number of apps for a particular genre, does that also imply that apps of that genre generally have a large number of users? - **Yes, I would recommend building an app that belongs to the 'Games' genre. No, a large number of apps in a particular genre does not imply that the apps of that genre have a large number of users; the demand might not match the supply.**

In [208]:
# Share of each Category among the free English Google Play apps.
google_category = display_table(
    google_store_final,
    index=google_store_header.index("Category"),
)

FAMILY : 18.9 %
GAME : 9.73 %
TOOLS : 8.46 %
BUSINESS : 4.59 %
LIFESTYLE : 3.9 %
PRODUCTIVITY : 3.89 %
FINANCE : 3.7 %
MEDICAL : 3.53 %
SPORTS : 3.4 %
PERSONALIZATION : 3.32 %
COMMUNICATION : 3.24 %
HEALTH_AND_FITNESS : 3.08 %
PHOTOGRAPHY : 2.94 %
NEWS_AND_MAGAZINES : 2.8 %
SOCIAL : 2.66 %
TRAVEL_AND_LOCAL : 2.34 %
SHOPPING : 2.25 %
BOOKS_AND_REFERENCE : 2.14 %
DATING : 1.86 %
VIDEO_PLAYERS : 1.79 %
MAPS_AND_NAVIGATION : 1.4 %
FOOD_AND_DRINK : 1.24 %
EDUCATION : 1.16 %
ENTERTAINMENT : 0.96 %
LIBRARIES_AND_DEMO : 0.94 %
AUTO_AND_VEHICLES : 0.93 %
HOUSE_AND_HOME : 0.82 %
WEATHER : 0.8 %
EVENTS : 0.71 %
PARENTING : 0.65 %
ART_AND_DESIGN : 0.64 %
COMICS : 0.62 %
BEAUTY : 0.6 %


In [212]:
# What is the most common genre? What is the next most common?
print(f"The most common genre in the Google Play Store dataset: {google_category[0][1]} ({google_category[0][0]}% of the free english apps)")
print(f"The next most common genre in the Google Play Store dataset: {google_category[1][1]} ({google_category[1][0]}% of the free english apps)")

The most common genre in the Google Play Store dataset: FAMILY (18.9% of the free english apps)
The next most common genre in the Google Play Store dataset: GAME (9.73% of the free english apps)


* What is the most common category? - **The 'Family' category is the most common category in the Google Play Store. The next most common category is the 'Game' category.**
* Compare the patterns you see for the Google Play market with those you saw for the App Store market. - **The Google Play Store shows a more balanced landscape of both practical and for-fun apps.**
* Can you recommend an app profile based on what you found so far? Do the frequency tables you generated reveal the most frequent app genres or what genres have the most users? - **Yes, I would recommend building an app that belongs to the 'Family' category. The frequency tables we generated only reveal the most common app genres.**

# 7) Most Popular Apps by Genre on the App Store

Now, we'd like to determine the kind of apps with the most users. For the Google Play data set, we can find this information in the Installs column, but this information is missing for the App Store data set. As a workaround, we'll take the total number of user ratings as a proxy, which we can find in the rating_count_tot column.

Let's start with calculating the average number of user ratings per app genre on the App Store. To do that, we'll need to do the following:
* Isolate the apps of each genre
* Add up the user ratings for the apps of that genre
* Divide the sum by the number of apps belonging to that genre (not by the total number of apps)

In [250]:
# Column positions are looked up once, not once per row.
genre_index = apple_store_header.index("prime_genre")
ratings_index = apple_store_header.index("rating_count_tot")

apple_genres = freq_table(apple_store_final, genre_index)

# Single pass over the data: running sum of user ratings and number of apps
# for every genre.
ratings_by_genre = dict()
apps_by_genre = dict()
for row in apple_store_final:
    genre = row[genre_index]
    ratings_by_genre[genre] = ratings_by_genre.get(genre, 0) + float(row[ratings_index])
    apps_by_genre[genre] = apps_by_genre.get(genre, 0) + 1

# Average = ratings of the genre / apps of that genre (not the total number of apps).
genre_ave_users = list()
for genre in apple_genres:
    ave_users_rating = ratings_by_genre[genre] / apps_by_genre[genre]
    genre_ave_users.append((round(ave_users_rating, 2), genre))

for avg, genre in sorted(genre_ave_users, reverse=True):
    print(f"Average number of user ratings for '{genre}' genre: {avg}")

Average number of user ratings for 'Navigation' genre: 86090.33
Average number of user ratings for 'Reference' genre: 74942.11
Average number of user ratings for 'Social Networking' genre: 71548.35
Average number of user ratings for 'Music' genre: 57326.53
Average number of user ratings for 'Weather' genre: 52279.89
Average number of user ratings for 'Book' genre: 39758.5
Average number of user ratings for 'Food & Drink' genre: 33333.92
Average number of user ratings for 'Finance' genre: 31467.94
Average number of user ratings for 'Photo & Video' genre: 28441.54
Average number of user ratings for 'Travel' genre: 28243.8
Average number of user ratings for 'Shopping' genre: 26919.69
Average number of user ratings for 'Health & Fitness' genre: 23298.02
Average number of user ratings for 'Sports' genre: 23008.9
Average number of user ratings for 'Games' genre: 22788.67
Average number of user ratings for 'News' genre: 21248.02
Average number of user ratings for 'Productivity' genre: 21028.4

On average, navigation apps have the highest number of user reviews. A practical app might have more of a chance to stand out among the huge number of apps on the App Store than a for-fun app. 

# 8) Most Popular Apps by Genre on Google Play


We have data about the number of installs for the Google Play market, so we should be able to get a clearer picture about genre popularity.

In [251]:
# Share of each install bracket among the free English Google Play apps.
google_installs = display_table(
    google_store_final,
    index=google_store_header.index("Installs"),
)

1,000,000+ : 15.73 %
100,000+ : 11.55 %
10,000,000+ : 10.55 %
10,000+ : 10.2 %
1,000+ : 8.39 %
100+ : 6.92 %
5,000,000+ : 6.83 %
500,000+ : 5.56 %
50,000+ : 4.77 %
5,000+ : 4.51 %
10+ : 3.54 %
500+ : 3.25 %
50,000,000+ : 2.3 %
100,000,000+ : 2.13 %
50+ : 1.92 %
5+ : 0.79 %
1+ : 0.51 %
500,000,000+ : 0.27 %
1,000,000,000+ : 0.23 %
0+ : 0.05 %


We're going to leave the numbers as they are, which means that we'll consider that an app with 100,000+ installs has 100,000 installs, and an app with 1,000,000+ installs has 1,000,000 installs, and so on. To perform computations, however, we'll need to convert each install number from a string to a float. This means we need to remove the commas and the plus characters, or the conversion will fail and cause an error.

In [259]:
# Column positions are looked up once, not once per row.
category_index = google_store_header.index("Category")
installs_index = google_store_header.index("Installs")

# NOTE: this rebinds `google_category`. An earlier cell stored the sorted
# (percentage, category) list returned by display_table() under this name;
# from here on it holds the {category: percentage} dict from freq_table().
google_category = freq_table(google_store_final, category_index)

# Single pass over the data: running sum of installs and number of apps
# for every category.
installs_by_category = dict()
apps_by_category = dict()
for row in google_store_final:
    category = row[category_index]
    # Install counts look like "1,000,000+"; strip "+" and "," before converting.
    num_installs = float(row[installs_index].replace("+", "").replace(",", ""))
    installs_by_category[category] = installs_by_category.get(category, 0) + num_installs
    apps_by_category[category] = apps_by_category.get(category, 0) + 1

category_ave_installs = list()
for category in google_category:
    ave_installs = installs_by_category[category] / apps_by_category[category]
    category_ave_installs.append((round(ave_installs, 2), category))

# The label names installs and categories: this table does not contain user ratings.
for avg, category in sorted(category_ave_installs, reverse=True):
    print(f"Average number of installs for '{category}' category: {avg}")

Average number of user ratings for 'COMMUNICATION' genre: 38456119.17
Average number of user ratings for 'VIDEO_PLAYERS' genre: 24727872.45
Average number of user ratings for 'SOCIAL' genre: 23253652.13
Average number of user ratings for 'PHOTOGRAPHY' genre: 17840110.4
Average number of user ratings for 'PRODUCTIVITY' genre: 16787331.34
Average number of user ratings for 'GAME' genre: 15588015.6
Average number of user ratings for 'TRAVEL_AND_LOCAL' genre: 13984077.71
Average number of user ratings for 'ENTERTAINMENT' genre: 11640705.88
Average number of user ratings for 'TOOLS' genre: 10801391.3
Average number of user ratings for 'NEWS_AND_MAGAZINES' genre: 9549178.47
Average number of user ratings for 'BOOKS_AND_REFERENCE' genre: 8767811.89
Average number of user ratings for 'SHOPPING' genre: 7036877.31
Average number of user ratings for 'PERSONALIZATION' genre: 5201482.61
Average number of user ratings for 'WEATHER' genre: 5074486.2
Average number of user ratings for 'HEALTH_AND_FITN

After analyzing both stores, we think that building an app that belongs to the 'social media' category would be a good idea, since the genre 'social media' is very popular in both stores. 

# Conclusions


In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.

We concluded that building a social media app could be profitable for both the Google Play and the App Store markets. The markets are already full of social media apps, so we need to add some special features. 