# Data_Science_Analyzing_The_Most_Popular_Genre
Goal: Analyze Google Play Store and Apple Store data to find out the most popular genre

In [2]:
from csv import reader
# Load the Google Play data set into memory as a list of rows; android[0] is
# the header row. The `with` block closes the file handle as soon as the rows
# have been read, and newline='' is what the csv module expects so that
# newlines embedded in quoted fields are parsed correctly.
# NOTE(review): the file is decoded with the platform default encoding, as
# before — consider passing an explicit encoding once the CSV's encoding is confirmed.
with open('googleplaystore.csv', newline='') as file_google:
    read_file_google = reader(file_google)
    android = list(read_file_google)

In [3]:
# Load the App Store data set into memory as a list of rows; apple[0] is the
# header row. The `with` block closes the file handle as soon as the rows
# have been read, and newline='' is what the csv module expects so that
# newlines embedded in quoted fields are parsed correctly.
# NOTE(review): the file is decoded with the platform default encoding, as
# before — consider passing an explicit encoding once the CSV's encoding is confirmed.
with open('AppleStore.csv', newline='') as file_apple:
    read_file_apple = reader(file_apple)
    apple = list(read_file_apple)

In [4]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (e.g. a list of lists) supporting slicing
            and ``len()``.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows in
            ``dataset`` and the number of columns, taken from its first row.

    An empty ``dataset`` reports 0 columns instead of raising ``IndexError``.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # Guard the first-row lookup so an empty data set does not crash.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

Columns of android and apple data set

In [5]:
# Show the header row of each data set; the '\n' item plus the newline
# separator reproduces the blank lines printed between the two headers.
print(android[0], '\n', apple[0], sep='\n')

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [6]:
# Preview the first data row (index 1, right after the header) of each data
# set and report the data set's dimensions.
explore_data(android, start=1, end=2, rows_and_columns=True)
print('\n')
explore_data(apple, start=1, end=2, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 7198
Number of columns: 16


Error check: rows with missing fields (a different number of columns than the header row) are printed and deleted

In [7]:
# Remove every row whose number of fields differs from its data set's header
# row, reporting each removed row and the index it had at removal time.
for dataset in (android, apple):
    # dataset[1:] is a copy, so deleting from `dataset` inside the loop does
    # not disturb the iteration.
    for row in dataset[1:]:
        if len(row) == len(dataset[0]):
            continue
        print(row)
        print("\n")
        rowIndex = dataset.index(row)  # position in the live, already pruned list
        print("Error row index:", rowIndex)
        del dataset[rowIndex]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Error row index: 10473


Error check: duplicate rows - for each app name, only the row with the highest number of reviews (presumably the most recent entry) will remain

In [8]:
def _split_unique_and_duplicate_names(rows, name_index):
    """Split the app names found in ``rows`` into first and repeated occurrences.

    Args:
        rows: Iterable of data rows (without the header row).
        name_index: Column index holding the app name.

    Returns:
        A ``(unique_names, duplicate_names)`` tuple of lists. ``unique_names``
        holds each name once, in order of first appearance;
        ``duplicate_names`` holds one entry for every repeated occurrence.
    """
    unique_names = []
    duplicate_names = []
    seen = set()  # O(1) membership test instead of scanning the list
    for row in rows:
        name = row[name_index]
        if name in seen:
            duplicate_names.append(name)
        else:
            seen.add(name)
            unique_names.append(name)
    return unique_names, duplicate_names


# Google Play: the app name is column 0 ('App').
unique_name_android, duplicate_name_android = _split_unique_and_duplicate_names(android[1:], 0)

# App Store: the app name is column 1 ('track_name'). Each data set is checked
# against its own names only; the previous code compared Apple names with the
# Android unique list, so Apple duplicates were never detected.
unique_name_apple, duplicate_name_apple = _split_unique_and_duplicate_names(apple[1:], 1)

In [9]:
def _max_reviews_per_name(rows, name_index, reviews_index):
    """Map each app name in ``rows`` to the highest review count among its rows."""
    reviews_max = {}
    for row in rows:
        name = row[name_index]
        n_reviews = float(row[reviews_index])
        if name not in reviews_max or reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
    return reviews_max


def _keep_most_reviewed_rows(rows, name_index, reviews_index, reviews_max):
    """Return one row per app name: the first row holding that name's maximum review count."""
    clean = []
    added = set()  # names already emitted; set lookup avoids a quadratic list scan
    for row in rows:
        name = row[name_index]
        if name in added:
            continue
        if float(row[reviews_index]) == reviews_max[name]:
            clean.append(row)
            added.add(name)
    return clean


# Google Play: name in column 0 ('App'), review count in column 3 ('Reviews').
reviews_max_android = _max_reviews_per_name(android[1:], 0, 3)
android_clean = _keep_most_reviewed_rows(android[1:], 0, 3, reviews_max_android)  # data set without duplicate rows

# App Store: name in column 1 ('track_name'), review count in column 5
# ('rating_count_tot').
reviews_max_apple = _max_reviews_per_name(apple[1:], 1, 5)
apple_clean = _keep_most_reviewed_rows(apple[1:], 1, 5, reviews_max_apple)  # data set without duplicate rows

# Kept for reference, as before: the names added to apple_clean, in order.
already_added = [row[1] for row in apple_clean]

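Language check: remove rows with an app name that has more than 3 non-English characters (characters outside the ASCII range 0-127). Note: as called below, `englishCheck` rejects a name as soon as it contains a single non-ASCII character.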

In [10]:
def englishCheck(str, max_non_ascii=0):
    """Tell whether ``str`` looks like an English (ASCII) app name.

    Args:
        str: The text to check (parameter name kept for compatibility even
            though it shadows the built-in).
        max_non_ascii: How many characters with a code point above 127 are
            tolerated. The default of 0 keeps the original strict behaviour
            (any non-ASCII character rejects the name); pass 3 to apply the
            "more than 3" rule described in this section's heading, which
            lets names containing a few symbols or emoji through.

    Returns:
        False when ``str`` contains more than ``max_non_ascii`` non-ASCII
        characters, True otherwise (including for the empty string).
    """
    non_ascii_count = 0
    for c in str:
        if ord(c) > 127:
            non_ascii_count += 1
            if non_ascii_count > max_non_ascii:
                return False
    return True

# Keep only the Google Play rows whose app name (column 0) passes englishCheck.
android_english = [app for app in android_clean if englishCheck(app[0])]
print(len(android_english))

# Same filter for the App Store rows, where the app name is in column 1.
apple_english = [app for app in apple_clean if englishCheck(app[1])]
print(len(apple_english))

9117
5705


Price check: leave only free apps

In [11]:
def priceCheck(str):
    """Tell whether a price string denotes a free app.

    Args:
        str: Price exactly as stored in the data set (parameter name kept
            for compatibility even though it shadows the built-in).

    Returns:
        True when the price is '0' or '0.0' (the free-price spellings seen in
        the Google Play and App Store sample rows respectively), False
        otherwise. Previously non-free prices fell through and returned None.
    """
    return str in ('0', '0.0')
    
# Google Play: the price is column 7 ('Price').
android_english_free = [app for app in android_english if priceCheck(app[7])]
print(len(android_english_free))

# App Store: the price is column 4 ('price').
apple_english_free = [app for app in apple_english if priceCheck(app[4])]
print(len(apple_english_free))

8408
2920


Genre frequency check 

Explanation: An app that is successful at both the App Store and Google Play is likely to be productive

In [12]:
# Count the apps per Google Play genre (column 9, 'Genres').
android_genre_fq = {}
for app in android_english_free:
    genre = app[9]
    android_genre_fq[genre] = android_genre_fq.get(genre, 0) + 1

# Count the apps per App Store genre (column 11, 'prime_genre').
apple_genre_fq = {}
for app in apple_english_free:
    genre = app[11]
    apple_genre_fq[genre] = apple_genre_fq.get(genre, 0) + 1

def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: Iterable of rows (no header row expected).
        index: Position of the column whose values are tallied.

    Returns:
        A dict mapping each distinct column value to the percentage of rows
        carrying it, keyed in order of first appearance. An empty data set
        yields an empty dict.
    """
    counts = {}
    n_rows = 0
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1
        n_rows += 1

    return {value: count / n_rows * 100 for value, count in counts.items()}

def display_table(dataset, index):
    """Print the percentage frequency table of a column, largest share first.

    Args:
        dataset: Iterable of rows, passed straight to ``freq_table``.
        index: Position of the column to summarise.

    Each line is printed as ``value : percentage``. Values with equal
    percentages are ordered by the value itself, descending.
    """
    percentages = freq_table(dataset, index)
    # Sorting (percentage, value) pairs orders by percentage first and falls
    # back to the value for ties.
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)

# (title, data set, column index) for every frequency table to show.
tables_to_show = [
    ('Apple prime_genre', apple_english_free, 11),   # apple prime_genre
    ('Android genre', android_english_free, 9),      # android genre
    ('Android category', android_english_free, 1),   # android category
]
for position, (title, dataset, column) in enumerate(tables_to_show):
    if position > 0:
        print('\n')  # blank lines between consecutive tables
    print(title)
    display_table(dataset, column)

Apple prime_genre
Games : 59.14383561643836
Entertainment : 7.534246575342466
Photo & Video : 5.136986301369863
Education : 3.8356164383561646
Social Networking : 3.116438356164384
Shopping : 2.5
Utilities : 2.26027397260274
Music : 2.1575342465753424
Sports : 2.054794520547945
Health & Fitness : 1.9863013698630139
Productivity : 1.7123287671232876
Lifestyle : 1.4726027397260275
News : 1.3356164383561644
Travel : 1.13013698630137
Finance : 1.095890410958904
Weather : 0.8904109589041096
Food & Drink : 0.8904109589041096
Reference : 0.5136986301369862
Business : 0.5136986301369862
Book : 0.273972602739726
Medical : 0.2054794520547945
Navigation : 0.136986301369863
Catalogs : 0.10273972602739725


Android genre
Tools : 8.563273073263558
Entertainment : 6.089438629876309
Education : 5.387725975261656
Business : 4.709800190294957
Productivity : 3.9724072312083734
Lifestyle : 3.8772597526165553
Finance : 3.73453853472883
Medical : 3.6393910561370126
Sports : 3.3301617507136063
Personalizatio

Apple Store average user rating per genre calculation

In [13]:
prime_genre_table = freq_table(apple_english_free,11)

# Accumulate, in one pass over the data set, the sum of user ratings
# (column 7, 'user_rating') and the number of apps for every genre
# (column 11, 'prime_genre'). This replaces a full rescan of the data set per
# genre; ratings are still added in row order, so the averages are unchanged.
rating_totals = {}  # genre -> sum of user ratings
genre_sizes = {}    # genre -> number of apps
for app in apple_english_free:
    genre_app = app[11]
    rating_totals[genre_app] = rating_totals.get(genre_app, 0) + float(app[7])
    genre_sizes[genre_app] = genre_sizes.get(genre_app, 0) + 1

# Report the average rating per genre, in the order the genres appear in the
# frequency table.
for genre in prime_genre_table:
    print(genre)
    print(rating_totals[genre] / genre_sizes[genre])
    print('\n')
    


Social Networking
3.642857142857143


Photo & Video
3.8666666666666667


Games
4.04950781702374


Music
3.9444444444444446


Reference
3.8333333333333335


Health & Fitness
3.8793103448275863


Weather
3.4615384615384617


Travel
3.409090909090909


Shopping
3.958904109589041


News
3.1794871794871793


Navigation
3.875


Lifestyle
3.4651162790697674


Entertainment
3.525


Food & Drink
3.6346153846153846


Sports
3.075


Finance
3.4375


Education
3.638392857142857


Productivity
4.05


Utilities
3.507575757575758


Book
3.3125


Business
3.8666666666666667


Catalogs
4.166666666666667


Medical
3.0




Result: Games, Productivity, Catalogs genres are recommended

Google Play average install per genre calculation

In [14]:
category_table = freq_table(android_english_free,1)

# Accumulate, in one pass over the data set, the total installs (column 5,
# 'Installs') and the number of apps for every category (column 1,
# 'Category'). This replaces a full rescan of the data set per category;
# installs are still added in row order, so the averages are unchanged.
install_totals = {}   # category -> sum of installs
category_sizes = {}   # category -> number of apps
for app in android_english_free:
    category_app = app[1]
    # Installs are stored as text such as '10,000+'; drop the thousands
    # separators and the trailing '+' before converting to a number.
    install = float(app[5].replace(',', '').replace('+', ''))
    install_totals[category_app] = install_totals.get(category_app, 0) + install
    category_sizes[category_app] = category_sizes.get(category_app, 0) + 1

# Report the average installs per category, in the order the categories
# appear in the frequency table.
for category in category_table:
    print(category)
    print(install_totals[category] / category_sizes[category])
    print('\n')


ART_AND_DESIGN
1932519.642857143


AUTO_AND_VEHICLES
645317.2278481013


BEAUTY
513151.88679245283


BOOKS_AND_REFERENCE
8504745.97826087


BUSINESS
1602958.308080808


COMICS
880440.625


COMMUNICATION
36106662.328413285


DATING
764959.4610389611


EDUCATION
1844897.9591836734


ENTERTAINMENT
12346329.11392405


EVENTS
232885.83333333334


FINANCE
1348224.9426751593


FOOD_AND_DRINK
1974937.1386138613


HEALTH_AND_FITNESS
4263642.1749049425


HOUSE_AND_HOME
1391211.1911764706


LIBRARIES_AND_DEMO
674917.2368421053


LIFESTYLE
1375297.3058103975


GAME
15434835.816831684


FAMILY
3633707.342820999


MEDICAL
119216.81045751635


SOCIAL
24441088.17857143


SHOPPING
7307823.2010582015


PHOTOGRAPHY
18099283.85375494


SPORTS
3647640.208029197


TRAVEL_AND_LOCAL
14487541.68041237


TOOLS
11084333.292649098


PERSONALIZATION
5027006.791366907


PRODUCTIVITY
16972497.946107786


PARENTING
544745.6363636364


WEATHER
5219216.7164179105


VIDEO_PLAYERS
25234606.216216218


NEWS_AND_MAGAZINES


Result: Communication genre is recommended