# Data Science Project

This project analyses user engagement with free apps published on the Google Play Store and the Apple App Store. The goal is to better understand the revenue stream and how products can be modified to achieve the desired results.

In [1]:
from csv import *

def extract_dataset(filename, has_header=False):
    """Read a CSV file into a list of rows (each row is a list of strings).

    Args:
        filename: Path of the CSV file to read.
        has_header: When True, return the header row together with the data
            rows. When False, return only the data rows.

    Returns:
        ``(header, rows)`` if ``has_header`` is True, otherwise ``rows``.
        In both cases the first line of the file is treated as the header
        and is not part of ``rows``.

    Raises:
        IndexError: If ``has_header`` is True and the file contains no rows.
    """
    # The context manager closes the file even if parsing fails.
    # encoding='utf8' decodes non-ASCII app names the same way on every
    # platform; newline='' is what the csv module expects so that newlines
    # embedded in quoted fields are parsed correctly.
    with open(filename, encoding='utf8', newline='') as opened_file:
        data_set = list(reader(opened_file))

    if has_header:
        return data_set[0], data_set[1:]
    return data_set[1:]

# Load the data rows of both stores. has_header is left at its default
# (False), so extract_dataset returns the rows without the header line.
Apple_data_set = extract_dataset(filename='AppleStore.csv')
Google_data_set = extract_dataset(filename='googleplaystore.csv')

In [2]:
# Re-read both files, this time asking for the header row as well.
# With has_header=True the result is (header, rows), so index 0 is the header.
a = extract_dataset('AppleStore.csv', has_header=True)
b = extract_dataset('googleplaystore.csv', has_header=True)

# Print the Apple header, a blank gap, then the Google header.
print(a[0], "\n", b[0], sep="\n")

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of the dataset and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (each row is itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the number of rows and the
            number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if n_rows > 0 else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_columns)

# Preview the first three rows of each store and report its dimensions.
print("This is Apple Data ------------------------------")
explore_data(Apple_data_set, start=0, end=3, rows_and_columns=True)

# Blank gap between the two previews, followed by the Google banner.
print("\n", "This is Google Data ------------------------------", sep="\n")
explore_data(Google_data_set, start=0, end=3, rows_and_columns=True)

This is Apple Data ------------------------------
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


This is Google Data ------------------------------
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_A

In [4]:
# Inspect the row at index 10472 and count its fields.
suspect_row = Google_data_set[10472]
print(suspect_row)
print(len(suspect_row))

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
12


In [5]:
# Delete the row with the error: the entry at index 10472 has fewer fields
# than the other rows. The length check makes this cell safe to re-run:
# after the deletion a well-formed row moves into index 10472, and an
# unconditional `del` would remove that valid row on a second run.
if len(Google_data_set[10472]) != len(Google_data_set[0]):
    del Google_data_set[10472]

In [6]:
# Show the row that now sits at index 10472 and confirm its field count.
replacement_row = Google_data_set[10472]
print(replacement_row)
print(len(replacement_row))

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']
13


In [7]:
# Split app names into first occurrences and repeated occurrences.
android_unique_apps = []
android_duplicate_apps = []
# A set gives constant-time membership checks; testing `name in list` for
# every row makes the loop quadratic in the number of rows.
seen_names = set()

for row in Google_data_set:
    name = row[0]

    if name in seen_names:
        android_duplicate_apps.append(name)
    else:
        seen_names.add(name)
        android_unique_apps.append(name)

print("No. of duplicate apps in android: "+ str(len(android_duplicate_apps)))

No. of duplicate apps in android: 1181


There are 1181 duplicate entries in the Google data set, possibly because the data was collected at different times.

# Remove Duplicate Data
I will remove the duplicates based on the number of reviews for each entry: the entry with the highest number of reviews is kept, since it suggests the most recent collection of the data, and the rest are removed.

In [8]:
# Map each app name to the highest review count seen among its entries.
reviews_max = {}

for row in Google_data_set:
    name, n_reviews = row[0], float(row[3])

    # Store the count the first time a name appears, or whenever this entry
    # has more reviews than the best one recorded so far.
    if name not in reviews_max or n_reviews > reviews_max[name]:
        reviews_max[name] = n_reviews

print(len(reviews_max))

9659


In [9]:
# Keep exactly one row per app: the one whose review count equals the
# maximum recorded for that app in reviews_max.
android_clean = []
already_added = []
# Set mirror of already_added, used only for constant-time membership
# checks; testing `name in list` for every row makes the loop quadratic.
added_lookup = set()

for row in Google_data_set:
    name = row[0]
    n_reviews = float(row[3])

    if n_reviews == reviews_max[name] and name not in added_lookup:
        # The second condition is needed in case there are multiple
        # entries of apps with maximum reviews
        android_clean.append(row)
        already_added.append(name)
        added_lookup.add(name)

print("-----Android data set without any duplicate-----" + "\n")
explore_data(android_clean,0,5,rows_and_columns=True)

-----Android data set without any duplicate-----

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


Number of rows: 9659
Number of columns: 1

# Remove Non-English Apps
Next, I will remove all the non-English apps from the data set.

In [10]:
def isEnglish(a_string):
    """Return True if the string contains at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. Up to
    three such characters are tolerated, so names containing symbols such as
    a trademark sign or an emoji still pass.

    Args:
        a_string: The text to check (e.g. an app name).

    Returns:
        False as soon as a fourth non-ASCII character is found, True otherwise.
    """
    non_english_counter = 0
    for s in a_string:
        if ord(s) > 127:
            non_english_counter += 1
            # Check right after counting. Checking at the top of the next
            # iteration misses the case where the fourth non-ASCII character
            # is the last character of the string.
            if non_english_counter > 3:
                return False

    return True

# Try isEnglish on a few sample app names and print each verdict
sample_names = (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
)
for sample_name in sample_names:
    print(isEnglish(sample_name))

True
False
True
True


In [11]:
def filter_English_Apps(data_set, index_of_name):
    """Return the rows whose app name passes the isEnglish check.

    Args:
        data_set: Iterable of rows.
        index_of_name: Position of the app name inside each row.

    Returns:
        A new list with the accepted rows, in their original order.
    """
    return [record for record in data_set if isEnglish(record[index_of_name])]

# The app name is column 0 in the Google data and column 1 in the Apple data.
android_english_apps = filter_English_Apps(android_clean, index_of_name=0)
ios_english_apps = filter_English_Apps(Apple_data_set, index_of_name=1)

# Preview each filtered data set under its own banner.
print("-----Android Apps-----", end="\n\n")
explore_data(android_english_apps, start=0, end=3, rows_and_columns=True)
print("", "-----Ios Apps-----", "", sep="\n")
explore_data(ios_english_apps, start=0, end=3, rows_and_columns=True)

-----Android Apps-----

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9616
Number of columns: 13

-----Ios Apps-----

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '21308

## Collecting only free apps in data set

In [23]:
def filter_freeApps(data_set, index_of_price):
    """Return the rows whose price field marks the app as free.

    Args:
        data_set: Iterable of rows.
        index_of_price: Position of the price field inside each row.

    Returns:
        A new list with the rows whose price is exactly '0' or '0.0',
        in their original order.
    """
    free_price_labels = ('0', '0.0')
    return [record for record in data_set
            if record[index_of_price] in free_price_labels]

# The price is column 7 in the Google data and column 4 in the Apple data.
Free_English_android_apps = filter_freeApps(android_english_apps, index_of_price=7)
Free_English_ios_apps = filter_freeApps(ios_english_apps, index_of_price=4)

# Preview both results, separated by a blank gap.
explore_data(Free_English_android_apps, start=0, end=3, rows_and_columns=True)
print("\n")
explore_data(Free_English_ios_apps, start=0, end=3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8866
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 

To minimize cost, we need our app to work on Google Play and preferably on the App Store too. The more traffic an app attracts, the higher its revenue.

In [36]:
def create_frequency_table(data_set, index):
    """Count how often each value occurs in one column of the data set.

    Args:
        data_set: Iterable of rows.
        index: Position of the column to count.

    Returns:
        A dict mapping each distinct column value to its number of
        occurrences, with keys in order of first appearance.
    """
    table = {}
    for row in data_set:
        key = row[index]
        # dict.get supplies 0 for a value seen for the first time.
        table[key] = table.get(key, 0) + 1

    return table

def display_proportion_of_apps(data_set, index):
    """Print the percentage share of each value in one column, largest first.

    Args:
        data_set: Sequence of rows.
        index: Position of the column to summarise.

    Each line is printed as ``<value> : <percentage>``, with the percentage
    rounded to two decimals.
    """
    counts = create_frequency_table(data_set, index)

    # Turn raw counts into (value, percentage) pairs.
    shares = [
        (label, round(count / len(data_set) * 100, 2))
        for label, count in counts.items()
    ]

    for label, share in sorted(shares, key=lambda pair: pair[1], reverse=True):
        print(label + " : " + str(share))
        
# Share of free English iOS apps per genre (column 11 is 'prime_genre').
print("Distribution of Apps based on different categories in App Store","\n")
display_proportion_of_apps(Free_English_ios_apps, index=11)

Distribution of Apps based on different categories in App Store 

Games : 57.85
Entertainment : 7.93
Photo & Video : 4.92
Education : 3.63
Social Networking : 3.26
Shopping : 2.67
Utilities : 2.52
Sports : 2.12
Music : 2.03
Health & Fitness : 2.0
Productivity : 1.75
Lifestyle : 1.6
News : 1.32
Travel : 1.29
Finance : 1.26
Weather : 0.89
Food & Drink : 0.89
Reference : 0.55
Business : 0.55
Book : 0.46
Navigation : 0.18
Medical : 0.18
Catalogs : 0.12


In [37]:
# Share of free English Android apps per category (column 1 is 'Category').
print("Distribution of Apps based on different categories in Play Store","\n")
display_proportion_of_apps(Free_English_android_apps, index=1)

Distribution of Apps based on different categories in Play Store 

FAMILY : 18.9
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.91
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.39
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.33
SHOPPING : 2.24
BOOKS_AND_REFERENCE : 2.15
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.92
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


In [41]:
# Average of column 5 ('rating_count_tot') per genre for the free English
# iOS apps. Index -5 addresses the genre column counted from the row's end.
genre_freq = create_frequency_table(Free_English_ios_apps, -5)
for genre, app_count in genre_freq.items():
    total = 0
    for row in Free_English_ios_apps:
        # Only rows belonging to the current genre contribute to its total.
        if row[-5] == genre:
            total += float(row[5])

    average = total / app_count
    print(genre + ":" + str(average))

Social Networking:71548.34905660378
Photo & Video:28441.54375
Games:22691.801806588734
Music:57326.530303030304
Reference:74942.11111111111
Health & Fitness:23298.015384615384
Weather:50477.137931034486
Utilities:18460.353658536584
Travel:26925.166666666668
Shopping:25996.32183908046
News:21248.023255813954
Navigation:86090.33333333333
Lifestyle:16168.73076923077
Entertainment:13831.282945736433
Food & Drink:29885.758620689656
Sports:23008.898550724636
Book:37217.73333333333
Finance:27638.243902439026
Education:7003.983050847458
Productivity:20702.19298245614
Business:7075.333333333333
Catalogs:4004.0
Medical:612.0


In [55]:
# Average number of installs per category for the free English Android apps.
# Column 1 is the category; column 5 is the install count as text such as
# '10,000+'.
category_frequency = create_frequency_table(Free_English_android_apps,1)


for category in category_frequency:
    total = 0
    for row in Free_English_android_apps:
        category_app = row[1]

        if category == category_app:
            # Strip the '+' suffix and thousands separators before converting.
            n_installs = float(row[5].replace('+', '').replace(',', ''))
            # The original line was an unterminated round( call, which is a
            # syntax error; add the parsed value directly.
            total += n_installs

    average = total / category_frequency[category]
    print(category, " : ", str(average))

ART_AND_DESIGN  :  1986335.0877192982
AUTO_AND_VEHICLES  :  647317.8170731707
BEAUTY  :  513151.88679245283
BOOKS_AND_REFERENCE  :  8721959.47643979
BUSINESS  :  1712290.1474201474
COMICS  :  817657.2727272727
COMMUNICATION  :  38456119.167247385
DATING  :  854028.8303030303
EDUCATION  :  1833495.145631068
ENTERTAINMENT  :  11640705.88235294
EVENTS  :  253542.22222222222
FINANCE  :  1387692.475609756
FOOD_AND_DRINK  :  1924897.7363636363
HEALTH_AND_FITNESS  :  4188821.9853479853
HOUSE_AND_HOME  :  1331540.5616438356
LIBRARIES_AND_DEMO  :  638503.734939759
LIFESTYLE  :  1433701.5244956773
GAME  :  15588015.603248259
FAMILY  :  3695641.8198090694
MEDICAL  :  120550.61980830671
SOCIAL  :  23253652.127118643
SHOPPING  :  7036877.311557789
PHOTOGRAPHY  :  17840110.40229885
SPORTS  :  3638640.1428571427
TRAVEL_AND_LOCAL  :  13984077.710144928
TOOLS  :  10801391.298666667
PERSONALIZATION  :  5201482.6122448975
PRODUCTIVITY  :  16787331.344927534
PARENTING  :  542603.6206896552
WEATHER  :  507