# Profitable App Profiles for the App Store and Google Play Markets

### Analyse mobile app data
#### - *The project is about....*
#### - *The goal of the project is....*

### Open and explore data

In [11]:
from csv import reader
# Open and read data
def read_data(file_name, header=True):
    """Read a CSV file into a list of rows.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it is decoded as UTF-8.
    header : bool, optional
        When True (default), the first row is split off as the header.

    Returns
    -------
    tuple or list
        ``(header_row, data_rows)`` when ``header`` is True, otherwise the
        complete list of rows (each row is a list of strings).

    Raises
    ------
    IndexError
        If ``header`` is True and the file contains no rows.
    """
    # The with-block closes the file handle as soon as the rows are loaded.
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handed to the reader unchanged.
    with open(file_name, encoding='utf8', newline='') as open_file:
        dataset = list(reader(open_file))
    if header:  # The header is extracted
        return dataset[0], dataset[1:]
    return dataset

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset size.

    Parameters
    ----------
    dataset : list of list
        Rows to inspect; no header row is expected here, since the first row
        is used to count the columns.
    start, end : int
        Slice bounds of the rows to print (``end`` is exclusive).
    rows_and_columns : bool, optional
        When True, also print a summary with the number of rows and columns.
    """
    for row in dataset[start:end]:
        print(row)
    if rows_and_columns:
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) else 0
        print("Summary: ")
        print("Number of rows: ", len(dataset))
        print("Number of columns: ", n_columns)

# Google Play: read_data returns a (header, rows) pair for googleplaystore.csv
android = read_data("googleplaystore.csv")
android_header, android_file = android
print(android_header)
explore_data(android_file, 0, 4, True)  # first 4 rows of android_file
print("-" * 100)

# App Store: same steps for AppleStore.csv
ios = read_data("AppleStore.csv")
ios_header, ios_file = ios
print(ios_header)
explore_data(ios_file, 0, 6, True)  # first 6 rows of ios_file

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']
Summary: 
Number of rows:  10841
Number of columns:  13
-----------------------------------------------------------------

### Delete Wrong Data (Data cleaning)

In [15]:
# Google Play CSV: row 10472 is malformed. It has one field fewer than the
# header (the 'Category' value appears to be missing), so the remaining values
# are shifted left and the 'Rating' column reads 19 although the maximum
# rating is 5. This row must be deleted.
print(android_header)
print(android_file[10472])
print(len(android_file))
# Delete only while the row at this index is still the short, malformed one.
# That keeps the cell safe to re-run: an unconditional `del` would remove a
# valid row on every extra execution, while skipping the delete leaves a row
# whose 'Reviews' field ('3.0M') cannot be converted to a number later on.
if len(android_file[10472]) != len(android_header):
    del android_file[10472]
print(len(android_file))

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10841
10840


### Remove duplicate values

In [16]:
# Count how many duplicate values and unique values there are
def duplicate_unique(file):
    """Split the app names of a dataset into repeats and first occurrences.

    Parameters
    ----------
    file : iterable of sequence
        Dataset rows; element 0 of each row is taken as the app name and
        must be hashable (CSV fields are strings).

    Returns
    -------
    tuple of (list, list)
        ``(duplicate, unique)``: ``duplicate`` holds a name once for every
        repeated occurrence after the first, ``unique`` holds each distinct
        name once, both in order of appearance.
    """
    duplicate = []
    unique = []
    seen = set()  # constant-time membership test; `unique` keeps the order
    for app in file:
        name = app[0]
        if name in seen:
            duplicate.append(name)
        else:
            seen.add(name)
            unique.append(name)
    return duplicate, unique

# Scan the dataset once and reuse the result for both print-outs instead of
# calling duplicate_unique() twice.
android_duplicates = duplicate_unique(android_file)[0]
print('Number of duplicate apps:', len(android_duplicates))
print('\n')
print('Examples of duplicate apps:', android_duplicates[:15])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [17]:
# Build a dictionary {app name: highest number of reviews seen for that app}.
# It is used to keep, for every duplicated app, only the entry with the most reviews.
reviews_max = {}
# Look the column up by name rather than hard-coding its position.
reviews_idx = android_header.index('Reviews')
for app in android_file:
    name = app[0]  # current app
    n_reviews = float(app[reviews_idx])  # review count of the current row
    # Store the first value seen for a name, then only larger ones.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

# Sanity check: one entry per distinct app name is expected. Counting the
# distinct names (= number of rows minus number of duplicate rows) avoids a
# hard-coded duplicate count that goes stale when the dataset changes.
expected_length = len({app[0] for app in android_file})
print("Actual length: ", len(reviews_max))
print("Expected length: ", expected_length)

Actual length:  9659
Expected length:  9659


In [18]:
# Remove duplicate entries from the Android dataset using the reviews_max
# dictionary: build a new dataset that keeps, for every app name, only one row,
# namely the first row whose review count equals the highest count for that app.
android_clean = []  # rows of the de-duplicated Android dataset
already_add = []  # names already added to android_clean, in insertion order
added_names = set()  # same names as already_add, for constant-time lookups
reviews_idx = android_header.index('Reviews')  # loop-invariant, computed once

for app in android_file:
    name = app[0]
    n_reviews = float(app[reviews_idx])
    # The membership test matters when several rows of one app share the same
    # (maximum) review count: only the first of them is kept.
    if reviews_max[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_add.append(name)
        added_names.add(name)

print(len(android_clean))
explore_data(android_clean, 0, 3, True)

9659
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']
Summary: 
Number of rows:  9659
Number of columns:  13


### Remove non-English apps

In [19]:
# Decide from the app name whether an app is English.
# A name counts as English only if every character is in the ASCII range (0-127).

def is_english(string):
    """Return True when every character of ``string`` has a code point <= 127."""
    return all(ord(symbol) <= 127 for symbol in string)

# Try the check on a plain name, a non-English name, and two English names
# that carry a special symbol or an emoji.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(is_english(sample_name))

True
False
False
False


In [20]:
# However, some English app names contain an extra icon or symbol, so the strict
# check above would accidentally remove English apps and lose data.
# Therefore an app is treated as non-English only if its name contains more
# than 3 non-ASCII characters.
def is_english(string, max_non_ascii=3):
    """Return True when ``string`` has at most ``max_non_ascii`` non-ASCII characters.

    Parameters
    ----------
    string : str
        Text to check, e.g. an app name.
    max_non_ascii : int, optional
        Largest number of characters with a code point above 127 that is
        still accepted (default 3).

    Returns
    -------
    bool
        False when the count of non-ASCII characters exceeds
        ``max_non_ascii``, True otherwise.
    """
    non_ascii = sum(1 for char in string if ord(char) > 127)
    return non_ascii <= max_non_ascii

for sample_name in ('Docs To Go™ Free Office Suite', 'Instachat 😜'):
    print(is_english(sample_name))

# Both of these apps now remain in the English app list

True
True


In [None]:
# Filter the English applications in both the Google Play and App Store datasets.
# Google Play: start from android_clean (duplicates already removed); the app
# name is in column 0.
android_english = [app for app in android_clean if is_english(app[0])]

# App Store: column 1 is used as the app name here.
# The matching rows are collected in ios_english. Appending them to ios_file
# itself, as this cell did before, grows the list while it is being iterated,
# so the loop never terminates and ios_english stays empty.
ios_english = [app for app in ios_file if is_english(app[1])]

print(len(ios_english))
print(len(android_english))