Open the Apple App Store dataset and split it into a header row and a list of data rows:

In [1]:
from csv import reader
# Read the whole App Store CSV into memory. The `with` block closes the file
# handle once the rows are materialised; newline='' is what the csv module
# expects so that quoted fields containing line breaks are parsed correctly.
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)

# The first CSV row holds the column names; everything after it is data.
ios_header = ios[0]
ios = ios[1:]

Open the Google Play dataset and split it into a header row and a list of data rows:

In [2]:
# Read the whole Google Play CSV into memory. The `with` block closes the file
# handle once the rows are materialised; newline='' is what the csv module
# expects so that quoted fields containing line breaks are parsed correctly.
with open('Googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    google = list(read_file)

# The first CSV row holds the column names; everything after it is data.
google_header = google[0]
google = google[1:]

Function to rapidly explore a dataset:

In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its overall dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print (slice start).
        end: Index one past the last row to print (slice end).
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row.

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print() appends its own newline, so this leaves two blank lines

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

Let's look at the Apple dataset header:

In [4]:
# Bare expression so the notebook renders the iOS header row (the first CSV row, split off at load time).
ios_header

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

Let's take a look at the first five rows of actual data from the iOS dataset:

In [5]:
# Print iOS rows 0-4, then the dataset's total row count and column count.
explore_data(ios,0,5,True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


Let's do the same for the Google Play dataset:

In [6]:
# Bare expression so the notebook renders the Google Play header row (the first CSV row, split off at load time).
google_header

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [7]:
# Print Google Play rows 0-4, then the dataset's total row count and column count.
explore_data(google,0,5,True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 10841
Number of columns: 13


We now know that Apple dataset contains:
Number of rows: 7197
Number of columns: 16

While GooglePlay dataset contains:
Number of rows: 10841
Number of columns: 13

We can now loop through each dataset to find rows whose number of columns differs from the header, i.e. rows with missing or extra fields:

In [8]:
# Expected number of fields per row, taken from each header.
ios_hl = len(ios_header)
google_hl = len(google_header)

# Report every row whose field count differs from its header, with its position.
# enumerate() yields the row's real position; list.index(row) would return the
# first row *equal* to it (wrong when rows are duplicated) and rescan the list
# on every hit.
for position, row in enumerate(ios):
    if len(row) != ios_hl:
        print(row)
        print(position)

for position, row in enumerate(google):
    if len(row) != google_hl:
        print(row)
        print(position)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472


We can see that the Google Play dataset has one malformed row, at index 10472: it has 12 values instead of the expected 13, so one column is missing.
We can either remove this record or look up the missing value; here we remove it.

In [9]:
# Row 10472 is the one reported above with fewer fields than the header.
# Delete it only while it is still malformed, so that re-running this cell
# cannot silently remove the valid row that takes its place.
if len(google[10472]) != len(google_header):
    del google[10472]

Let's verify that the record is deleted:

In [10]:
# Re-scan for rows whose field count differs from the header; no output means
# the dataset is clean. enumerate() gives each row's real position, whereas
# list.index(row) would return the first equal row and rescan the list.
for position, row in enumerate(google):
    if len(row) != google_hl:
        print(row)
        print(position)

Let's look for duplicates in each dataset:

Now let's write a function that will check for duplicates in a given dataset:

In [11]:
def check_unique(dataset, location):
    """Count distinct and repeated values in one column of a dataset.

    The first occurrence of each value counts as unique; every later
    occurrence is recorded as a duplicate. The dataset is not modified.

    Args:
        dataset: Iterable of rows (each row an indexable sequence).
        location: Index of the column to inspect; its values must be hashable.

    Returns:
        List of the duplicated values, one entry per repeated occurrence, in
        the order they were encountered.
    """
    seen = set()  # set membership is O(1); a list made the scan quadratic
    dupl = []
    for row in dataset:
        app_name = row[location]
        if app_name in seen:
            dupl.append(app_name)
        else:
            seen.add(app_name)

    print('Number of unique apps is ' + str(len(seen)))
    print('Number of duplicate apps is ' + str(len(dupl)))
    print(len(dupl))  # same figure again, kept so the cell's output is unchanged
    return dupl

# Column 0 of the Google Play rows is 'App' (the app name); keep the returned duplicate names for inspection.
google_duplicated=check_unique(google,0)
        


Number of unique apps is 9659
Number of duplicate apps is 1181
1181


In [12]:
# Peek at the first five duplicated app names.
print(google_duplicated[0:5])

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


In [13]:
# NOTE(review): column 0 of the iOS rows is 'id', not 'track_name' (column 1), so this
# counts repeated ids rather than repeated app names — confirm that is the intent.
apple_dupl=check_unique(ios,0)

Number of unique apps is 7197
Number of duplicate apps is 0
0


Let's find a way to remove all duplicates in the Google Play dataset. To do this, we could modify our `check_unique` function:

In [14]:
def check_unique_delete(dataset, location):
    """Remove rows with a repeated value in one column, modifying the list in place.

    The first row seen for each value is kept; every later row with the same
    value is dropped. Row order is preserved.

    Args:
        dataset: Mutable list of rows (each row an indexable sequence).
        location: Index of the column to de-duplicate on; its values must be
            hashable.

    Returns:
        None. The counts are printed and ``dataset`` is updated in place.
    """
    seen = set()
    kept = []
    dupl = []
    for row in dataset:
        app_name = row[location]
        if app_name in seen:
            dupl.append(app_name)
        else:
            seen.add(app_name)
            kept.append(row)

    # Replace the contents in one step. Deleting from the list while iterating
    # over it skips the element after each deletion, and dataset.index(row)
    # could remove an earlier, identical row instead of the current one.
    dataset[:] = kept

    print('Number of unique apps is ' + str(len(seen)))
    print('Number of duplicate apps is ' + str(len(dupl)))
    print(len(dupl))

In [15]:
# Show every Google Play row whose app name (column 0) is 'ZOOM Cloud Meetings',
# to see what the duplicated entries look like.
for app in google:
    if app[0] != 'ZOOM Cloud Meetings':
        continue
    print(app)

['ZOOM Cloud Meetings', 'BUSINESS', '4.4', '31614', '37M', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 20, 2018', '4.1.28165.0716', '4.0 and up']
['ZOOM Cloud Meetings', 'BUSINESS', '4.4', '31614', '37M', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 20, 2018', '4.1.28165.0716', '4.0 and up']


Let's find, for every app name, the highest number of reviews among its entries (the `Reviews` column, i.e. the fourth column, index 3):
1. Entries that are duplicates of the same app are compared by their review count.
2. Only the highest review count for each app is retained, stored in the `review_max` dictionary.

In [16]:
# Map each Google Play app name (column 0) to the largest review count
# (column 3, parsed as float) found among its rows.
review_max = {}

for app in google:
    name = app[0]
    n_reviews = float(app[3])

    # Store the count on first sight of a name, or when it beats the stored one.
    if name not in review_max or review_max[name] < n_reviews:
        review_max[name] = n_reviews
        

In [17]:
# Number of distinct app names recorded in review_max.
print(len(review_max))

9659


We now need to drop every duplicate entry whose review count is lower than the maximum stored for that app in the `review_max` dictionary.

To do this, we'll first create two empty lists: `google_clean`, which will hold the Google Play dataset with only unique entries, and `already_added`, which records the names of the apps we have already added to `google_clean`. If an app is already in `already_added`, we skip it; this handles duplicates that share the same (maximum) review count.

This way we will loop through entire dataset while matching it to the unique app names in our dictionary. 

In [18]:
# Keep one row per app name: the first row whose review count equals the
# maximum recorded for that name in review_max.
google_clean = []
already_added = []

for app in google:
    name = app[0]
    n_reviews = float(app[3])

    # Skip names already kept, and rows that are not the top-reviewed entry.
    if name in already_added or review_max[name] != n_reviews:
        continue

    google_clean.append(app)
    already_added.append(name)
        

In [19]:
# Bare expression so the notebook renders the first row kept in google_clean.
google_clean[0]

['Photo Editor & Candy Camera & Grid & ScrapBook',
 'ART_AND_DESIGN',
 '4.1',
 '159',
 '19M',
 '10,000+',
 'Free',
 '0',
 'Everyone',
 'Art & Design',
 'January 7, 2018',
 '1.0.0',
 '4.0.3 and up']

In [20]:
# Print the iOS column names and the first data row side by side to locate
# the name and rating-count columns by position.
print(ios_header)
print('\n')
print(ios[0])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


In [21]:
# Map each iOS app name ('track_name', column 1) to the largest total rating
# count ('rating_count_tot', column 5, parsed as float) found among its rows.
# This rebinds review_max, replacing the mapping built earlier for Google Play.
review_max = {}

for app in ios:
    name = app[1]
    n_reviews = float(app[5])

    # Store the count on first sight of a name, or when it beats the stored one.
    if name not in review_max or review_max[name] < n_reviews:
        review_max[name] = n_reviews

print(len(review_max))

7195


In [22]:
# Keep one row per iOS app name: the first row whose rating count equals the
# maximum recorded for that name in review_max.
ios_clean = []
already_added = []

for app in ios:
    name = app[1]
    n_reviews = float(app[5])

    # Skip names already kept, and rows that are not the top-rated entry.
    if name in already_added or review_max[name] != n_reviews:
        continue

    ios_clean.append(app)
    already_added.append(name)

In [23]:
# Number of rows kept after de-duplication, followed by the first two of them.
print(len(ios_clean))
print(ios_clean[0:2])

7195
[['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']]


***So far we have success!!!***

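Now let's scan the datasets to make sure we only keep English apps (apps whose names are not written in English will be set aside).

Let's create a function that counts the characters in a name whose code point is greater than 127.
Characters with code points up to 127 are ASCII, which covers standard English text. Because English names can still contain a few symbols such as ™ or emoji, a name is treated as non-English only if it has more than two non-ASCII characters.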

In [24]:
def en_check(string, max_non_ascii=2):
    """Return whether a string looks like English text.

    A character counts as non-ASCII when its code point is greater than 127.
    The string is accepted as long as it contains at most ``max_non_ascii``
    such characters.

    Args:
        string: Text to inspect, e.g. an app name.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 2, the threshold this notebook has always used.

    Returns:
        True if the string has no more than ``max_non_ascii`` non-ASCII
        characters, otherwise False.
    """
    non_ascii = sum(1 for char in string if ord(char) > 127)
    return non_ascii <= max_non_ascii
            

In [28]:
# Split the de-duplicated app names into English and non-English groups using
# en_check().
# NOTE(review): only the app *name* is stored in these lists, not the full row —
# confirm later steps do not need the other columns from them.
google_foreign = []
google_english = []

for row in google_clean:
    name = row[0]  # 'App' column
    if en_check(name):
        google_english.append(name)
    else:
        google_foreign.append(name)

ios_foreign = []
ios_english = []

for row in ios_clean:
    name = row[1]  # 'track_name' column
    if en_check(name):
        ios_english.append(name)
    else:
        ios_foreign.append(name)

In [29]:
# Report how many Google Play app names fell into each group.
print('Number of English apps in Google App is ' + str(len(google_english)))
print('\n')
print('Number of Foreign apps in Google App is ' + str(len(google_foreign)))

Number of English apps in Google App is 9597


Number of Foreign apps in Google App is 62


Repeat the above steps for the iOS apps:


In [30]:
# Report how many iOS app names fell into each group.
print('Number of English apps in IOS App is ' + str(len(ios_english)))
print('\n')
print('Number of Foreign apps in IOS App is ' + str(len(ios_foreign)))

Number of English apps in IOS App is 6153


Number of Foreign apps in IOS App is 1042


Let's find and isolate all free apps from non-free apps. 