# Title

This is the goal of this project

## Define global methods

### openFile()
Return the rows of the CSV file at the provided path as a list (one list of strings per row).

In [75]:
def openFile(withPath):
    """Read the CSV file at `withPath` and return its rows.

    Parameters:
        withPath: path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is a list of the
        record's fields as strings. The header row, if any, is included.

    Raises:
        OSError: if the file cannot be opened.
    """
    from csv import reader

    # The context manager closes the file even if parsing fails.
    # newline="" is what the csv module expects, so that line breaks inside
    # quoted fields are preserved; the explicit encoding keeps non-ASCII
    # text readable regardless of the platform's default encoding.
    with open(withPath, encoding="utf-8", newline="") as openedFile:
        return list(reader(openedFile))

### exploreData()
Use this method to explore the dataset and optionally print the number of rows and columns.

In [98]:
def exploreData(dataset, start, end, rowsAndColumns=False):
    """Print the rows of `dataset[start:end]` and, optionally, the dataset's size.

    Parameters:
        dataset: list of rows (each row a list).
        start: index of the first row to print.
        end: index one past the last row to print.
        rowsAndColumns: when True, also print the number of rows in `dataset`
            and the number of columns in its first row.

    Returns:
        None; everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        # print() adds its own line break, so this leaves two blank lines after each row
        print('\n')

    if rowsAndColumns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure: report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

### findDuplicates()
Find duplicates in the given dataset. A row counts as a duplicate when its app name has already been seen, so the method requires the index of the column that holds the app name.

In [115]:
def findDuplicates(inDataset, appNameIndex):
    """Report rows whose app name already appeared earlier in the dataset.

    Prints up to the first three duplicate rows (each followed by "---"),
    then the number of distinct app names and the number of duplicate rows.

    Parameters:
        inDataset: list of rows (each row a list), without a header row.
        appNameIndex: index of the column that holds the app name.

    Returns:
        None; everything is written to standard output.
    """
    print("Finding duplicates and printing the first rows if applicable")

    # A set makes each "seen before?" check O(1); with a list the whole scan
    # is quadratic in the number of rows.
    uniqueApps = set()
    duplicateApps = []

    for row in inDataset:
        appName = row[appNameIndex]

        if appName in uniqueApps:
            duplicateApps.append(row)
        else:
            uniqueApps.add(appName)

    # Slicing never raises, so [:3] covers both "more than three" and
    # "three or fewer" duplicates.
    for row in duplicateApps[:3]:
        print(row)
        print("---")

    print("Out of " + str(len(uniqueApps)) + " apps")
    print("We found " + str(len(duplicateApps)) + " duplicates")

### dictionaryWithAppnamesAndReviewsCount()
This method generates a dictionary with application names as keys and the highest review count found for each application as values.
To do this it requires the column index of the app name and the column index of the review count in the dataset.

```
{
    appname_1_string : review_amount_x_integer,
    appname_2_string : review_amount_x_integer,
    ...
}
```

In [77]:
def dictionaryWithAppnamesAndReviewsCount(fromDataset, appNameIndex, reviewsCountIndex):
    """Map every app name to the largest review count found among its rows.

    Parameters:
        fromDataset: list of rows (each row a list), without a header row.
        appNameIndex: index of the column that holds the app name.
        reviewsCountIndex: index of the column that holds the review count;
            the value is converted with int().

    Returns:
        A dict of app name -> highest review count (int).
    """
    highestCounts = {}

    for entry in fromDataset:
        name = entry[appNameIndex]
        count = int(entry[reviewsCountIndex])

        # Stored values are always ints, so None can only mean "name not seen yet".
        best = highestCounts.get(name)
        if best is None or best < count:
            highestCounts[name] = count

    return highestCounts

### generateCleanDataset()
Recreate a new dataset from an existing dataset. 

Internally calls dictionaryWithAppnamesAndReviewsCount() to have a reference of the highest number of reviews for each app. This is used as the criterion for removing duplicate data: for each app we keep the row with the highest count.

In [78]:
def generateCleanDataset(fromDataset, appNameIndex, reviewsCountIndex):
    """Build a new list that keeps exactly one row per app name.

    For each app the first row whose review count equals that app's highest
    review count is kept; all other rows for the app are dropped.

    Parameters:
        fromDataset: list of rows (each row a list), without a header row.
        appNameIndex: index of the column that holds the app name.
        reviewsCountIndex: index of the column that holds the review count;
            the value is converted with int().

    Returns:
        A new list containing the kept rows, in their original order.
    """
    cleanDataset = []
    # A set keeps the "already added?" check O(1) per row; a list would make
    # the whole pass quadratic in the number of rows.
    alreadyAdded = set()
    maxReviewDictionary = dictionaryWithAppnamesAndReviewsCount(fromDataset, appNameIndex, reviewsCountIndex)

    for row in fromDataset:
        appName = row[appNameIndex]
        reviewsCount = int(row[reviewsCountIndex])

        # alreadyAdded is needed because several rows of one app can share
        # the same (maximum) review count.
        if appName not in alreadyAdded and reviewsCount == maxReviewDictionary[appName]:
            cleanDataset.append(row)
            alreadyAdded.add(appName)

    return cleanDataset

## Apple Store

A section exploring the Apple Store dataset. We print a few rows and columns.

In [79]:
# Load the Apple Store CSV as a list of rows; the row at index 0 holds the column names.
appleStoreData = openFile("datasets/AppleStore.csv")

### Printing the first 2 rows (without header)

In [80]:
# [1:] drops the header row; print the first two data rows, then the row and column counts.
exploreData(appleStoreData[1:], 0, 2, True)

['1', '281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']


['2', '281796108', 'Evernote - stay organized', '158578688', 'USD', '0', '161065', '26', '4', '3.5', '8.2.2', '4+', 'Productivity', '37', '5', '23', '1']


Number of rows: 7197
Number of columns: 17


### All column names for the Apple Store dataset
doc: https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps

In [81]:
# The [:1] slice yields only the header row, so the list of column names is printed once.
for headerRow in appleStoreData[:1]:
    print(headerRow)

['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


## Google Play Store

A section exploring the Google Play dataset. We print a few rows and columns.

We also remove the duplicate applications from the dataset and recreate a new dataset with unique apps.

In [82]:
# Load the Google Play CSV as a list of rows; the row at index 0 holds the column names.
googlePlayStoreData = openFile("datasets/GooglePlayStore.csv")

### Printing the first 3 rows (without header)

In [83]:
# [1:] drops the header row; print the first three data rows, then the row and column counts.
exploreData(googlePlayStoreData[1:], 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


### All column names for the Google Play Store dataset

doc: https://www.kaggle.com/datasets/lava18/google-play-store-apps

In [84]:
# The [:1] slice yields only the header row, so the list of column names is printed once.
for headerRow in googlePlayStoreData[:1]:
    print(headerRow)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


### Incorrect row detected. Removing the row at index 10473 (index counted with the header row included)

In [85]:
# The row at index 10473 (index 0 is the header) has fewer fields than the
# header. Delete it only while it is still malformed: after the first run a
# different row occupies this index, and re-running the cell must not remove it.
incorrectRowIndex = 10473
if len(googlePlayStoreData[incorrectRowIndex]) != len(googlePlayStoreData[0]):
    print(googlePlayStoreData[incorrectRowIndex])
    del googlePlayStoreData[incorrectRowIndex]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


### Search for duplicate application entries

In [116]:
# Column 0 is 'App' (the app name); [1:] skips the header row so it is not counted as an app.
findDuplicates(googlePlayStoreData[1:], 0)

Finding duplicates and printing the first rows if applicable
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
---
['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device']
---
['Google My Business', 'BUSINESS', '4.4', '70991', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 24, 2018', '2.19.0.204537701', '4.4 and up']
---
Out of 9659 apps
We found 1181 duplicates


### Example of a duplicate application - Google My Business

In [117]:
# Print every row whose app name (column 0) is exactly "Google My Business".
for row in googlePlayStoreData:
    appName = row[0]

    if appName != "Google My Business":
        continue

    print(row)

['Google My Business', 'BUSINESS', '4.4', '70991', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 24, 2018', '2.19.0.204537701', '4.4 and up']
['Google My Business', 'BUSINESS', '4.4', '70991', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 24, 2018', '2.19.0.204537701', '4.4 and up']
['Google My Business', 'BUSINESS', '4.4', '70991', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 24, 2018', '2.19.0.204537701', '4.4 and up']


### Removing the duplicate applications

Index 3 displays the review count. We will use this integer to determine which row we want to keep. The highest count should point to the most recent data.

We store our clean dataset in a new list (`cleanGoogleDataset`).

In [93]:
# Keep one row per app name (column 0): the first row whose review count (column 3)
# equals that app's highest review count. [1:] skips the header row.
cleanGoogleDataset = generateCleanDataset(fromDataset=googlePlayStoreData[1:], appNameIndex=0, reviewsCountIndex=3)

Loop through the original dataset again and only append to the clean data set if

1. the application has not been added yet
<br/>`appName not in alreadyAdded`

2. the row has the highest review count for that app
<br/>`reviewsCount == maxReviewDictionary[appName]`

### Check for duplicates in the Apple Store dataset

In [118]:
# Column 2 is 'track_name' (the app name); [1:] skips the header row so it is not counted as an app.
findDuplicates(appleStoreData[1:], 2)

Finding duplicates and printing the first rows if applicable
['7579', '1089824278', 'VR Roller Coaster', '240964608', 'USD', '0', '67', '44', '3.5', '4', '0.81', '4+', 'Games', '38', '0', '1', '1']
---
['10885', '1178454060', 'Mannequin Challenge', '59572224', 'USD', '0', '105', '58', '4', '4.5', '1.0.1', '4+', 'Games', '38', '5', '1', '1']
---
Out of 7195 apps
We found 2 duplicates


### Removing duplicates again
We use the same criteria and select the row with the highest review count.

```
app name index = 2
reviews count index = 6
```

In [96]:
# Keep one row per app name (column 2, 'track_name'): the first row whose review count
# (column 6, 'rating_count_tot') equals that app's highest count. [1:] skips the header row.
cleanAppleStoreDataset = generateCleanDataset(fromDataset=appleStoreData[1:], appNameIndex=2, reviewsCountIndex=6)

7195
