# Analyzing data from the Google Play Store



Definition of the class which loads the data set and returns it with all the data filtered for the analysis


In [132]:
from csv import reader

class AppData:
    """Load the Google Play CSV and keep only the rows used in the analysis.

    On construction the file is read, the header row is stored in
    ``android_header`` and the remaining rows in ``android_data``. The rows
    are then passed through clean_data(), remove_duplicates(),
    filter_if_is_english() and free_apps(), in that order.
    """

    def __init__(self, open_file):
        """Read the CSV file and run the cleaning pipeline.

        Parameters:
        open_file (str): Path of the CSV file. Despite the name, this value is
            handed to open(); it is not an already-open file object.
        """
        # The context manager closes the handle as soon as the rows are read.
        # newline="" lets the csv reader do its own line-ending handling.
        with open(open_file, encoding="utf8", newline="") as opened_file:
            rows = list(reader(opened_file))

        # An empty file yields an empty header and no data rows
        self.android_header = rows[0] if rows else []
        self.android_data = rows[1:]

        self.clean_data()
        self.remove_duplicates()
        self.filter_if_is_english()
        self.free_apps()

    def explore_data(self, start, end, rows_and_columns=False):
        """Print the rows in ``android_data[start:end]``, one blank gap after each.

        Parameters:
        start (int): Index of the first row to print.
        end (int): Index one past the last row to print.
        rows_and_columns (bool): When true, also print the number of rows and
            the number of columns (taken from the first row; 0 if no rows).
        """
        for row in self.android_data[start:end]:
            print(row)
            print('\n')  # adds a new (empty) line between rows

        if rows_and_columns:
            n_columns = len(self.android_data[0]) if self.android_data else 0
            print('Number of rows:', len(self.android_data))
            print('Number of columns:', n_columns)

    def delete(self, index):
        """Remove the row of data at the given index."""
        del self.android_data[index]

    def clean_data(self):
        """Delete every row whose rating (column 2) is greater than 5.

        Raises:
        ValueError: If a rating cannot be converted to float.
        """
        # Filtering in one pass examines every row, including a row that
        # directly follows a deleted one. `not (rating > 5)` rather than
        # `rating <= 5` so that 'NaN' ratings are kept.
        self.android_data[:] = [
            row for row in self.android_data if not float(row[2]) > 5
        ]

    def remove_duplicates(self):
        """Keep one row per app name: the first row with the most reviews.

        The name is read from column 0 and the review count from column 3.
        """
        # Highest review count seen for each app name
        reviews_max = {}
        for app in self.android_data:
            name = app[0]
            n_reviews = float(app[3])
            if name not in reviews_max or reviews_max[name] < n_reviews:
                reviews_max[name] = n_reviews

        android_clean = []
        # A set keeps the "already kept this name" check constant-time
        already_added = set()
        for app in self.android_data:
            name = app[0]
            if name not in already_added and reviews_max[name] == float(app[3]):
                android_clean.append(app)
                already_added.add(name)
        self.android_data = android_clean

    def is_english(self, string):
        """Return True when the string has at most 3 non-ASCII characters.

        Up to three characters with a code point above 127 are tolerated.
        """
        non_ascii = sum(1 for character in string if ord(character) > 127)
        return non_ascii <= 3

    def filter_if_is_english(self):
        """Keep only the rows whose name (column 0) passes is_english()."""
        self.android_data = [
            app for app in self.android_data if self.is_english(app[0])
        ]

    def free_apps(self):
        """Keep only the rows whose price (column 7) is the string '0'."""
        self.android_data = [app for app in self.android_data if app[7] == '0']

    def get_android_data(self):
        """Return the filtered and cleaned rows for further analysis."""
        return self.android_data

In [133]:
# Build the cleaned dataset straight from the CSV export, show the object
# itself, then preview the first four rows together with the table size.
app_data = AppData('googleplaystore.csv')
print(app_data)
app_data.explore_data(start=0, end=4, rows_and_columns=True)

<__main__.AppData object at 0x000001D39B1E2BD0>
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 8864
Number of columns: 13


In [134]:
def freq_table(dataset, index):
    """
    Build a percentage frequency table for one column of a dataset.

    Parameters:
    dataset (list of lists): Rows to summarise.
    index (int): Position of the column whose values are counted.

    Returns:
    dict: Maps each distinct value found in the column to the percentage
          (0-100) of rows that hold that value. Keys appear in order of
          first occurrence; an empty dataset gives an empty dict.
    """
    # Tally how often each value occurs while tracking how many rows were seen
    counts = {}
    n_rows = 0
    for n_rows, record in enumerate(dataset, start=1):
        cell = record[index]
        counts[cell] = counts.get(cell, 0) + 1

    # Turn the raw tallies into shares of the total row count
    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}


In [135]:
def display_table(dataset, index):
    '''
    Print the percentage frequency table of one column, largest share first.

    Parameters:
    dataset (list): A list of lists representing the dataset.
    index (int): The index of the column to summarise.

    Returns:
    None
    '''
    percentages = freq_table(dataset, index)

    # Pair each percentage with its value, percentage first, so that sorting
    # the tuples orders by percentage (ties fall back to the value itself)
    ranked = [(share, value) for value, share in percentages.items()]
    ranked.sort(reverse=True)

    for share, value in ranked:
        print(value, ':', share)


Here we use the get_android_data() method to store the cleaned data in the android_data variable, so that we can display it.

In [136]:
# Pull the cleaned rows out of the AppData instance for the analysis below
android_data = app_data.get_android_data()

# Alternative view, by column 1:
#display_table(android_data, 1)
print('\n')
# Frequency table of the fourth column from the end of each row
display_table(dataset=android_data, index=-4)



Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.92509025270

Here we compute the average number of installs for each category, so that we can display it.

In [137]:
categories_android = freq_table(android_data, 1)

# Accumulate the install total and the app count of every category in a
# single pass over the dataset, instead of re-scanning every app once per
# category.
installs_by_category = {}
apps_by_category = {}
for app in android_data:
    category_app = app[1]
    # Install counts look like '1,000,000+': drop the commas and the plus sign
    n_installs = float(app[5].replace(',', '').replace('+', ''))
    installs_by_category[category_app] = installs_by_category.get(category_app, 0) + n_installs
    apps_by_category[category_app] = apps_by_category.get(category_app, 0) + 1

# Report in the order of the frequency table's keys
for category in categories_android:
    avg_n_installs = installs_by_category[category] / apps_by_category[category]
    print(category, ':', avg_n_installs)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3695641.8198090694
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_

Displaying the COMMUNICATION apps with 100 million installs or more

In [138]:
# List the COMMUNICATION apps that sit in the three largest install brackets
for app in android_data:
    if app[1] != 'COMMUNICATION':
        continue
    if app[5] in ('1,000,000,000+', '500,000,000+', '100,000,000+'):
        print(app[0], ':', app[5])

WhatsApp Messenger : 1,000,000,000+
imo beta free calls and text : 100,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Messenger – Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Skype - free IM & video calls : 1,000,000,000+
Who : 100,000,000+
GO SMS Pro - Messenger, Free Themes, Emoji : 100,000,000+
LINE: Free Calls & Messages : 500,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Firefox Browser fast & private : 100,000,000+
UC Browser - Fast Download Private & Secure : 500,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Messenger Lite: Free Calls & Messages : 100,000,000+
Kik : 100,000,000+
KakaoTalk: Free Calls & Text : 100,000,000+
Opera Mini - fast web browser : 100,000,000+
Opera Browser: Fast and Secure : 100,000,000+
Telegram : 100,000,000+
Truecaller: Caller ID, SMS spam blocking & Dialer : 100,000,000+
UC Browser Mini -Tiny Fast Private & Secure : 100,000,000+
Viber Mess