# TEMPORARY VARIABLE ASSIGNMENTS FOR DEVELOPMENT

In [14]:
#FOR TESTING
dataset = goog_data
check_var_col = 0
print('You\'re still assigning testing variables in GTJ_DatasetCleaningHelpers')

## Slice datasets based on indices

In [32]:
def remove_entries_by_indices(dataset, remove_indices):
    '''
    #Function that will take a dataset and return all entries except for those with indices listed in remove_indices
    #RETURNS:
    ##dataset - Same dataset that was an input, but WITHOUT the entries associeted with remove_indices indices
    #INPUT VARS:
    ##dataset - open()ed and reader()ed csv file (or other source providing a list of lists of rows)
    ##remove_indices - list (or anything convertable to list type) of indices to be removed from dataset
    '''
    #Check if any of the remove_indices are negative
    #if so, throw an exception
    #TODO: handle this by conversion but I don't want to right now
    if any([x < 0 for x in remove_indices]):
        raise Exception("Don't give me negative indices to remove. Rude. Maybe I'll allow this some day, but not this day.")
        
    #Check if remove_indices is a list. 
    #TODO: DONE Could combine these as list(a_list) doesn't change 'a_list'
    # if type(remove_indices) is list:
    #     #If so, make it a set
    #     remove_indices = set(remove_indices)
    #Or if it's not a list and not already a set
    #elif type(remove_indices) is not set:
    if type(remove_indices) is not set:
        remove_indices = set(list(remove_indices))
        #raise Exception("remove_indices needs to be a list or a set")
    
    #Drop entries via list comprehension
    dataset = [i for j, i in enumerate(dataset) if j not in remove_indices]
    return dataset

In [40]:
def return_entries_by_indices(dataset, return_indices):
    '''
    #Function that will take a dataset and return all entries with indices listed in return_indices
    #RETURNS:
    ##dataset - Same dataset that was an input, but with ONLY the entries associeted with return_indices indices
    #INPUT VARS:
    ##dataset - open()ed and reader()ed csv file (or other source providing a list of lists of rows)
    ##remove_indices - list (or anything convertable to list type) of indices to be returned from dataset
    '''
    #Check if any of the return_indices are negative
    #if so, throw an exception
    #TODO: handle this by conversion but I don't want to right now
    if any([x < 0 for x in return_indices]):
        raise Exception("Don't give me negative indices to return. Rude. Maybe I'll allow this some day, but not this day.")
        
    #Check if return_indices is a list. 
    #TODO: DONE Could combine these as list(a_list) doesn't change 'a_list'
    #if type(return_indices) is list:
        ##If so, make it a set
        #return_indices = set(return_indices)
    #Or if it's not a list and not already a set
    #eliftype(return_indices) is not set:
    if type(return_indices) is not set:
        return_indices = set(list(return_indices))
        #raise Exception("remove_indices needs to be a list or a set")
    
    #Keep entries via list comprehension
    dataset = [i for j, i in enumerate(dataset) if j in return_indices]
    return dataset

## Print out a slice of a dataset for inspection

In [15]:

def explore_data(dataset, start, end, rows_and_columns=False):
    '''
    #Define a useful function for exploring (displaying) a slice of a given dataset
    #RETURNS: 
    ##Nothing
    #INPUT VARS:
    ##dataset - open()ed and reader()ed csv file (or other source providing a list of lists of rows)
    ##start - First row to include in the slice (0-indexing)
    ##end - Last row to include in the slice (1-indexing because python I guess)
    ##rows_and_columns - True prints out numbers of rows/columns, False (default) doesn't
    '''
    dataset_slice = list(dataset[start:end])
    for row in dataset_slice:
        print(row)
        print('\n') # Add an empty line after each row
    
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns: ', len(dataset[0]))

## Various functions for finding and handling duplicate entries in a dataset

In [16]:

def split_duplicate_entries(dataset, check_var_col):
    '''
    #Define another useful function for detecting duplicate entries in a dataset
    #COMPARES app names to determine if entries are duplicates. Marks second and later entries with the same name as one that has already occurred as duplicates.
    #RETURNS:
    ##unique_entries - [LIST] of unique app entries' rows [CAN BE USED AS A DATASET WITH NO DUPLICATES]
    ##duplicate_entries - [LIST] of the duplicate app entries' rows
    ##duplicate_names_and_indices - [LIST] of the app names and indices of duplicate app entries in the UNIQUE dataset
    #INPUT VARS:
    ##dataset - open()ed and reader()ed csv file (or other source providing a list of lists of rows)
    ##check_var_col - Which column in the dataset corresponds to app name (or other desired duplication check variable)
    ###NOTE: 0 for goog_data, 1 for app_data
    '''
    unique_entries = [] #first and potentially only copy of an app entry
    unique_entry_indices = {} #Tracks indices of unique entries in the original AND unique datasets (in that order)
    unique_entry_names = [] #Just tracks entry names for comparison
    duplicate_entries = [] #second and further copies of app entries
    duplicate_names_and_indices = [] #Just the app name and the index of the corresponding index in unique_entries
    #duplicate_corresponding_row = [] #Copy of the row that ended up in unique_entries

    for index,app in enumerate(dataset):
        app_name = app[check_var_col]
        if app_name not in unique_entry_names:
            unique_entry_names.append(app_name)
            unique_entries.append(app)
            unique_entry_indices[app_name] = [index, len(unique_entries)-1]
        else:
            duplicate_entries.append(app)
            #duplicate_corresponding_row.append(dataset[unique_entry_indices[app_name]])
            duplicate_names_and_indices.append([app_name, unique_entry_indices[app_name]])#dataset[unique_entry_indices[app_name][0]]])
    
    return unique_entries, duplicate_entries, duplicate_names_and_indices

In [17]:

def smart_split_duplicate_entries(dataset, check_var_col, ignore_cols=[]):
    '''
    #Compares duplicate entries (matched by app name) to corresponding unique ones and returns only entries that are not whole-row matches 
    #(AKA names might be duplicates but some other element in the row doesn't line up)
    '''
    unique_entries, duplicate_entries, duplicate_names_and_indices = split_duplicate_entries(dataset, check_var_col)
    interesting_duplicates = [] #list populated with entries that were marked duplicte by name but do NOT perfectly match the corresponding unique row
    interesting_duplicate_names_and_indices = []
    for index,app in enumerate(duplicate_entries[0:5]):
        if not app == unique_entries[duplicate_names_and_indices[index][1][1]]:
            interesting_duplicates.append(app)
            #app_name = app[check_var_col]
            interesting_duplicate_names_and_indices.append(duplicate_names_and_indices[index])
            
    return unique_entries, interesting_duplicates, interesting_duplicate_names_and_indices

## This is some testing stuff/WIP below here

In [18]:
#def duplicate_entry_differences(dataset, check_var_col):
#     '''
#     #Reviews duplicate pings in the dataset, RETURNS EITHER LOCATIONS OR CONTENTS OF MISMATCHED LIST ELEMENTS


#     '''
    
unique_entries, interesting_duplicates, interesting_duplicate_names_and_indices = smart_split_duplicate_entries(dataset, check_var_col)
        

In [19]:
subU = [unique_entries[i] for i in interesting_duplicate_names_and_indices[0:5][1][1]]

IndexError: list index out of range

In [10]:
interesting_duplicate_names_and_indices[0:5]

[['Mannequin Challenge', [2949, 2949]], ['VR Roller Coaster', [4443, 4443]]]

In [62]:
explore_data(goog_data, 0, 5)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']




In [13]:
len(goog_data)

10842

In [12]:
len(dataset)

7198

In [66]:
len(goog_data_nodup)

9660

In [11]:
len(unique_entries)

7196