Import Data

In [0]:
# imports
import io
import sys
import pandas as pd
import numpy as np
from google.colab import drive
from google.colab import files
from collections import defaultdict
from IPython.display import display

In [0]:
# mount google drive
def mount():
  """Mount Google Drive inside the Colab runtime at '/content/gdrive'."""
  mount_point = '/content/gdrive'
  drive.mount(mount_point)

In [0]:
# upload file
def upload():
  """Run the Colab file-upload helper and hand back its result.

  Returns:
    Whatever ``files.upload()`` returns; ``read_csv`` indexes it by file name
    and wraps the value in ``io.BytesIO``.
  """
  return files.upload()

In [0]:
# read csv file
def read_csv(uploaded, filename='user_launches_appAnon.csv'):
  """Parse one uploaded CSV file into a DataFrame.

  Args:
    uploaded: mapping of file name -> raw file bytes (the value is wrapped in
      ``io.BytesIO`` before parsing).
    filename: key of the file to parse. Defaults to the launch-log file name
      that used to be hard-coded, so existing callers are unaffected.

  Returns:
    pandas.DataFrame parsed from the selected file.

  Raises:
    KeyError: if ``filename`` is not among the uploaded files; the message
      lists the names that are available.
  """
  if filename not in uploaded:
    raise KeyError(
        "%r was not uploaded; available files: %s" % (filename, list(uploaded)))

  return pd.read_csv(io.BytesIO(uploaded[filename]))

Sort Data

In [0]:
# sort data by user id and app id
def sort_data(data):
  """Return a copy of ``data`` ordered by userID, then appID (both ascending).

  The previous version built the sorted frame but returned the unsorted input,
  so callers never saw the ordering. The input frame itself is left untouched
  because ``sort_values`` returns a new frame.

  Args:
    data: DataFrame containing "userID" and "appID" columns.

  Returns:
    A new DataFrame with the same rows in sorted order (original index labels
    are kept).
  """
  user_sorted = data.sort_values(["userID", "appID"], ascending=[True, True])
  return user_sorted

Count and list the unique values in each column

In [0]:
# get number of and actual list of unique values in each column
def unique_apps_users(user_sorted):
  """Collect the distinct app IDs and user IDs present in the frame.

  Two ``value_counts()`` calls whose results were discarded have been removed;
  they did a full pass over each column for nothing. The author's notes next
  to those calls recorded 10,368 for userID and 45,788 for appID (presumably
  the number of distinct values in the original dataset).

  Args:
    user_sorted: DataFrame containing "userID" and "appID" columns.

  Returns:
    (unique_apps, unique_users): the distinct app IDs sorted ascending, and
    the distinct user IDs in order of first appearance.
  """
  unique_apps = np.sort(user_sorted["appID"].unique())
  unique_users = user_sorted["userID"].unique()

  return unique_apps, unique_users

Get Features

In [0]:
# get features of the raw data
def show_sorted(user_sorted):
  """Print a heading, then render ``describe()`` summary statistics of the frame."""
  print("User sorted features: ")
  summary_stats = user_sorted.describe()
  display(summary_stats)

Create dict of users and apps

In [0]:
# iterate through each row and add each occurrence to dictionary, this will return a dict
# that has each user with the list of apps they have used
def get_user_apps(user_sorted):
  """Map each user ID to the list of app IDs found on that user's rows.

  The two columns are walked in lockstep instead of using ``iterrows()``:
  ``iterrows()`` builds a Series per row (slow on large frames) and does not
  preserve dtypes across a row, so integer IDs could come back as floats when
  the frame also holds float columns.

  Args:
    user_sorted: DataFrame containing "userID" and "appID" columns.

  Returns:
    dict of userID -> list of appIDs, in row order. An app appears once per
    row, so repeated rows produce repeated entries.
  """
  user_apps = defaultdict(list)

  for user_id, app_id in zip(user_sorted["userID"], user_sorted["appID"]):
    user_apps[user_id].append(app_id)

  # hand back a plain dict so later lookups of unknown users raise KeyError
  # instead of silently creating empty lists
  user_apps = dict(user_apps)

  print("User apps: ", user_apps)
  print(" ")
  return user_apps

Create dict of apps and users

In [0]:
# iterate through each row and add each occurrence to dictionary, this will return a dict
# that has each app with the list users that have used it
# currently keys not in order
def get_app_users(user_sorted):
  """Map each app ID to the list of user IDs found on that app's rows.

  The two columns are walked in lockstep instead of using ``iterrows()``:
  ``iterrows()`` builds a Series per row (slow on large frames) and does not
  preserve dtypes across a row, so integer IDs could come back as floats when
  the frame also holds float columns.

  Args:
    user_sorted: DataFrame containing "userID" and "appID" columns.

  Returns:
    dict of appID -> list of userIDs. Keys appear in order of first occurrence
    in the frame, not sorted by app ID.
  """
  app_users = defaultdict(list)

  for app_id, user_id in zip(user_sorted["appID"], user_sorted["userID"]):
    app_users[app_id].append(user_id)

  # hand back a plain dict so later lookups of unknown apps raise KeyError
  # instead of silently creating empty lists
  app_users = dict(app_users)

  print("App users: ", app_users)
  print(" ")
  return app_users

Code for mapping dict to dataframe

In [0]:
#create a sparse data frame showing whether each user has used an app before
def create_df(user_apps, unique_apps, unique_users):
  """Build a dense user x app indicator frame (1 = user has a row for that app).

  The frame is created as integer zeros directly. The earlier approach made an
  all-NaN object frame and then called ``fillna(0)``, which depends on pandas
  silently downcasting object columns (deprecated behaviour) and allocates the
  whole table twice.

  Args:
    user_apps: dict of userID -> iterable of appIDs used by that user.
    unique_apps: column labels (app IDs).
    unique_users: row labels (user IDs).

  Returns:
    DataFrame with len(unique_users) rows and len(unique_apps) columns holding
    0/1 integers. Note the table is dense, so its size is the product of the
    two label counts.
  """
  user_apps_DF = pd.DataFrame(0, index=unique_users, columns=unique_apps)

  for user_id, app_ids in user_apps.items():
    # dict.fromkeys drops repeated app IDs (order kept) so each cell is
    # written at most once per user
    for app_id in dict.fromkeys(app_ids):
      user_apps_DF.at[user_id, app_id] = 1

  return user_apps_DF

Saving Dataframe (only need when new columns are added)

In [0]:
# convert dataframe to csv 
def convert_to_csv(user_apps_DF, path='sparse_dataframe.csv'):
  """Write the indicator frame to a CSV file (index included).

  Args:
    user_apps_DF: DataFrame to save.
    path: destination file. Defaults to 'sparse_dataframe.csv', the name that
      used to be hard-coded, so existing callers are unaffected.
  """
  user_apps_DF.to_csv(path)

Get counts of each app, convert to dataframe and plot


In [0]:
# there are 10367 users that do not use app 259
# user_apps_DF[259].value_counts()[0]

In [0]:
# iterate through each app and count how many occurrences there are where a 1 is in its column
def get_app_counts(user_apps_DF, unique_apps):
  """Count, for every app, how many rows of its column equal 1.

  Counting is done with ``eq(1).sum()`` rather than ``value_counts()[1]``:
  the latter raises KeyError for any column that contains no 1 at all, which
  aborted the whole loop. Such columns now simply get a count of 0.

  Args:
    user_apps_DF: user x app indicator frame (one column per app).
    unique_apps: iterable of app IDs; each must be a column of user_apps_DF.

  Returns:
    dict of appID -> int number of rows where that app's column is 1.
  """
  app_counts = {}

  for app_id in unique_apps:
    # one column at a time keeps the temporary boolean mask small
    app_counts[app_id] = int(user_apps_DF[app_id].eq(1).sum())

  print("App counts: ", app_counts)
  print(" ")

  return app_counts

In [0]:
# convert app counts to a dataframe (need to change column names and reset index to make it readable)
def convert_counts_to_df(app_counts):
  """Turn the appID -> count mapping into a frame ordered by count, largest first.

  Args:
    app_counts: dict of appID -> count.

  Returns:
    DataFrame with columns 'appID' and 'count', sorted by 'count' descending
    and re-indexed from 0. The frame is also displayed as a side effect.
  """
  records = list(app_counts.items())
  app_counts_DF = (
      pd.DataFrame(records, columns=['appID', 'count'])
      .sort_values('count', ascending=False)
      .reset_index(drop=True)
  )

  print("App counts")
  display(app_counts_DF)
  print(" ")

  return app_counts_DF

In [0]:
# show app counts in various ways
def show_app_counts(app_counts_DF):
  """Render three views of the app-count frame: boxplot, scatter and bar chart.

  The scatter uses the first 1000 rows and the bar chart the first 50 rows of
  ``app_counts_DF`` as given; the boxplot uses the whole frame.
  """
  print("App counts boxplot: ")
  box_axes = app_counts_DF.boxplot()
  display(box_axes)

  print("App counts scatter: ")
  first_thousand = app_counts_DF.iloc[:1000]
  display(first_thousand.plot.scatter(x='appID', y='count'))

  print("App counts bar graph: ")
  first_fifty = app_counts_DF.iloc[:50]
  display(first_fifty.plot.bar(x='appID', y='count'))


Group apps by count

In [0]:
# group apps together by their counts, showing how many apps share each count value
def group_apps(app_counts_DF):
  """Tally how many apps share each count value.

  Args:
    app_counts_DF: frame with 'appID' and 'count' columns.

  Returns:
    DataFrame with one row per distinct value of the input 'count' column:
    'user_count' holds that value and 'count' holds the number of apps that
    have it.
  """
  column_names = {'count': 'user_count', 'appID': 'count'}
  return (
      app_counts_DF
      .groupby(['count'])
      .count()
      .reset_index()
      .rename(columns=column_names)
  )

In [0]:
# show grouped apps
def show_grouped_apps(grouped_apps):
  """Bar-plot the grouped counts, skipping the first 50 rows of the frame."""
  print("Grouped apps bar graph: ")
  beyond_first_fifty = grouped_apps.iloc[50:]
  bar_axes = beyond_first_fifty.plot.bar(x='user_count', y='count')
  display(bar_axes)

In [0]:
# setup functions to import and save data
def setup():
  """Upload the CSV through the Colab widget and parse it into a DataFrame.

  The earlier version began by calling ``imports()``, a function that is not
  defined in this notebook, so ``setup()`` failed with NameError before doing
  anything. The imports live in the first code cell, which must simply be run
  before this one.

  Returns:
    The DataFrame produced by ``read_csv`` from the uploaded file.
  """
  uploaded = upload()
  data = read_csv(uploaded)

  return data

In [0]:
# main functions to sort and analyse data
def main(data):
  """Run the analysis pipeline end to end and return the app-count frame.

  Steps, in order: sort, collect unique IDs, show summary statistics, build
  the user->apps and app->users mappings, build the indicator frame, count
  users per app, convert the counts to a frame, plot them, and group apps by
  count.

  Args:
    data: raw DataFrame with "userID" and "appID" columns.

  Returns:
    The frame produced by ``convert_counts_to_df``.
  """
  sorted_launches = sort_data(data)
  app_ids, user_ids = unique_apps_users(sorted_launches)
  show_sorted(sorted_launches)

  apps_by_user = get_user_apps(sorted_launches)
  # result is not used below; the call is kept for the output it prints
  get_app_users(sorted_launches)

  usage_matrix = create_df(apps_by_user, app_ids, user_ids)
  #convert_to_csv(usage_matrix)

  counts_by_app = get_app_counts(usage_matrix, app_ids)
  counts_frame = convert_counts_to_df(counts_by_app)
  show_app_counts(counts_frame)

  # grouped result is computed but not displayed or returned here
  group_apps(counts_frame)

  return counts_frame

In [0]:
# call functions: setup() uploads and parses the CSV, main() runs the analysis.
# Both results are kept as notebook-level variables; app_counts_DF is read by
# the distance cell further down.
data = setup()
app_counts_DF = main(data)

In [0]:
# temp location to try new things
# Scratch cell: pairwise Euclidean distances between the rows of app_counts_DF.
from scipy.spatial.distance import pdist, squareform

# NOTE(review): .values passes every column, so appID is treated as a numeric
# feature alongside count -- confirm that is intended.
# pdist returns the condensed (upper-triangle) distance vector.
distances = pdist(app_counts_DF.values, metric='euclidean')
# squareform expands it to a full n x n matrix, where n is the number of rows
# in app_counts_DF; this grows quadratically with the number of apps.
dist_matrix = squareform(distances)

In [0]:
# Raise NumPy's summarisation threshold to the largest int so arrays are
# printed in full rather than abbreviated with "...". This setting is global
# and stays in effect for every later cell.
np.set_printoptions(threshold=sys.maxsize)

In [0]:
# Show the full distance matrix as the cell output.
# NOTE(review): with the print threshold raised in the previous cell, every
# entry is rendered -- consider viewing a slice instead for large inputs.
dist_matrix

Old test code for mapping dict to dataframe

In [0]:
# # test code with a small subset of the data
# unique_apps_temp = unique_apps[:260]
# unique_users_temp = unique_users[:500]

# user_apps_temp = {}
# i = 0

# for key, value in user_apps.items():
#   if i < 10:
#     user_apps_temp[key] = value
#     i +=1 
#   else:
#     break


# print(user_apps_temp)

In [0]:
# user_temp_DF = pd.DataFrame(columns=unique_apps_temp, index=unique_users_temp)
# user_temp_DF.fillna(0, inplace=True)

# for key, value in user_apps_temp.items():
#   for entry in value:
#     user_temp_DF.at[key, entry] = 1
