In [None]:
import glob
import os
import pandas as pd
import numpy as np 
from fancyimpute import IterativeImputer as MICE

# Data Loading

In [None]:
# Load every per-user app-event export into a single dataframe.
path_appevents = 'Data for thesis students/data_myphone/appevents'
# Sort the matches: glob returns files in arbitrary, OS-dependent order, and a
# fixed order keeps the concatenated rows reproducible between runs.
csv_files_appevents = sorted(glob.glob(os.path.join(path_appevents, "*.csv")))
if not csv_files_appevents:
    # Fail with a clear message instead of an IndexError on `columns[0]` below.
    raise FileNotFoundError("No app-event CSV files found in '" + path_appevents + "'")

# Read all files first and concatenate once; growing a dataframe inside the
# loop re-copies the accumulated rows on every iteration.
df_appevents = pd.concat([pd.read_csv(f, sep = ";") for f in csv_files_appevents])

# Drop the first column by position (presumably a saved row index - TODO confirm).
df_appevents = df_appevents.drop(df_appevents.columns[0], axis = 1)

In [None]:
# Read the Play Store lookup table (app names and categories) and discard its
# first column, which is selected by position rather than by name.
df_playstore = (
    pd.read_csv('cleaned_data/playstore.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [None]:
# Read the demographics table and the key table used for merging.
# In both frames the first column is discarded by position.
df_demographics = (
    pd.read_csv('cleaned_data/demographics.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df_key = (
    pd.read_csv('Data for thesis students/anonymized_key.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

Alternative source for the app categories:
df_categories = pd.read_csv('cleaned_data/categories_short.csv')
df_categories = df_categories.drop(df_categories.columns[0], axis = 1)

In [None]:
# Show the column names of the app-event log.
df_appevents.columns

In [None]:
# Show the column names of the Play Store lookup table.
df_playstore.columns

In [None]:
#df_appevents.head()

# Merging with categories and demographics

In [None]:
# Give both tables the same join-key name ('app_id') before they are merged,
# and lower-case the category column of the Play Store table.
df_appevents = df_appevents.rename(columns={'application': 'app_id'})
df_playstore = df_playstore.rename(columns={'App Id': 'app_id', 'Category': 'category'})

In [None]:
# Attach the corresponding app name and category to every logged event.
# This is an inner join (pandas' default), so events whose app_id has no
# Play Store entry are not carried over.
df = df_appevents.merge(df_playstore, how='inner', on='app_id')

In [None]:
# Show the column names of the merged app-event / Play Store frame.
df.columns

In [None]:
# Link the events to the demographics via the key table:
#   id -> MobileDNA -> (key table) Ethica -> EthicaID -> demographics.
# Both merges are inner joins, and Age/Sex are lower-cased at the end.
key_columns = df_key[['MobileDNA', 'Ethica']]
df = (
    df.rename(columns={'id': 'MobileDNA'})
    .merge(key_columns, on='MobileDNA')
    .rename(columns={'Ethica': 'EthicaID'})
    .merge(df_demographics, on='EthicaID')
    .rename(columns={'Age': 'age', 'Sex': 'sex'})
)

In [None]:
#df.head()

In [None]:
# Show the column names after merging in the key table and the demographics.
df.columns

# Data Cleaning

In [None]:
# Remove exact duplicate rows and report how many were discarded.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(before - after, 'duplicate records were dropped')

# Count the missing values per column (age has gaps; it will be imputed later).
df.isnull().sum()

# TODO: the check of the column datatypes announced here has no code yet.

In [None]:
# A coordinate of exactly 0 is treated as not useful, so mark it as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
df['latitude'] = df['latitude'].replace(0, np.nan)
df['longitude'] = df['longitude'].replace(0, np.nan)
# Re-count the missing values per column now that zero coordinates are NaN.
df.isna().sum()

In [None]:
# Drop users with fewer than 5000 recorded events.
# Events per user = number of rows with a non-null 'App Name' for that EthicaID.
count = df.groupby('EthicaID')['App Name'].count().reset_index()
users = count.loc[count['App Name'] < 5000, 'EthicaID']
# Filter with one exact-match membership test. A per-user `str.contains` loop
# matches substrings/regex patterns, so an id contained in another user's id
# (or holding regex metacharacters) could remove the wrong rows; it also
# requires string ids and rescans the whole frame once per user.
df = df[~df['EthicaID'].isin(users)]
len(df['EthicaID'].unique()) # users left (186 according to the original note)

In [None]:
# Persist the cleaned dataset. The row index is written as well (pandas'
# default, made explicit here).
df.to_csv(path_or_buf='cleaned_data/full_cleaned_data.csv', index=True)