# Import libraries

In [1]:
import pandas as pd
import numpy as np

# Read app_events and events

In [2]:
df_appevent = pd.read_csv('../data/app_events.csv')
df_appevent.head(3)

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0


In [3]:
df_event = pd.read_csv('../data/events.csv')
df_event.head(3)

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7


# Drop columns

In [4]:
df_appevent.drop(columns='is_installed', inplace=True)
df_appevent.head(3)

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,1
1,2,-5720078949152207372,0
2,2,-1633887856876571208,0


In [5]:
df_event.drop(columns=['timestamp', 'longitude', 'latitude'], inplace=True)
df_event.head(3)

Unnamed: 0,event_id,device_id
0,1,29182687948017175
1,2,-6401643145415154744
2,3,-4833982096941402721


# Merge dataframe

In [6]:
df_merge = df_event.merge(df_appevent, how='left', on='event_id')
df_merge.drop(columns='event_id', inplace=True)
df_merge.head()

Unnamed: 0,device_id,app_id,is_active
0,29182687948017175,,
1,-6401643145415154744,5.927333e+18,1.0
2,-6401643145415154744,-5.720079e+18,0.0
3,-6401643145415154744,-1.633888e+18,0.0
4,-6401643145415154744,-6.531843e+17,1.0


In [7]:
fact = pd.factorize(df_merge.app_id)
df_merge['app_id'] = fact[0]
app_definition = fact[1]

In [8]:
df_merge = df_merge[df_merge.is_active == 1.0]
df_merge = df_merge.drop(columns='is_active')
df_merge = df_merge.set_index('device_id')
df_merge['app_id'] = df_merge['app_id'].astype(str)
df_merge.head()

Unnamed: 0_level_0,app_id
device_id,Unnamed: 1_level_1
-6401643145415154744,0
-6401643145415154744,3
-6401643145415154744,4
-6401643145415154744,5
-6401643145415154744,9


# One-hot encoding and groupby device_id

In [16]:
df_merge = pd.get_dummies(df_merge, sparse=True).groupby('device_id').sum()
df_merge = df_merge.applymap(lambda x:1 if x!=0 else 0)
df_merge.head()

Unnamed: 0_level_0,app_id_0,app_id_1,app_id_10,app_id_100,app_id_1000,app_id_10000,app_id_10002,app_id_10003,app_id_10004,app_id_1001,...,app_id_9981,app_id_9982,app_id_9986,app_id_9989,app_id_999,app_id_9990,app_id_9993,app_id_9994,app_id_9996,app_id_9997
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222956879900151005,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9222661944218806987,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9222399302879214035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9221825537663503111,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9221767098072603291,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_merge.to_csv('../matrix_for_model/app_matrix.csv')