In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer as MICE
from sklearn.cluster import KMeans
import seaborn as sns
import pygeohash as pgh
from matplotlib import pyplot as plt

In [None]:
#helper that standardizes the numerical features of a dataframe
def prepare_num_features(X, cols):
    """Scale the numerical columns of ``X`` with a freshly fitted StandardScaler.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that holds only the numerical columns to scale.
    cols : list
        Column names given to the scaled output, in the column order of ``X``.

    Returns
    -------
    pandas.DataFrame
        The scaled values under the names in ``cols``. The frame gets a new
        default index; the index of ``X`` is not carried over.
    """
    scaled_values = StandardScaler().fit_transform(X.values)
    return pd.DataFrame(data = scaled_values, columns = cols)

In [None]:
#load the cleaned data and discard its first column
#NOTE(review): the first column is presumably the row index written by an earlier to_csv call - confirm
df = pd.read_csv('cleaned_data/full_cleaned_data.csv')
first_column = df.columns[0]
df = df.drop(columns = [first_column])

In [None]:
#order the events by user and, within a user, by start time; row labels are renumbered from 0
#NOTE(review): startTime is only converted to datetime further down, so this presumably sorts text values - confirm the format sorts chronologically
df = df.sort_values(['EthicaID', 'startTime'], ignore_index = True)

In [None]:
#df.head()

# Missing data imputation

In [None]:
df.isna().sum()

In [None]:
#impute missing values
#the imputer is fitted on five columns, but only latitude, longitude and age are taken over;
#battery and sex keep their original values
imputation_cols = ['battery', 'sex', 'age', 'latitude', 'longitude']
imputed_cols = ['latitude', 'longitude', 'age']

trans = MICE(verbose=False)
f_complete = pd.DataFrame(trans.fit_transform(df[imputation_cols]), columns = imputation_cols)[imputed_cols]

#swap the original columns for the imputed ones (they end up at the front); rows are matched by position
df = pd.concat(
    [f_complete.reset_index(drop = True), df.drop(columns = imputed_cols).reset_index(drop = True)],
    axis = 1,
)

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
#df.head()

# Time-related features

In [None]:
#parse startTime and endTime as timezone-aware datetimes in UTC
for time_col in ('startTime', 'endTime'):
    df[time_col] = pd.to_datetime(df[time_col], utc=True)

In [None]:
##add a new column which holds the duration of each app event in seconds (later used to build target)
df['use_duration'] = (df['endTime'] - df['startTime']).dt.total_seconds()

##add new variables with the date, the time of day and the hour of the event start
df['date'] = df['startTime'].dt.date
df['time'] = df['startTime'].dt.time
df['hours'] = df['startTime'].dt.strftime('%H')

#add the weekday with 0 being Monday and 6 being Sunday
df['weekday'] = df['startTime'].dt.dayofweek

#drop the rows where the use duration is negative (rows with a missing duration are kept)
#the row labels are renumbered afterwards because the session ids built below are attached by row position
df = df.loc[~(df['use_duration'] < 0)].reset_index(drop = True)

#construct session id --> one session means no more than 5 minutes between two following app events
#a new session starts with the first row, with every change of user, and whenever the gap between
#the end of the previous event and the start of the current one is not within 5 minutes
#(a missing gap also starts a new session); the rows must be sorted by user and start time
is_new_user = df['EthicaID'] != df['EthicaID'].shift()
gap_minutes = (df['startTime'] - df['endTime'].shift()).dt.total_seconds() / 60
starts_new_session = is_new_user | ~(gap_minutes <= 5)
if len(starts_new_session) > 0:
    starts_new_session.iloc[0] = True

#counting the session starts from the top numbers the sessions 1, 2, 3, ...
lst = starts_new_session.cumsum().tolist()
df_lst = pd.DataFrame(lst)

session_id = pd.DataFrame(lst)

session_id = session_id.rename({0: 'session_id'}, axis = 1)

In [None]:
df_lst = pd.read_csv('cleaned_data/session_id.csv')

In [None]:
df_lst = df_lst[['session_id']]

In [None]:
#attach the session ids to the app events; the two frames are matched by row position,
#so both indexes are reset to keep the ids from landing on the wrong rows after earlier row drops
if len(df_lst) != len(df):
    raise ValueError('session ids and app events differ in length: %d vs %d' % (len(df_lst), len(df)))
df = pd.concat([df_lst.reset_index(drop = True), df.reset_index(drop = True)], axis = 1).rename({0: 'session_id'}, axis = 1)

#build two dataframes to compute session length: first start and last end of every session
session_groups = df.groupby(['EthicaID', 'session_id'])
df_session_start = session_groups['startTime'].min().reset_index()
df_session_end = session_groups['endTime'].max().reset_index()

#merge them
df_session_timerange = pd.merge(df_session_start, df_session_end, on = ['EthicaID', 'session_id'])
df_session_timerange = df_session_timerange.rename(columns={'startTime': 'startTime_session', 'endTime': 'endTime_session'})

#compute the duration of every session in seconds
df_session_timerange['session_duration'] = (
    df_session_timerange['endTime_session'] - df_session_timerange['startTime_session']
).dt.total_seconds()

#display(df_session_timerange.head())

#merge session length to appevent dataframe
df = pd.merge(df, df_session_timerange, on = ['EthicaID', 'session_id'])

In [None]:
#check results to detect irregularities
#df[['session_id', 'EthicaID', 'startTime', 'session_duration']].groupby(['EthicaID', 'session_id']).min().sort_values('session_duration')

In [None]:
#compute time of ongoing session: seconds between the start of the session and the start of the event
#total_seconds() yields plain floats on every pandas version, whereas astype('timedelta64[s]')
#keeps a timedelta dtype on pandas >= 2, which the numerical scaling further down cannot handle
df['duration_ongoing_session'] = (df['startTime'] - df['startTime_session']).dt.total_seconds()

In [None]:
df.columns

In [None]:
#df[['EthicaID', 'session_id', 'startTime', 'endTime', 'session_duration', 'duration_ongoing_session']]

# Location-related features

In [None]:
#transform latitude and longitude to a geohash with 5 characters
#iterating over the two columns avoids building a Series for every row as df.apply(axis=1) does
df['geohash'] = [
    pgh.encode(lat, lon, precision=5)
    for lat, lon in zip(df['latitude'], df['longitude'])
]

In [None]:
#build location clusters: record the k-means objective (inertia) for 1 to 10 clusters
coordinates = df[['latitude','longitude']]
objective_function = []
for i in range(1, 11):
    #fixed seed: k-means starts from random centres, so unseeded runs produce different curves
    clustering = KMeans(n_clusters=i, init='k-means++', random_state=42)
    clustering.fit(coordinates)
    objective_function.append(clustering.inertia_)

In [None]:
#elbow plot: objective value against the number of clusters that was tried
cluster_counts = range(1, 11)
plt.plot(cluster_counts, objective_function)
plt.xlabel('Number of Clusters K')
plt.title('The Elbow Method')
plt.ylabel('objective_function')
#plt.show()

In [None]:
#assign every event to one of 8 location clusters
#the seed keeps the cluster numbering stable between runs; fit_predict already returns the
#labels of the fitted rows, so no second predict call on the same data is needed
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(df[['latitude','longitude']])
df['location_cluster'] = clusters

In [None]:
#scatter of the event coordinates, coloured by location cluster
df.plot(kind='scatter', x='latitude', y='longitude', c='location_cluster', colormap='cool_r')

In [None]:
#df.head()

# App-history related features

In [None]:
#compute the gaps between consecutive app events per user and add the category of the next opened app
#columns added to every per-user frame:
#  time_to_next_app      seconds between the end of the previous event and the start of this one
#                        (0 for the first event of a user)
#  t+1 category          category of the user's next event (missing for the last event)
#  time_to_next_app_t+1  time_to_next_app of the user's next event, i.e. the gap that follows
#                        this event (missing for the last event)
def _rounded_mean_events(events, by):
    """Return the rounded mean number of events per group of `by`."""
    return round(np.average(events.groupby(by)['EthicaID'].count().values))

user_length = {}
average_per_day = {}
average_per_hour = {}
average_per_session = {}
df_lst = []

#groupby visits the users in sorted order and keeps the row order inside each user,
#so the events have to be sorted by start time within a user already
for user, user_events in df.groupby('EthicaID'):
    df_single_user = user_events.reset_index(drop = True)

    previous_end = pd.to_datetime(df_single_user['endTime']).shift()
    gap_seconds = (pd.to_datetime(df_single_user['startTime']) - previous_end).dt.total_seconds()
    gap_seconds.iloc[0] = 0  #the first event of a user has no predecessor
    df_single_user['time_to_next_app'] = gap_seconds

    #shift keeps every value on its row; slicing, dropping missing values and re-concatenating
    #would move all later values up by one row whenever a value is missing
    df_single_user['t+1 category'] = df_single_user['category'].shift(-1)
    df_single_user['time_to_next_app_t+1'] = df_single_user['time_to_next_app'].shift(-1)

    #display(df_single_user)

    df_lst.append(df_single_user)

    user_length[user] = len(df_single_user)
    average_per_day[user] = _rounded_mean_events(df_single_user, 'date')
    average_per_hour[user] = _rounded_mean_events(df_single_user, ['date', 'hours'])
    average_per_session[user] = _rounded_mean_events(df_single_user, 'session_id')

In [None]:
sum(average_per_day.values()) / len(average_per_day)

In [None]:
sum(average_per_session.values()) / len(average_per_session)

In [None]:
sum(average_per_hour.values()) / len(average_per_hour)

In [None]:
#stack the per-user frames below each other (the row labels of the per-user frames are kept as they are)
df_full = pd.concat(df_lst, axis = 'index')
#df_full['t+1 category'] = df_full['t+1 category'].fillna(df_full['category'].value_counts().reset_index()['index'][0])

In [None]:
#bin age into four equal-sized groups
#NOTE(review): the labels are hard-coded and look like quartile bounds of one particular run - confirm they still match the data
age_labels = ['17.749, 19.25', '19.25, 20.083', '20.083, 21.75', '21.75, 41.333']
df_full['age_category'] = pd.qcut(df_full['age'], q=len(age_labels), labels=age_labels)

In [None]:
#df_full[['category', 't+1 category', 'time_to_next_app', 'time_to_next_app_t+1', 'startTime', 'endTime']]

# Target

In [None]:
#33% and 67% quantiles of the use duration within every category (inputs for the target variable)
duration_by_category = df_full.groupby('category')['use_duration']
duration_quantiles_33 = duration_by_category.quantile(0.33).rename('33%').reset_index()
duration_quantiles_67 = duration_by_category.quantile(0.67).rename('67%').reset_index()

In [None]:
#one row per category holding both quantiles
duration_quantiles = pd.merge(duration_quantiles_33, duration_quantiles_67, on = 'category')

In [None]:
duration_quantiles.head()

In [None]:
#search for the row labels of df that belong to the short / long events of every category
#short: use duration at or below the 33% quantile of its category
#long:  use duration at or above the 67% quantile of its category
#NOTE(review): the quantiles come from df_full while the rows are looked up in df - this assumes
#both hold the same events in the same row order; confirm
index_short = []
index_long = []
category_bounds = zip(duration_quantiles['category'], duration_quantiles['33%'], duration_quantiles['67%'])
for name, lower, upper in category_bounds:
    in_category = df['category'] == name
    is_short = in_category & (df['use_duration'] <= lower)
    #rows already labelled short are excluded, so a row cannot get both labels when the two
    #quantiles of a category coincide (duplicate labels would break the later join)
    is_long = in_category & (df['use_duration'] >= upper) & ~is_short
    index_short.extend(df.index[is_short].tolist())
    index_long.extend(df.index[is_long].tolist())

In [None]:
#one frame per label (column 0 holds the collected row labels), then both stacked,
#sorted by row label and indexed by it
df_short = pd.DataFrame(index_short).assign(target = 'short')
df_long = pd.DataFrame(index_long).assign(target = 'long')

df_target = pd.concat([df_short, df_long], axis = 0).sort_values([0]).set_index([0])

In [None]:
df_target.head()

In [None]:
df_full[['use_duration', 'category','target']].head()

In [None]:
#join it to the original dataframe and impute the missing category
df_full = pd.concat([df_full.reset_index(drop = True), df_target], axis = 1)
df_full['target'] = df_full['target'].fillna('medium')

In [None]:
#df_full.head()

In [None]:
#store the unscaled features for the exploratory analysis (the row index is written as first column)
df_full.to_csv('cleaned_data/data_for_eda.csv', index = True)

# Scaling of numerical features

In [None]:
#split the frame into the numerical columns, which get scaled, and all remaining columns
num_cols = ['use_duration', 'duration_ongoing_session', 'time_to_next_app',  'time_to_next_app_t+1']
df_num = df_full[num_cols]
df_num_sca = prepare_num_features(df_num, num_cols)
df_cat = df_full.drop(columns = num_cols)

In [None]:
#put the remaining columns and the scaled columns side by side, matched by row position
frames = [part.reset_index(drop = True) for part in (df_cat, df_num_sca)]
df_full = pd.concat(frames, axis = 'columns')

In [None]:
df_full.columns

In [None]:
#df_full.head()

In [None]:
#keep only the columns needed downstream
reduced_cols = [
    'EthicaID',
    'use_duration', 'duration_ongoing_session', 'time_to_next_app',  'time_to_next_app_t+1',
    'age_category', 'sex',
    'notification', 'battery', 'category', 't+1 category',
    'hours', 'weekday', 'geohash', 'location_cluster', 'target',
]
df_reduced = df_full.loc[:, reduced_cols]

In [None]:
#df_reduced.head()

In [None]:
#store the reduced feature set (the row index is written as first column)
df_reduced.to_csv('cleaned_data/data_with_features.csv', index = True)

In [None]:
#store the session ids (the row index is written as first column); an earlier cell reads this file
session_id.to_csv('cleaned_data/session_id.csv', index = True)