In [1]:
import pandas as pd
import numpy as np

from plmdata import Backend

In [2]:
# Handle to the plmdata backend; the report data used below is fetched through it.
# NOTE(review): the meaning of the positional `False` argument is not visible
# in this notebook — confirm against plmdata.Backend.
B = Backend(False)

In [18]:
# Fetch the data used for training from the backend.
# NOTE(review): 200 is presumably the number of users to sample, and the
# method name suggests the sample is random (so the counts shown below would
# differ between runs) — confirm both against the Backend API.

N_RANDOM_USERS = 200
df = B.get_normalized_alsfrs_series_random_users(N_RANDOM_USERS)

In [20]:
# Take a look at the raw frame: one row per report, carrying the user id, the
# report date, the score, the user's earliest report date (min_date) and the
# number of days elapsed since that first report.

df

Unnamed: 0,user_id,report_date,score,min_date,days_since_first_report
0,1914,722815,12,722815,0
1,1914,726516,4,722815,3701
2,1914,729212,47,722815,6397
3,1914,733287,14,722815,10472
4,1914,733321,14,722815,10506
...,...,...,...,...,...
4010,1700,737335,42,737051,284
4011,1700,737352,42,737051,301
4012,1700,737385,40,737051,334
4013,1700,737454,38,737051,403


In [24]:
# How many users ever report a score of 20 or less?
# The bound is inclusive: users whose lowest score is exactly 20 are counted.

users_lt20 = df.loc[df["score"] <= 20, "user_id"].unique()

len(users_lt20)

103

In [None]:
# What on Earth is going on with user 1914? Use cell 118 in EDA.ipynb to
# look at that user's data - they have a somewhat normal disease progression
# starting in 2010 or so, but have stray reports in 1980, 1990, and 2000. Using
# onset date instead of first report date would fix this issue, but would also
# throw out a lot of data.

In [57]:
# Which users report a score below 20 within their first five reports?
# The frame is already ordered by days since first report, so the first five
# rows of each user's group are that user's first five reports.

# These users are simply dropped from the dataset. In a real analysis the
# right move would be to examine their series against the onset date and
# pick a handling that minimizes bias; for the purpose of the challenge,
# removing them is the faster option.

first_five_by_user = df.groupby("user_id").head(5)

ff_lt20 = first_five_by_user.loc[
    first_five_by_user["score"] < 20, "user_id"
].unique()

ff_lt20

In [59]:
# Drop every report that belongs to a user listed in ff_lt20.

df = df[np.isin(df["user_id"], ff_lt20, invert=True)]

In [63]:
# Keep only the users that have at least one score of 20 or less, then take
# another look at the frame.

df = df.loc[np.isin(df["user_id"], users_lt20)]

df

Unnamed: 0,user_id,report_date,score,min_date,days_since_first_report
140,43,729975,45,729975,0
141,43,730079,36,729975,104
142,43,730133,29,729975,158
143,43,730259,24,729975,284
144,43,730504,20,729975,529
...,...,...,...,...,...
3847,4345,736809,35,736293,516
3848,4345,736861,33,736293,568
3849,4345,737072,29,736293,779
3850,4345,737128,29,736293,835


### Calculate targets

To calculate targets, we need one of two things for each user: either the number of days after day 0 on which the user scored exactly 20, or a linear interpolation between the user's last report with a score over 20 and their first report with a score under 20.

In [65]:
# For each user, the highest score among their reports strictly below 20.

max_u20 = (
    df.loc[df["score"].lt(20), ["user_id", "score"]]
    .groupby("user_id")["score"]
    .max()
)

max_u20

user_id
43      13
108     18
193     15
217     19
288     16
        ..
6813    17
7150    16
7233     9
7263    19
7344    18
Name: score, Length: 88, dtype: int64

In [67]:
# For each user, the lowest score among their reports strictly above 20.

min_o20 = (
    df.loc[df["score"].gt(20), ["user_id", "score"]]
    .groupby("user_id")["score"]
    .min()
)

min_o20

user_id
43      24
108     24
163     21
193     30
217     21
        ..
6813    24
7150    21
7233    22
7263    24
7344    21
Name: score, Length: 92, dtype: int64

In [71]:
# Users that report a score of exactly 20 at least once, and how many of them
# there are. No target has to be calculated for these users.

users_exactly20 = df.loc[df["score"].eq(20), "user_id"].unique()

len(users_exactly20)

34

In [91]:
# Some users hit a score of exactly 20 more than once. Only the first such
# report per user is taken as the target, since the model should predict when
# each user will fall below a functionality score of 20.

df.loc[df["score"].eq(20)].groupby("user_id")["score"].count()

user_id
43      1
163     1
217     1
304     1
525     2
890     1
985     1
1097    1
1171    1
1361    1
2281    1
2856    5
2917    4
3806    2
3858    2
3893    2
4236    1
4237    1
4417    1
4707    1
4941    1
4952    1
4956    1
5009    3
5081    2
5589    2
5594    2
5636    1
5820    2
6176    1
6272    1
6492    1
6813    1
7344    2
Name: score, dtype: int64

In [None]:
# Re-inspect the filtered frame before the training data is built from it.
df

### Build training data

In [81]:
# Collect each user's first five reports and keep only the columns the model
# needs. Columns are selected by name rather than by position so the result
# does not depend on the frame's column order, and the selection happens
# before the NumPy conversion so unrelated columns cannot affect the dtype.
# Resulting layout: 0 = user_id, 1 = score, 2 = days_since_first_report.

dfnpy = (
    df.groupby("user_id")
    .head(5)[["user_id", "score", "days_since_first_report"]]
    .to_numpy()
)

# u: distinct user ids; c: number of rows kept for each of those users.
u, c = np.unique(dfnpy[:, 0], return_counts=True)

len(u)

92

In [84]:
# Training data for model: one (5, 2) slab per user holding
# (score, days_since_first_report) for each of that user's first five reports.
# The number of users is inferred from the data instead of being hard-coded,
# so the cell keeps working when a different set of users is loaded.

REPORTS_PER_USER = 5

# The reshape below is only meaningful when the rows arrive as consecutive
# runs of exactly five reports per user. Verify that layout and fail loudly
# rather than silently mixing several users into one slab. (The reshape
# itself raises ValueError if the row count is not a multiple of five.)
user_ids_by_slab = dfnpy[:, 0].reshape((-1, REPORTS_PER_USER))
assert (user_ids_by_slab == user_ids_by_slab[:, :1]).all(), (
    "each consecutive run of five rows must belong to a single user"
)
assert len(np.unique(user_ids_by_slab[:, 0])) == user_ids_by_slab.shape[0], (
    "every user must contribute exactly five rows"
)

X = dfnpy[:, 1:].reshape((-1, REPORTS_PER_USER, 2))

In [86]:
# Peek at the feature slabs of the first five users.
X[:5]

array([[[  45,    0],
        [  36,  104],
        [  29,  158],
        [  24,  284],
        [  20,  529]],

       [[  48,    0],
        [  48, 1517],
        [  48, 1673],
        [  48, 1884],
        [  46, 2037]],

       [[  48,    0],
        [  48,  365],
        [  48,  730],
        [  42, 1492],
        [  36, 1857]],

       [[  41,    0],
        [  35, 1150],
        [  25, 2442],
        [  31, 3925],
        [  30, 3981]],

       [[  47,    0],
        [  47,  761],
        [  44, 1796],
        [  37, 2224],
        [  35, 2283]]])