In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 14, 8

import os
import re

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.misc
import matplotlib


from sklearn.preprocessing import StandardScaler, LabelEncoder
from statsmodels.formula.api import quantreg

## Using Multi-Linear Regression:

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

train['test_value'] = 0
test['test_value'] = 1

train = pd.concat( (train, test) )
train.sort_values(['Patient', 'Weeks'], inplace = True)

train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,test_value
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,0
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,0
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,0
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,0
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,0


## Re-formatting Test and Submission dataframes:

In [4]:
submission['Patient'] = submission['Patient_Week'].map(lambda x: x.split('_')[0])
submission['Weeks'] = submission['Patient_Week'].map(lambda x: x.split('_')[1])

In [5]:
def encode_and_scale(df):
    le = LabelEncoder()
    df['Sex'] = le.fit_transform(df['Sex'])
    df['SmokingStatus'] = le.fit_transform(df['SmokingStatus'])
    sc = StandardScaler()
    train[['Percent', 'Age', 'Sex', 'SmokingStatus']] = sc.fit_transform(train[['Percent', 'Age', 'Sex', 'SmokingStatus']])
    return df

train = encode_and_scale(train)
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,test_value
0,ID00007637202177411956430,-4,2315,-0.98114,1.673414,0.51424,-0.423715,0
1,ID00007637202177411956430,5,2214,-1.109586,1.673414,0.51424,-0.423715,0
2,ID00007637202177411956430,7,2061,-1.304161,1.673414,0.51424,-0.423715,0
3,ID00007637202177411956430,9,2144,-1.198607,1.673414,0.51424,-0.423715,0
4,ID00007637202177411956430,11,2069,-1.293987,1.673414,0.51424,-0.423715,0


In [7]:
# Modelled using statsmodels.formula.api.quant_reg:
modelL = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.25)
model = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.5)
modelH = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.75)
model.summary()



0,1,2,3
Dep. Variable:,FVC,Pseudo R-squared:,0.609
Model:,QuantReg,Bandwidth:,121.5
Method:,Least Squares,Sparsity:,654.2
Date:,"Mon, 07 Sep 2020",No. Observations:,1554.0
Time:,09:52:44,Df Residuals:,1548.0
,,Df Model:,5.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2723.9011,14.105,193.113,0.000,2696.234,2751.568
Weeks,-0.8102,0.359,-2.258,0.024,-1.514,-0.106
Percent,641.7857,8.535,75.191,0.000,625.043,658.528
Age,-115.8960,8.363,-13.858,0.000,-132.300,-99.491
Sex,429.4419,8.955,47.955,0.000,411.876,447.007
SmokingStatus,20.4373,8.925,2.290,0.022,2.932,37.943


In [8]:
train['y_predL'] = modelL.predict(train).values
train['y_pred'] = model.predict(train).values
train['y_predH'] = modelH.predict(train).values

#Taking Quartile Deviation to measure confidence:
# train['predSTD'] = 1/2 * np.abs(train['y_predH'], train['y_predL'])

train['predStd'] = 1/2 * (train['y_predH'] - train['y_predL'])
train.head(10)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,test_value,y_predL,y_pred,y_predH,predStd
0,ID00007637202177411956430,-4,2315,-0.98114,1.673414,0.51424,-0.423715,0,1929.933801,2115.694765,2268.520495,169.293347
1,ID00007637202177411956430,5,2214,-1.109586,1.673414,0.51424,-0.423715,0,1849.393889,2025.968534,2181.002372,165.804242
2,ID00007637202177411956430,7,2061,-1.304161,1.673414,0.51424,-0.423715,0,1732.496086,1899.472326,2047.633026,157.56847
3,ID00007637202177411956430,9,2144,-1.198607,1.673414,0.51424,-0.423715,0,1794.556661,1965.594982,2120.193839,162.818589
4,ID00007637202177411956430,11,2069,-1.293987,1.673414,0.51424,-0.423715,0,1736.806119,1902.760941,2054.886156,159.040019
5,ID00007637202177411956430,17,2101,-1.253292,1.673414,0.51424,-0.423715,0,1758.437133,1924.017495,2083.217559,162.390213
6,ID00007637202177411956430,29,2000,-1.381737,1.673414,0.51424,-0.423715,0,1676.579955,1831.860636,1995.903771,159.661908
7,ID00007637202177411956430,41,2064,-1.300346,1.673414,0.51424,-0.423715,0,1719.841983,1874.373745,2052.566578,166.362298
8,ID00007637202177411956430,57,2057,-1.309248,1.673414,0.51424,-0.423715,0,1707.508485,1855.69712,2047.548267,170.019891
9,ID00009637202177434476278,8,3660,0.384884,0.254864,0.51424,-0.423715,0,2933.744104,3147.071034,3370.709145,218.48252


In [9]:
def compute_metric(trueFVC, predFVC, predStd):
    clipSTD = np.maximum(predStd, 70)
    delta = np.minimum(np.abs(trueFVC, predFVC), 1000)
    metric = -(np.sqrt(2) * delta / clipSTD) - np.log(np.sqrt(2) * clipSTD)  
    return metric

In [10]:
print('Metric:', compute_metric(train['FVC'].values, train['y_pred'].values, train['predStd'].values).mean())

Metric: -13.0008946634282


In [11]:
test.shape

(5, 8)

In [12]:
submission

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12
...,...,...,...,...,...
725,ID00419637202311204720264_133,2000,100,ID00419637202311204720264,133
726,ID00421637202311550012437_133,2000,100,ID00421637202311550012437,133
727,ID00422637202311677017371_133,2000,100,ID00422637202311677017371,133
728,ID00423637202312137826377_133,2000,100,ID00423637202312137826377,133


In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 non-null      object 
 6   SmokingStatus  5 non-null      object 
 7   test_value     5 non-null      int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 448.0+ bytes


In [14]:
submission

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12
...,...,...,...,...,...
725,ID00419637202311204720264_133,2000,100,ID00419637202311204720264,133
726,ID00421637202311550012437_133,2000,100,ID00421637202311550012437,133
727,ID00422637202311677017371_133,2000,100,ID00422637202311677017371,133
728,ID00423637202312137826377_133,2000,100,ID00423637202312137826377,133


In [15]:
submission.describe()

Unnamed: 0,FVC,Confidence
count,730.0,730.0
mean,2000.0,100.0
std,0.0,0.0
min,2000.0,100.0
25%,2000.0,100.0
50%,2000.0,100.0
75%,2000.0,100.0
max,2000.0,100.0


In [22]:
test = encode_and_scale(test)
dt = train.loc[train.test_value == 1, ['Patient','Percent','Age','Sex','SmokingStatus']]
test = pd.merge(submission, dt, on='Patient', how='left')

In [24]:
test

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12,-0.378051,0.822284,0.51424,-0.423715
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12,0.221260,0.113009,0.51424,-0.423715
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12,-0.050275,0.822284,0.51424,-0.423715
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12,0.080439,0.680429,0.51424,-0.423715
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12,-0.295262,0.822284,0.51424,1.468393
...,...,...,...,...,...,...,...,...,...
725,ID00419637202311204720264_133,2000,100,ID00419637202311204720264,133,-0.378051,0.822284,0.51424,-0.423715
726,ID00421637202311550012437_133,2000,100,ID00421637202311550012437,133,0.221260,0.113009,0.51424,-0.423715
727,ID00422637202311677017371_133,2000,100,ID00422637202311677017371,133,-0.050275,0.822284,0.51424,-0.423715
728,ID00423637202312137826377_133,2000,100,ID00423637202312137826377,133,0.080439,0.680429,0.51424,-0.423715


In [63]:
df = test.copy()
df['Weeks'] = df['Weeks'].astype('int')
df.sort_values(['Patient', 'Weeks'], inplace=True)

In [64]:
df

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12,-0.378051,0.822284,0.51424,-0.423715
5,ID00419637202311204720264_-11,2000,100,ID00419637202311204720264,-11,-0.378051,0.822284,0.51424,-0.423715
10,ID00419637202311204720264_-10,2000,100,ID00419637202311204720264,-10,-0.378051,0.822284,0.51424,-0.423715
15,ID00419637202311204720264_-9,2000,100,ID00419637202311204720264,-9,-0.378051,0.822284,0.51424,-0.423715
20,ID00419637202311204720264_-8,2000,100,ID00419637202311204720264,-8,-0.378051,0.822284,0.51424,-0.423715
...,...,...,...,...,...,...,...,...,...
709,ID00426637202313170790466_129,2000,100,ID00426637202313170790466,129,-0.295262,0.822284,0.51424,1.468393
714,ID00426637202313170790466_130,2000,100,ID00426637202313170790466,130,-0.295262,0.822284,0.51424,1.468393
719,ID00426637202313170790466_131,2000,100,ID00426637202313170790466,131,-0.295262,0.822284,0.51424,1.468393
724,ID00426637202313170790466_132,2000,100,ID00426637202313170790466,132,-0.295262,0.822284,0.51424,1.468393


In [67]:
df['ypredL'] = modelL.predict( df ).values
df['FVC']    = model.predict( df ).values
df['ypredH'] = modelH.predict( df ).values
df['Confidence'] = np.abs(df['ypredH'] - df['ypredL']) / 2

In [69]:
df

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks,Percent,Age,Sex,SmokingStatus,ypredL,ypredH
0,ID00419637202311204720264_-12,2607.873178,185.222901,ID00419637202311204720264,-12,-0.378051,0.822284,0.51424,-0.423715,2409.786780,2780.232581
5,ID00419637202311204720264_-11,2607.062968,185.476501,ID00419637202311204720264,-11,-0.378051,0.822284,0.51424,-0.423715,2409.347692,2780.300693
10,ID00419637202311204720264_-10,2606.252759,185.730100,ID00419637202311204720264,-10,-0.378051,0.822284,0.51424,-0.423715,2408.908604,2780.368805
15,ID00419637202311204720264_-9,2605.442549,185.983700,ID00419637202311204720264,-9,-0.378051,0.822284,0.51424,-0.423715,2408.469516,2780.436916
20,ID00419637202311204720264_-8,2604.632340,186.237300,ID00419637202311204720264,-8,-0.378051,0.822284,0.51424,-0.423715,2408.030427,2780.505028
...,...,...,...,...,...,...,...,...,...,...,...
709,ID00426637202313170790466_129,2585.435569,212.613773,ID00426637202313170790466,129,-0.295262,0.822284,0.51424,1.468393,2450.837162,2876.064709
714,ID00426637202313170790466_130,2584.625360,212.867373,ID00426637202313170790466,130,-0.295262,0.822284,0.51424,1.468393,2450.398074,2876.132821
719,ID00426637202313170790466_131,2583.815150,213.120973,ID00426637202313170790466,131,-0.295262,0.822284,0.51424,1.468393,2449.958986,2876.200932
724,ID00426637202313170790466_132,2583.004940,213.374573,ID00426637202313170790466,132,-0.295262,0.822284,0.51424,1.468393,2449.519898,2876.269044


In [70]:
df[['Patient_Week', 'FVC', 'Confidence']].to_csv('Submission.csv')