In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 14, 8

import os
import re
import tqdm.notebook as tqdm

from IPython.display import display_html
import seaborn as sns
import matplotlib.pyplot as plt
from skimage import exposure
# import cv2
from pandas.plotting import scatter_matrix
from PIL import Image
from IPython.display import Image as show_gif
import scipy.misc
import matplotlib
from kneed import KneeLocator

from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import quantreg

# import pydicom
# from pydicom.filereader import read_dicomdir

## Using Multi-Linear Regression:

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

## Re-formatting Test and Submission dataframes:

In [3]:
submission['Patient'] = submission['Patient_Week'].map(lambda x: x.split('_')[0])
submission['Weeks'] = submission['Patient_Week'].map(lambda x: x.split('_')[1])

In [4]:
train = pd.concat((train, test))
train.sort_values(['Patient', 'Weeks'], inplace=True)
train

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker
...,...,...,...,...,...,...,...
1544,ID00426637202313170790466,13,2712,66.594637,73,Male,Never smoked
1545,ID00426637202313170790466,19,2978,73.126412,73,Male,Never smoked
1546,ID00426637202313170790466,31,2908,71.407524,73,Male,Never smoked
1547,ID00426637202313170790466,43,2975,73.052745,73,Male,Never smoked


In [5]:
train['Sex'] = pd.factorize(train['Sex'], sort=True)[0]
train['SmokingStatus'] = pd.factorize(train['SmokingStatus'], sort=True)[0]

In [6]:
# Standard Scaling:
sc = StandardScaler()
train[['Percent', 'Age', 'Sex', 'SmokingStatus']] = sc.fit_transform(train[['Percent',
                                                                                     'Age', 'Sex', 'SmokingStatus']])
train

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,-0.981140,1.673414,0.51424,-0.423715
1,ID00007637202177411956430,5,2214,-1.109586,1.673414,0.51424,-0.423715
2,ID00007637202177411956430,7,2061,-1.304161,1.673414,0.51424,-0.423715
3,ID00007637202177411956430,9,2144,-1.198607,1.673414,0.51424,-0.423715
4,ID00007637202177411956430,11,2069,-1.293987,1.673414,0.51424,-0.423715
...,...,...,...,...,...,...,...
1544,ID00426637202313170790466,13,2712,-0.559597,0.822284,0.51424,1.468393
1545,ID00426637202313170790466,19,2978,-0.229489,0.822284,0.51424,1.468393
1546,ID00426637202313170790466,31,2908,-0.316360,0.822284,0.51424,1.468393
1547,ID00426637202313170790466,43,2975,-0.233212,0.822284,0.51424,1.468393


In [7]:
# Quant-reg Model:
modelL = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.25)
model = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.5)
modelH = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus',
                           train).fit(q = 0.75)



In [8]:
model.summary()

0,1,2,3
Dep. Variable:,FVC,Pseudo R-squared:,0.609
Model:,QuantReg,Bandwidth:,121.5
Method:,Least Squares,Sparsity:,654.2
Date:,"Sat, 05 Sep 2020",No. Observations:,1554.0
Time:,13:22:17,Df Residuals:,1548.0
,,Df Model:,5.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2723.9011,14.105,193.113,0.000,2696.234,2751.568
Weeks,-0.8102,0.359,-2.258,0.024,-1.514,-0.106
Percent,641.7857,8.535,75.191,0.000,625.043,658.528
Age,-115.8960,8.363,-13.858,0.000,-132.300,-99.491
Sex,429.4419,8.955,47.955,0.000,411.876,447.007
SmokingStatus,20.4373,8.925,2.290,0.022,2.932,37.943


In [9]:
train['y_predL'] = modelL.predict(train).values
train['y_pred'] = model.predict(train).values
train['y_predH'] = modelH.predict(train).values

#Taking Quartile Deviation to measure confidence:
# train['predSTD'] = 1/2 * np.abs(train['y_predH'], train['y_predL'])

train['predStd'] = 1/2 * (train['y_predH'] - train['y_predL'])
train.head(10)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,y_predL,y_pred,y_predH,predStd
0,ID00007637202177411956430,-4,2315,-0.98114,1.673414,0.51424,-0.423715,1929.933801,2115.694765,2268.520495,169.293347
1,ID00007637202177411956430,5,2214,-1.109586,1.673414,0.51424,-0.423715,1849.393889,2025.968534,2181.002372,165.804242
2,ID00007637202177411956430,7,2061,-1.304161,1.673414,0.51424,-0.423715,1732.496086,1899.472326,2047.633026,157.56847
3,ID00007637202177411956430,9,2144,-1.198607,1.673414,0.51424,-0.423715,1794.556661,1965.594982,2120.193839,162.818589
4,ID00007637202177411956430,11,2069,-1.293987,1.673414,0.51424,-0.423715,1736.806119,1902.760941,2054.886156,159.040019
5,ID00007637202177411956430,17,2101,-1.253292,1.673414,0.51424,-0.423715,1758.437133,1924.017495,2083.217559,162.390213
6,ID00007637202177411956430,29,2000,-1.381737,1.673414,0.51424,-0.423715,1676.579955,1831.860636,1995.903771,159.661908
7,ID00007637202177411956430,41,2064,-1.300346,1.673414,0.51424,-0.423715,1719.841983,1874.373745,2052.566578,166.362298
8,ID00007637202177411956430,57,2057,-1.309248,1.673414,0.51424,-0.423715,1707.508485,1855.69712,2047.548267,170.019891
9,ID00009637202177434476278,8,3660,0.384884,0.254864,0.51424,-0.423715,2933.744104,3147.071034,3370.709145,218.48252


In [10]:
def compute_metric(trueFVC, predFVC, predStd):
    clipSTD = np.maximum(predStd, 70)
    delta = np.minimum(np.abs(trueFVC, predFVC), 1000)
    metric = -(np.sqrt(2) * delta / clipSTD) - np.log(np.sqrt(2) * clipSTD)  
    return metric

In [11]:
print('Metric:', compute_metric(train['FVC'].values, train['y_pred'].values, train['predStd'].values).mean())

Metric: -13.0008946634282


In [12]:
test

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [13]:
test['Sex'] = pd.factorize(test['Sex'])[0]
test['SmokingStatus'] = pd.factorize(test['SmokingStatus'])[0]
# Standard Scaling:
sc = StandardScaler()
test[['Percent', 'Age', 'Sex', 'SmokingStatus']] = sc.fit_transform(test[['Percent',
                                                                                     'Age', 'Sex', 'SmokingStatus']])
test = pd.merge(submission, test[['Patient', 'Percent', 'Age', 'Sex',
                                 'SmokingStatus']], on='Patient')
test.sort_values(['Patient', 'Weeks'], inplace=True)

In [14]:
test

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks,Percent,Age,Sex,SmokingStatus
11,ID00419637202311204720264_-1,2000,100,ID00419637202311204720264,-1,-1.306936,0.618853,0.0,-0.5
2,ID00419637202311204720264_-10,2000,100,ID00419637202311204720264,-10,-1.306936,0.618853,0.0,-0.5
1,ID00419637202311204720264_-11,2000,100,ID00419637202311204720264,-11,-1.306936,0.618853,0.0,-0.5
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12,-1.306936,0.618853,0.0,-0.5
10,ID00419637202311204720264_-2,2000,100,ID00419637202311204720264,-2,-1.306936,0.618853,0.0,-0.5
...,...,...,...,...,...,...,...,...,...
691,ID00426637202313170790466_95,2000,100,ID00426637202313170790466,95,-0.938503,0.618853,0.0,2.0
692,ID00426637202313170790466_96,2000,100,ID00426637202313170790466,96,-0.938503,0.618853,0.0,2.0
693,ID00426637202313170790466_97,2000,100,ID00426637202313170790466,97,-0.938503,0.618853,0.0,2.0
694,ID00426637202313170790466_98,2000,100,ID00426637202313170790466,98,-0.938503,0.618853,0.0,2.0


In [15]:
test['y_predL'] = modelL.predict(test).values 
test['FVC'] = model.predict(test).values 
test['y_predH'] = modelH.predict(test).values 

#Quartile Deviation:
test['Confidence'] = 1/2 * (test['y_predH'] - test['y_predL'])

AttributeError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
'DataFrame' object has no attribute 'dtype'