In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import os
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Shared label encoder; the feature-encoding cell refits it for each
# categorical column in turn, so it only ever holds the most recent fit.
le = LabelEncoder()
# Directory from which train.csv, test.csv and sample_submission.csv are read.
ROOT = Path("../input/osic-pulmonary-fibrosis-progression")

In [2]:
# Load the three competition tables from ROOT in one pass:
#   train - measurements used to build the training examples
#   test  - measurements used to build the prediction rows
#   sub   - sample submission; its Patient_Week keys list the rows to predict
train, test, sub = (
    pd.read_csv(ROOT / csvName)
    for csvName in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Build the training table.
#
# For every patient, the first row found in `train` is treated as the baseline
# and each later row of that patient becomes one training example:
#     baseline columns + [week of the later row, FVC of the later row]
# Patients with a single row contribute no examples.

trainData = []
for p in train['Patient'].unique():
    patientData = train[train['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)
    # Every row after the baseline is a follow-up visit. Week and FVC are read
    # from the SAME row so each target FVC lines up with its own target week.
    # Column position 2 is the one labelled 'first_FVC' in the header below.
    followUps = patientData.iloc[1:]
    for week, fvc in zip(followUps['Weeks'], followUps.iloc[:, 2]):
        trainData.append(firstMeasure + [week, fvc])

# Naming the columns in the constructor also keeps an empty result well-formed.
trainData = pd.DataFrame(
    trainData,
    columns=['PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus']
    + ['target_week', 'target_FVC'],
)

# The model sees elapsed weeks since baseline rather than absolute week numbers.
trainData['delta_week'] = trainData['target_week'] - trainData['first_week']
trainData = trainData.drop(columns=['first_Percent', 'target_week', 'first_week'])

In [4]:
# Build the prediction table: one row per Patient_Week key in the sample
# submission, made of that patient's first row in `test` plus the requested week.
subSplit = np.array([key.split('_') for key in sub['Patient_Week']])
patientIds = subSplit[:, 0]
requestedWeeks = subSplit[:, 1]

testData = []
for p in np.unique(patientIds):
    baselineRow = test[test['Patient'] == p].iloc[0, :]
    firstMeasure = list(baselineRow.values)
    # All weeks requested for this patient, still as strings from the key.
    testData.extend(firstMeasure + [week] for week in requestedWeeks[patientIds == p])

testData = pd.DataFrame(testData)
testData.columns = [
    'PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus',
    'target_week',
]

# target_week stays a string (it is reused to rebuild the submission key);
# only the elapsed-weeks feature needs it as an integer.
targetWeekNumbers = testData['target_week'].map(int)
testData['delta_week'] = targetWeekNumbers - testData['first_week']
testData = testData.drop(columns=['first_Percent', 'first_week'])

In [5]:
# Feature engineering: integer-encode the categorical columns.
# PatientID is kept in both frames; it is dropped only when the model is fed.
#
# For each column the shared encoder is fitted on the training values and the
# same mapping is then applied to the test values.
for categoricalCol in ('Sex', 'SmokingStatus'):
    trainData[categoricalCol] = le.fit_transform(trainData[categoricalCol])
    testData[categoricalCol] = le.transform(testData[categoricalCol])

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# Split the prepared frames into model inputs and target. PatientID is an
# identifier, and target_week in the test frame is only kept to rebuild the
# submission key, so neither is passed to the model.
trainFeatures = trainData.drop(columns=['PatientID', 'target_FVC'])
trainTarget = trainData['target_FVC']
testFeatures = testData.drop(columns=['PatientID', 'target_week'])

# Fit an ordinary least-squares regression and predict FVC for every test row.
model = LinearRegression().fit(trainFeatures, trainTarget)
prediction = model.predict(testFeatures)

In [7]:
# Assemble the submission rows: the Patient_Week key rebuilt from the test
# frame, the predicted FVC, and the same fixed Confidence value for every row.
confidence = 225
submissionRows = [
    [patient + '_' + str(week), pred, confidence]
    for patient, week, pred in zip(testData['PatientID'], testData['target_week'], prediction)
]
sub = pd.DataFrame(submissionRows)
sub.columns = ['Patient_Week', 'FVC', 'Confidence']

In [8]:
# Write the submission table to submission.csv, omitting the DataFrame index.
sub.to_csv('submission.csv', index=False)