In [1]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch.nn as nn
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [3]:
def seed_everything(seed=2020):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU), exports ``PYTHONHASHSEED``,
    and, when CUDA is available, seeds the GPU generators and pins the cuDNN flags.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE(review): exporting PYTHONHASHSEED at runtime presumably only affects
    # child processes, not the hashing of this running interpreter - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seed every GPU
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    # benchmark mode can speed up training when input type/channels rarely change;
    # it is switched off here.
    cudnn.benchmark = False
    cudnn.enabled = True


seed_everything(42)

In [4]:
class cfgOsic:
    """Notebook-wide configuration: data location and compute device."""

    # Directory that holds train.csv, test.csv and sample_submission.csv.
    ROOT = "../data/"
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on
    # machines without CUDA instead of failing at the first `.to(device)`.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Load the training table; keep=False removes every row of a duplicated
# (Patient, Weeks) pair rather than keeping one representative.
tr = pd.read_csv(f"{cfgOsic.ROOT}/train.csv").drop_duplicates(
    subset=['Patient', 'Weeks'], keep=False
)
chunk = pd.read_csv(f"{cfgOsic.ROOT}/test.csv")

print("add infos")
# Split Patient_Week on '_' : the first piece becomes Patient, the last piece Weeks.
sub = pd.read_csv(f"{cfgOsic.ROOT}/sample_submission.csv")
week_keys = sub['Patient_Week'].str.split('_')
sub['Patient'] = week_keys.str[0]
sub['Weeks'] = week_keys.str[-1].astype('int64')
# Attach the per-patient columns from test.csv (its own Weeks column is dropped
# so it does not clash with the week parsed above).
sub = sub[['Patient', 'Weeks', 'Confidence', 'Patient_Week']].merge(
    chunk.drop(columns='Weeks'), on="Patient"
)

add infos


In [6]:
# Tag each frame with its origin so the stacked table can be filtered by source.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order and keeps the original index labels, exactly as append did.
data = pd.concat([tr, chunk, sub])

In [7]:
# Earliest week per patient, ignoring rows tagged 'test' so their weeks never
# count towards the minimum.
# The masked column is built directly as float: writing NaN into an integer
# column afterwards relies on silent upcasting, which newer pandas deprecates.
# np.where works positionally, so the non-unique index of `data` is irrelevant.
data['min_week'] = np.where(data['WHERE'] == 'test', np.nan, data['Weeks'])
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [8]:
# Baseline FVC per patient: the FVC on the row whose week equals the patient's
# min_week. If several rows qualify, the first one in table order is kept.
at_first_week = data['Weeks'] == data['min_week']
base = (
    data.loc[at_first_week, ['Patient', 'FVC']]
    .rename(columns={'FVC': 'min_FVC'})
    .drop_duplicates(subset='Patient', keep='first')
)

In [9]:
# Attach each patient's baseline FVC (left join keeps every row of `data`),
# then express each row's week relative to the patient's min_week.
data = pd.merge(data, base, how='left', on='Patient')
data['base_week'] = data['Weeks'].sub(data['min_week'])
del base

In [12]:
# add height

def calculate_height(row):
    """Derive a height-like value from baseline FVC, age and sex.

    Divides ``row['min_FVC']`` by a linear function of ``row['Age']`` whose
    coefficients depend on ``row['Sex']``: rows equal to 'Male' use one pair,
    every other value uses the second pair.

    NOTE(review): the coefficients presumably come from a published FVC
    reference equation - confirm the source and the resulting unit.

    Args:
        row: Mapping-like record exposing 'Sex', 'Age' and 'min_FVC'.

    Returns:
        The quotient ``min_FVC / (intercept - slope * Age)``.
    """
    intercept, slope = (27.63, 0.112) if row['Sex'] == 'Male' else (21.78, 0.101)
    return row['min_FVC'] / (intercept - slope * row['Age'])

# Row-wise height estimate for every record.
data['Height'] = data.apply(calculate_height, axis='columns')

# Weeks elapsed since the patient's min_week.
# NOTE(review): this is the same expression used for 'base_week' - confirm
# whether both columns are really needed.
data['WeeksPassed'] = data['Weeks'].sub(data['min_week'])

In [13]:
# Categorical columns to expand into 0/1 indicator columns ('Age' is left out).
COLS = ['Sex', 'SmokingStatus']
# FE accumulates the names of the model feature columns.
FE = []
for col in COLS:
    levels = data[col].unique()
    FE.extend(levels)
    for mod in levels:
        # One indicator column per distinct value, named after the value itself.
        data[mod] = data[col].eq(mod).astype(int)

In [14]:
def scale_feature(series):
    """Min-max scale ``series`` into the range [0, 1].

    Args:
        series: Numeric pandas Series (missing values are ignored when the
            minimum and maximum are taken and stay missing in the result).

    Returns:
        ``(series - min) / (max - min)``. When every value is identical the
        range is zero; instead of dividing 0 by 0 (which would turn the whole
        feature into NaN) the function returns zeros as floats.
    """
    lo = series.min()
    span = series.max() - lo
    if span == 0:
        # Constant column: every value sits at the minimum, i.e. at 0.
        return (series - lo).astype(float)
    return (series - lo) / span


# Scaled column name -> raw column it is derived from (insertion order is the
# order in which the new columns are created).
SCALED_FROM = {
    'age': 'Age',
    'BASE': 'min_FVC',
    'week': 'base_week',
    'percent': 'Percent',
    'height': 'Height',
    'week_passed': 'WeeksPassed',
}
for scaled_name, raw_name in SCALED_FROM.items():
    data[scaled_name] = scale_feature(data[raw_name])

# Register the scaled columns as model features.
FE.extend(['age', 'percent', 'week', 'BASE', 'height', 'week_passed'])

In [15]:
# Display the combined, feature-engineered table for a visual check.
data

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,Patient_Week,...,Female,Ex-smoker,Never smoked,Currently smokes,age,BASE,week,percent,height,week_passed
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,...,0,1,0,0,0.769231,0.241456,0.179012,0.236393,0.211783,0.179012
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,...,0,1,0,0,0.769231,0.241456,0.234568,0.215941,0.211783,0.234568
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,...,0,1,0,0,0.769231,0.241456,0.246914,0.184960,0.211783,0.246914
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,...,0,1,0,0,0.769231,0.241456,0.259259,0.201767,0.211783,0.259259
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,...,0,1,0,0,0.769231,0.241456,0.271605,0.186580,0.211783,0.271605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2265,ID00426637202313170790466,129,2925,71.824968,73,Male,Never smoked,test,100.0,ID00426637202313170790466_129,...,0,0,1,0,0.615385,0.354755,0.975309,0.345604,0.317560,0.975309
2266,ID00426637202313170790466,130,2925,71.824968,73,Male,Never smoked,test,100.0,ID00426637202313170790466_130,...,0,0,1,0,0.615385,0.354755,0.981481,0.345604,0.317560,0.981481
2267,ID00426637202313170790466,131,2925,71.824968,73,Male,Never smoked,test,100.0,ID00426637202313170790466_131,...,0,0,1,0,0.615385,0.354755,0.987654,0.345604,0.317560,0.987654
2268,ID00426637202313170790466,132,2925,71.824968,73,Male,Never smoked,test,100.0,ID00426637202313170790466_132,...,0,0,1,0,0.615385,0.354755,0.993827,0.345604,0.317560,0.993827


In [None]:

# Split the combined table back into its three sources using the WHERE tag,
# then release the combined frame.
tr, chunk, sub = (
    data.loc[data['WHERE'] == origin] for origin in ('train', 'val', 'test')
)
del data