In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
from sklearn.preprocessing import MinMaxScaler

        
        
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split

/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv


In [2]:
# Single place to change if the competition data is mounted somewhere else.
DATA_DIR = '../input/lish-moa'

# Load the competition tables; each read produces the same path as before,
# now built from DATA_DIR instead of repeating the directory five times.
train_features = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_targets_scored = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(f'{DATA_DIR}/train_targets_nonscored.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [5]:
def convert_categorical(category, df):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    category : column label; it is converted with ``str()`` before use.
    df : pandas.DataFrame containing that column.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.get_dummies`` with the named column
        replaced by its dummy columns.
    """
    column_name = str(category)
    return pd.get_dummies(df, columns=[column_name])

def encode_df(df):
    """Return ``df`` with 'cp_type' and then 'cp_dose' one-hot encoded.

    Each column is passed through ``convert_categorical`` in that order,
    with the result of the first encoding fed into the second.
    """
    encoded = df
    for column in ('cp_type', 'cp_dose'):
        encoded = convert_categorical(column, encoded)
    return encoded

train_features_encoded = encode_df(train_features)

# Sanity check that BOTH dummy columns exist.
# The previous `'cp_dose_D1' and 'cp_type_trt_cp' in columns` only tested the
# second name: `in` binds tighter than `and`, and a non-empty string is truthy.
expected_dummy_columns = {'cp_dose_D1', 'cp_type_trt_cp'}
if expected_dummy_columns.issubset(train_features_encoded.columns):
    print("encoded columns successfully")
else:
    print("Nope")

encoded columns successfully


In [6]:
# Display (rows, columns) of the encoded features next to the scored targets.
(train_features_encoded.shape, train_targets_scored.shape)


((23814, 878), (23814, 207))

In [7]:
# Every column except the row identifier is a model input / model target.
# Dropping 'sig_id' by name (instead of slicing off position 0) keeps this
# correct even if the column order changes, and raises KeyError if the
# identifier is missing rather than silently discarding a real column.
feature_columns = train_features_encoded.columns.drop('sig_id')
target_columns = train_targets_scored.columns.drop('sig_id')

In [8]:
# Join the encoded features with their scored targets on the shared row id.
train_cat = pd.merge(train_features_encoded, train_targets_scored, on='sig_id')


In [11]:
# Everything except the string identifier is cast to float and min-max scaled
# (MinMaxScaler with default settings), feature and target columns alike.
# NOTE(review): the scaler is fitted on all rows before any train/validation
# split is made -- confirm this is intended.
dummy_df = train_cat.loc[:, train_cat.columns != 'sig_id']
df_float = dummy_df.astype(float)
scaler = MinMaxScaler()
df_float_scaled = pd.DataFrame(scaler.fit_transform(df_float), columns = df_float.columns)
# Re-attach the identifier from train_cat, the frame these rows came from,
# positionally. Taking it from the pre-merge frame would silently mislabel
# rows if the merge ever dropped or reordered any.
df_float_scaled['sig_id'] = train_cat['sig_id'].values
df_float_scaled.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23814 entries, 0 to 23813
Columns: 1084 entries, cp_time to sig_id
dtypes: float64(1083), object(1)
memory usage: 196.9+ MB


In [33]:
class TrainDataset(Dataset):
    """Dataset serving (feature, target) float tensor pairs from a DataFrame.

    The selected feature and target columns are extracted once, at
    construction time, as NumPy arrays; items are converted to tensors
    on access.
    """

    def __init__(self, df, feature_columns, target_columns):
        """Store ``df[feature_columns]`` and ``df[target_columns]`` as arrays."""
        self.features = df[feature_columns].values
        self.targets = df[target_columns].values

    @staticmethod
    def _as_float_tensor(values):
        """Copy ``values`` into a new tensor and cast it with ``.float()``."""
        return torch.tensor(values).float()

    def sizes(self):
        """Print the column count of the feature array and of the target array."""
        for label, array in (("features size = ", self.features),
                             ("targets size = ", self.targets)):
            print(label, array.shape[1])

    def __len__(self):
        """Number of rows in the feature array."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(feature, target)`` for position ``idx`` as float tensors."""
        feature = self._as_float_tensor(self.features[idx])
        target = self._as_float_tensor(self.targets[idx])
        return feature, target

In [34]:
# Wrap the scaled frame so PyTorch can index it row by row.
full_dataset = TrainDataset(
    df=df_float_scaled,
    feature_columns=feature_columns,
    target_columns=target_columns,
)

In [35]:
# Print how many feature and target columns the dataset holds.
full_dataset.sizes()

features size =  877
targets size =  206


In [36]:
# Hold out 10% of the rows for validation; the remainder is used for training.
train_size = int(0.9 * len(full_dataset))  ## 90/10 split
test_size = len(full_dataset) - train_size
# Seed the split so the same rows land in each subset on every run;
# an unseeded random_split makes validation results impossible to compare.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    full_dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers = 8)

# Validation batches are not shuffled: the order does not need to vary between runs.
val_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle = False, num_workers = 8)

print(len(train_loader), "batches ")
print(len(val_loader), " batches ")

335 batches 
38  batches 


In [37]:
class Model(nn.Module):
    """Six-layer fully connected network producing one raw score per target.

    Parameters
    ----------
    n_features : int, default 877
        Width of the input vector.
    n_targets : int, default 206
        Width of the output vector.
    dropout : float, default 0.2
        Drop probability of the dropout applied after the 2nd, 3rd and 4th
        hidden layers.

    The defaults reproduce the previously hard-coded architecture, so
    ``Model()`` builds exactly the same network as before.
    """

    def __init__(self, n_features=877, n_targets=206, dropout=0.2):
        super(Model, self).__init__()

        self.dropout = nn.Dropout(p=dropout)
        self.fc1 = nn.Linear(n_features, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 128)

        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, n_targets)

    def forward(self, x):
        """Map a ``(batch, n_features)`` tensor to ``(batch, n_targets)`` scores.

        No activation is applied to the final layer's output.
        """
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))

        x = F.relu(self.fc5(x))

        return (self.fc6(x))
    
# Build the network, print its layer summary, then move its parameters to the GPU.
model = Model()
print(model)
model = model.to('cuda')

Model(
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=877, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=128, bias=True)
  (fc5): Linear(in_features=128, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=206, bias=True)
)


In [27]:
# Pull a single (features, targets) batch from the training loader for a quick check.
inp, label = next(iter(train_loader))

In [31]:
# Smoke-test the forward pass on one batch; the cell displays the raw outputs.
# NOTE(review): no model.eval() call is visible before this cell, so dropout
# is presumably still active here -- confirm.
model(inp.to('cuda'))

tensor([[ 0.0350, -0.0651,  0.0755,  ..., -0.0360,  0.0968,  0.1169],
        [ 0.0332, -0.0730,  0.0826,  ..., -0.0340,  0.0816,  0.1152],
        [ 0.0245, -0.0710,  0.0662,  ..., -0.0588,  0.0934,  0.1179],
        ...,
        [ 0.0384, -0.0645,  0.0867,  ..., -0.0397,  0.0858,  0.1090],
        [ 0.0409, -0.0692,  0.0704,  ..., -0.0400,  0.0861,  0.1148],
        [ 0.0428, -0.0667,  0.0807,  ..., -0.0456,  0.0982,  0.1030]],
       device='cuda:0', grad_fn=<AddmmBackward>)