In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

import numpy as np
import pandas as pd

import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import matplotlib.patches as patches
import seaborn as sns

from pylab import rcParams

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

import eli5
import os
from tqdm import tqdm
import gc
import random    
import math
import psutil
import pickle
import datetime



In [4]:
warnings.filterwarnings('ignore')

root = "../../data/Gamma_Log_Facies_Type_Prediction/"
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

plt.style.use('seaborn')
pd.set_option('max_columns', 150)
torch.manual_seed(RANDOM_STATE)



<torch._C.Generator at 0x114d8e9d0>

In [5]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
%%time
full_train_df = pd.read_csv(root + "Train_File.csv")
test_df = pd.read_csv(root + "Test_File.csv")
submit_df = pd.read_csv(root + "Submission_File.csv")

CPU times: user 2.82 s, sys: 583 ms, total: 3.4 s
Wall time: 3.45 s


In [9]:
%time
reduce_mem_usage(full_train_df, use_float16=True);
reduce_mem_usage(test_df, use_float16=True);

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs
Memory usage of dataframe is 134.28 MB
Memory usage after optimization is: 29.37 MB
Decreased by 78.1%
Memory usage of dataframe is 67.14 MB
Memory usage after optimization is: 117.77 MB
Decreased by -75.4%


In [12]:
class Kannada_MNIST_data(Dataset):
    
    def __init__(self, df):
        n_pixels = 28 * 28
        
        if "label" not in df.columns:
            # test data
            self.X = df.iloc[:,1:].values.reshape((-1,28,28)).astype(np.uint8)[:,:,:,None]
            self.y = None
        else:
            # training data
            self.X = df.iloc[:,1:].values.reshape((-1,28,28)).astype(np.uint8)[:,:,:,None]
            self.y = torch.from_numpy(df.iloc[:,0].values)
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.transform(self.X[idx]), self.y[idx]
        else:
            return self.transform(self.X[idx])

In [13]:
train_df, valid_df = train_test_split(full_train_df, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

In [15]:
batch_size = 256

train_dataset = Kannada_MNIST_data(train_df)
valid_dataset = Kannada_MNIST_data(valid_df)
test_dataset = Kannada_MNIST_data(test_df)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

ValueError: cannot reshape array of size 10560000 into shape (28,28)

In [None]:
show_batch(train_loader)

In [None]:
show_batch(valid_loader)

In [None]:
show_batch(test_loader)

In [None]:
def train(model, train_loader):
    batch_loss = 0.0
    batch_corrects = 0.0
    model.train()
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        preds = torch.max(outputs, 1)[1]
        batch_loss += loss.item()
        batch_corrects += torch.sum(preds == labels.data)
    
    return batch_loss/len(train_loader), batch_corrects.float()/len(train_dataset)

In [None]:
def evaluate(model, valid_loader):
    loss = 0.0
    corrects = 0.0
    
    model.eval()
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = Variable(inputs), Variable(labels)
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)

            loss += F.cross_entropy(outputs, labels, reduction='mean').item()
            pred = outputs.data.max(1, keepdim=True)[1]
            corrects += pred.eq(labels.data.view_as(pred)).cpu().sum()
               
    return loss/len(valid_loader), corrects.float()/len(valid_dataset)

In [None]:
def prediciton(model, data_loader):
    model.eval()
    test_pred = torch.LongTensor()
    
    for i, data in enumerate(data_loader):
        data = Variable(data, volatile=True)
        data = data.to(device)
        test_pred = test_pred.to(device)
        output = model(data)
        pred = output.data.max(1, keepdim=True)[1]
        test_pred = torch.cat((test_pred, pred), dim=0)
        
    return test_pred

In [17]:

cell = nn.LSTM(input_size=10, hidden_size=10, batch_first=True)

class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(64, 64)
        self.classifier = nn.Linear(64, 4)
        
        for m in self.lstm.children():
            inn.init.xavier_uniform_(m.weight)
        
        for m in self.classifier.children():
            nn.init.xavier_uniform_(m.weight)
        
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.classifier(lstm_out.view(len(sentence), -1))
        return out

In [None]:
model = Conv2Class2Net()
model.to(device)
epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
scheduler = ReduceLROnPlateau(
    optimizer,
    factor=0.7,
    mode="max",
    verbose=True,
    patience=1,
    threshold=1e-3,
    eps=1e-06
)


In [None]:
%%time

epoch_loss_history = []
epoch_corrects_history = []
val_loss_history = []
val_corrects_history = []

for epoch in range(epochs):
    epoch_loss, epoch_corrects = train(model, train_loader)
    val_loss, val_corrects = evaluate(model, valid_loader)
    
    epoch_loss_history.append(epoch_loss)
    epoch_corrects_history.append(epoch_corrects)
    val_loss_history.append(val_loss)
    val_corrects_history.append(val_corrects)
    
    print('epoch:', (epoch+1))
    print('training loss: {:.4f}, training acc {:.4f} '.format(epoch_loss, epoch_corrects.item()))
    print('validation loss: {:.4f}, validation acc {:.4f} '.format(val_loss, val_corrects.item()))
    scheduler.step(val_corrects)

In [None]:
plt.plot(epoch_loss_history, label='training loss')
plt.plot(val_loss_history, label='validation loss')
plt.legend()


In [None]:
plt.plot(epoch_corrects_history, label='training accuracy')
plt.plot(val_corrects_history, label='validation accuracy')
plt.legend()


In [None]:
%%time
test_pred = prediciton(model, test_loader)
test_pred = test_pred.to(torch.device('cpu'))

In [None]:
ids = test_df['id']
submission = pd.DataFrame({'id': ids,'label': test_pred.numpy().T[0]})
submission.to_csv(path_or_buf ="submission.csv", index=False)
submission.head()