In [280]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy

from torch.utils.data import DataLoader, TensorDataset, random_split, Subset
from torch import nn
import torch.nn.functional as F

from sklearn import ensemble
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin

from IPython.display import clear_output

In [None]:
# Load the training data; "train.csv" is resolved relative to the notebook's working directory.
dataset = pd.read_csv("train.csv")
dataset.head()

In [None]:
# Number of rows per value of the label column.
dataset['Target'].value_counts()

In [283]:
class EdPipeline(BaseEstimator, TransformerMixin):
    """Replace education codes by their rank in a fixed ordering.

    Every column handed to the transformer is treated as a column of
    education codes.  A code is replaced by its position in
    ``self.education``; a code that is not listed there is mapped to
    ``len(self.education)``.
    """

    def __init__(self):
        # Education codes; the position in this list is the encoded rank.
        self.education = [35, 36, 37, 26, 11, 30, 29, 14, 10, 12, 18,
            13, 27, 19, 9, 1, 25, 20, 22, 31, 33, 6, 2, 40,
            3, 4, 43, 41, 42, 39, 5, 44, 34]

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a rank-encoded copy of ``X``.

        Parameters
        ----------
        X : 2-D array-like
            One or more columns of education codes.

        Returns
        -------
        numpy.ndarray
            Array of the same shape in which every entry is the rank of the
            original code (``len(self.education)`` for unlisted codes).
        """
        X = np.array(X)

        # First occurrence wins, which matches the semantics of list.index.
        rank_of = {}
        for rank, code in enumerate(self.education):
            rank_of.setdefault(code, rank)
        unknown_rank = len(self.education)

        # Encode every column, not only the first one: the pipeline feeds
        # several qualification columns through this transformer.
        n_rows, n_cols = X.shape
        for i in range(n_rows):
            for j in range(n_cols):
                X[i, j] = rank_of.get(X[i, j], unknown_rank)
        return X

    def get_feature_names_out(self, input_features=None):
        """Column names are passed through unchanged."""
        return input_features

class OccPipeline(BaseEstimator, TransformerMixin):
    """Replace occupation codes by their rank in a fixed ordering.

    Every column handed to the transformer is treated as a column of
    occupation codes.  A code is replaced by its position in
    ``self.occupation``; a code that is not listed there is mapped to
    ``len(self.occupation)``.
    """

    def __init__(self):
        # Occupation codes; the position in this list is the encoded rank.
        self.occupation = [99, 0, 90, 195, 9, 192, 193, 191, 194, 6, 161,
            163, 7, 171, 172, 173, 174, 175, 8, 181, 182, 183, 5, 151, 152,
            153, 154, 4, 141, 143, 144, 3, 131, 132, 134, 135, 2, 121, 122,
            123, 124, 125, 1, 112, 114, 10, 101, 102, 103]

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a rank-encoded copy of ``X``.

        Parameters
        ----------
        X : 2-D array-like
            One or more columns of occupation codes.

        Returns
        -------
        numpy.ndarray
            Array of the same shape in which every entry is the rank of the
            original code (``len(self.occupation)`` for unlisted codes).
        """
        X = np.array(X)

        # First occurrence wins, which matches the semantics of list.index.
        rank_of = {}
        for rank, code in enumerate(self.occupation):
            rank_of.setdefault(code, rank)
        unknown_rank = len(self.occupation)

        # Encode every column, not only the first one: the pipeline feeds
        # both parents' occupation columns through this transformer.
        n_rows, n_cols = X.shape
        for i in range(n_rows):
            for j in range(n_cols):
                X[i, j] = rank_of.get(X[i, j], unknown_rank)
        return X

    def get_feature_names_out(self, input_features=None):
        """Column names are passed through unchanged."""
        return input_features

In [None]:
# Numeric columns: standardise.
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns: one-hot encode, pooling categories seen in fewer than
# 1% of the rows.  handle_unknown='infrequent_if_exist' lets the *fitted*
# encoder transform data containing categories that were absent at fit time
# (the default, 'error', would raise); it does not change the training output.
cat_pipeline = Pipeline([
        ('onehot_encoder', OneHotEncoder(min_frequency=0.01,
                                         handle_unknown='infrequent_if_exist'))
    ])

# Qualification / occupation codes: rank-encode, then standardise.
ed_pipeline = Pipeline([
    ('ed_custom', EdPipeline()),
    ('std_scaler', StandardScaler())
])

occ_pipeline = Pipeline([
    ('occ_custom', OccPipeline()),
    ('std_scaler', StandardScaler())
])

num_attribs = ['Application order', 'Previous qualification (grade)', 'Admission grade', 
    'Age at enrollment', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate', 'Inflation rate', 'GDP']
cat_attribs = ['Marital status', 'Application mode', 'Course', 'Daytime/evening attendance',
    'Nacionality', 'Displaced', 'Educational special needs', 'Debtor', 
    'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International']
ed_attribs = ['Previous qualification', "Mother's qualification", "Father's qualification"]
occ_attribs = ["Mother's occupation", "Father's occupation"]

# sparse_threshold=0 always yields a dense array; the cells below build a
# DataFrame and a torch tensor from the result, which requires dense input.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
    ('ed', ed_pipeline, ed_attribs),
    ('occ', occ_pipeline, occ_attribs)
], sparse_threshold=0)

data_prepared = full_pipeline.fit_transform(dataset)
feauture_names = full_pipeline.get_feature_names_out()
dataset_prepared = pd.DataFrame(data_prepared, columns=feauture_names)
dataset_prepared.head()

In [None]:
# Encode the label column as ordinal class ids; the encoder is kept so the
# ids can be mapped back to the original label values later.
ord_encoder = OrdinalEncoder()
targets = ord_encoder.fit_transform(dataset[['Target']].to_numpy())

# Width of the prepared feature matrix; read by Net as its input size.
in_features = data_prepared.shape[1]
data_prepared.shape, targets.shape

In [286]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The whole prepared dataset is placed on `device` once, so batches need no
# further transfer.
data_prepared_t = torch.tensor(data_prepared, dtype=torch.float, device=device)
targets_t = torch.tensor(targets.flatten(), dtype=torch.long, device=device)
tensor_dataset = TensorDataset(data_prepared_t, targets_t)

# Seed the 80/20 split so that the validation set (and every score computed on
# it) is the same after Restart & Run All.
SPLIT_SEED = 42
train_dataset, valid_dataset = random_split(
    tensor_dataset, [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1024, shuffle=False)

In [314]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 3),
        )

        nn.init.kaiming_uniform_(self.layers[0].weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layers[3].weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.layers[6].weight, nonlinearity='relu')


    def forward(self, x):
        x = self.layers(x)
        return x

In [288]:
# functions to evaluate and train a model

def compute_error(model, data_loader, criterion, c_sum=False):
    """Average ``criterion`` over every row served by ``data_loader``.

    The model is switched to eval mode and evaluated without gradients.

    Parameters
    ----------
    model : callable module, applied to each input batch.
    data_loader : iterable of ``(x, y)`` batches.
    criterion : callable ``(outputs, y) -> value``.
    c_sum : bool
        ``False`` (default): ``criterion`` returns a per-batch mean, which is
        scaled by the batch size before accumulation.  ``True``: ``criterion``
        already returns a per-batch sum and is accumulated as is.

    Returns
    -------
    The accumulated value divided by the total number of rows.
    """
    model.eval()
    running_total, rows_seen = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_rows = len(batch_y)
            value = criterion(model(batch_x), batch_y)
            # Turn a batch mean back into a batch sum unless it already is one.
            running_total += value if c_sum else value * batch_rows
            rows_seen += batch_rows
    return running_total / rows_seen


def train_model(model: nn.Module,
              train_loader: DataLoader,
              valid_loader: DataLoader,
              num_epochs: int,
              optimizer: torch.optim.Optimizer,
              criterion,
              verbose: bool = True,
              verbose_plot: bool = False
              ) -> float:
    """Train ``model`` and leave it holding the state of its best epoch.

    After every epoch the validation loss is computed with ``compute_error``.
    The full ``state_dict`` (parameters *and* buffers) of the epoch with the
    lowest validation loss is kept on the CPU and loaded back into the model
    once training is finished.

    Parameters
    ----------
    model : network to train (modified in place).
    train_loader, valid_loader : iterables of ``(inputs, targets)`` batches.
    num_epochs : number of passes over ``train_loader``.
    optimizer : optimizer already bound to ``model``'s parameters.
    criterion : callable ``(outputs, targets) -> loss`` returning a batch mean.
    verbose : print the loss every 10th minibatch and a per-epoch summary
        (the cell output is cleared after each epoch).
    verbose_plot : additionally evaluate the training loss after each epoch
        and plot both loss curves at the end.

    Returns
    -------
    float
        The best validation loss (``inf`` if no epoch improved on it, e.g.
        when ``num_epochs`` is 0).
    """
    best_epoch = None
    best_state = None
    best_val_loss = np.inf
    train_losses, valid_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        for step, (inputs, targets) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            if verbose and step % 10 == 0:
                print(f"Minibatch {step:>6}    |  loss {loss.item():>5.2f}  |")

        # Plain float: matches the declared return type and keeps the
        # bookkeeping below independent of the device the model lives on.
        val_loss = float(compute_error(model, valid_loader, criterion))

        if val_loss < best_val_loss:
            best_epoch = epoch
            best_val_loss = val_loss
            # Snapshot the whole state_dict so buffers are restored together
            # with the weights.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

        if verbose:
            clear_output(True)
            m = f"After epoch {epoch:>2} | valid loss: {val_loss:>5.2f}"
            print("{0}\n{1}\n{0}".format("-" * len(m), m))

        if verbose_plot:
            train_losses.append(float(compute_error(model, train_loader, criterion)))
            valid_losses.append(val_loss)

    if best_state is not None:
        if verbose:
            print(f"\nLoading best params on validation set in epoch {best_epoch} with loss {best_val_loss:.2f}")
        model.load_state_dict(best_state)

    if verbose_plot:
        plt.figure(figsize=(6, 3))
        plt.plot(train_losses, c='b', label='train')
        plt.plot(valid_losses, c='r', label='valid')
        plt.grid(ls=':')
        plt.legend()
        plt.show()

    return best_val_loss

In [None]:
# Single-network baseline: Adam + cross-entropy for 30 epochs, with the
# train/validation loss curves plotted at the end.
model = Net().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

train_model(
    model, train_loader, valid_loader,
    num_epochs=30, optimizer=optimizer, criterion=criterion,
    verbose_plot=True,
)

In [None]:
# check the accuracy of the single model on the validation set

def accuracy(outputs, y):
    """Count the correct top-1 predictions of a batch.

    Parameters
    ----------
    outputs : tensor of per-class scores, one row per sample.
    y : tensor of true class indices, one per sample.

    Returns
    -------
    0-dim tensor holding the number of rows whose arg-max equals ``y``
    (a count, not a ratio: callers divide by the number of rows).
    """
    pred = outputs.argmax(dim=1)
    # Tensor reduction instead of the Python builtin ``sum``: one op instead
    # of one per element, and a tensor result even for an empty batch.
    return (pred == y).sum()

# c_sum=True because `accuracy` returns a per-batch count: compute_error adds
# the counts up and divides by the number of rows, giving the accuracy ratio.
print(compute_error(model, valid_loader, accuracy, c_sum=True))

In [None]:
# split the dataset into k folds and train k models on them

def train_kfold(Net, dataset, n_splits=5, num_epochs=10, batch_size=32, learning_rate=0.03):
    """Train one fresh ``Net`` per fold of a shuffled K-fold split.

    Parameters
    ----------
    Net : zero-argument callable returning a new network.
    dataset : indexable dataset that is split into ``n_splits`` folds.
    n_splits : number of folds, i.e. number of trained models.
    num_epochs, batch_size, learning_rate : training settings per fold
        (NAdam optimizer, cross-entropy loss).

    Returns
    -------
    (models, scores)
        ``models`` is a list of ``(model, 0)`` tuples, where ``0`` is the tag
        ``predict_ensemble`` uses to select its torch branch; ``scores`` holds
        each model's loss on its own validation fold, as CPU tensors.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True)
    models, scores = [], []

    for fold_train_ids, fold_valid_ids in splitter.split(dataset):
        fold_train_loader = DataLoader(Subset(dataset, fold_train_ids),
                                       batch_size=batch_size, shuffle=True)
        fold_valid_loader = DataLoader(Subset(dataset, fold_valid_ids),
                                       batch_size=batch_size, shuffle=False)

        net = Net().to(device)
        loss_fn = nn.CrossEntropyLoss()
        train_model(net, fold_train_loader, fold_valid_loader,
                    num_epochs=num_epochs,
                    optimizer=torch.optim.NAdam(net.parameters(), lr=learning_rate),
                    criterion=loss_fn,
                    verbose=True)

        # Score the model that train_model left behind on the fold's own
        # validation part.
        fold_score = compute_error(net, fold_valid_loader, loss_fn)
        scores.append(fold_score.detach().cpu())
        models.append((net, 0))

    return models, scores


# Train 10 fold models on the training split (the held-out validation split
# is not touched), then wipe the training log and show the per-fold losses.
models, scores = train_kfold(
    Net,
    train_dataset,
    n_splits=10,
    num_epochs=30,
    batch_size=64,
    learning_rate=0.001,
)
clear_output(False)
print(scores)

In [None]:
def loader_to_numpy(loader):
    """Collect every ``(x, y)`` batch of ``loader`` into two NumPy arrays.

    Batches are moved to the CPU one at a time and concatenated in the order
    the loader yields them, so row ``i`` of the features always belongs to
    label ``i``.

    Returns
    -------
    (features, labels) : tuple of numpy.ndarray
    """
    feature_batches, label_batches = [], []
    for x_batch, y_batch in loader:
        feature_batches.append(x_batch.detach().cpu())
        label_batches.append(y_batch.detach().cpu())
    return torch.cat(feature_batches).numpy(), torch.cat(label_batches).numpy()


# NumPy copies of both splits for the scikit-learn models below.
X_train, y_train = loader_to_numpy(train_loader)
X_valid, y_valid = loader_to_numpy(valid_loader)

In [None]:
# Random-forest baseline, fitted on the training split.
rf_clf = ensemble.RandomForestClassifier(
    n_estimators=150,
    max_depth=60,
    criterion='log_loss',
    n_jobs=-1,
).fit(X_train, y_train)

# Fraction of validation rows predicted correctly.
pred = rf_clf.predict(X_valid)
np.mean(pred == y_valid)

In [None]:
# Gradient-boosting baseline with default settings, fitted on the training split.
gb_clf = ensemble.GradientBoostingClassifier().fit(X_train, y_train)

# Fraction of validation rows predicted correctly.
pred = gb_clf.predict(X_valid)
np.mean(pred == y_valid)

In [None]:
# Make the cell idempotent: drop scikit-learn members (tag 1) left over from
# a previous run of this cell, so re-running it does not keep stacking six
# more models onto the ensemble.  On a fresh run this is a no-op.
models = [member for member in models if member[1] == 0]

# Add three random forests and three gradient-boosting models; tag 1 marks
# them as scikit-learn estimators for predict_ensemble.
for _ in range(3):
    rf_clf = ensemble.RandomForestClassifier(n_estimators=150, max_depth=60, criterion='log_loss', n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    models.append((rf_clf, 1))

    gb_clf = ensemble.GradientBoostingClassifier()
    gb_clf.fit(X_train, y_train)
    models.append((gb_clf, 1))

In [None]:
# a function to predict outputs with k models

def predict_ensemble(models, x, n_classes=3):
    """Soft-vote over a mixed ensemble of torch and scikit-learn models.

    Every member contributes a probability distribution over the classes;
    the distributions are averaged with equal weight.  Raw network outputs
    are passed through a softmax first, so that an unbounded logit cannot
    outweigh the votes of several other members.

    Parameters
    ----------
    models : list of ``(model, tag)`` tuples
        ``tag == 0``: torch module returning one row of class scores per
        sample.  Any other tag: scikit-learn style estimator; its
        ``predict_proba`` is used when available, otherwise its hard
        ``predict`` output is turned into one-hot votes.
    x : 2-D tensor, one row per sample.
    n_classes : number of classes (width of the result).

    Returns
    -------
    Tensor of shape ``(len(x), n_classes)`` holding the log of the averaged
    probabilities.  ``argmax(dim=1)`` gives the predicted class, and the rows
    can be fed to ``nn.CrossEntropyLoss`` as logits.
    """
    n_rows = x.shape[0]
    member_probs = []

    for model, tag in models:
        if tag == 0:
            model.eval()
            with torch.no_grad():
                probs = torch.softmax(model(x), dim=1)
        else:
            features = x.detach().cpu().numpy()
            probs = torch.zeros((n_rows, n_classes), device=x.device)
            if hasattr(model, "predict_proba"):
                # Columns of predict_proba follow model.classes_; scatter them
                # so a class missing from the training data stays at 0.
                columns = torch.as_tensor(
                    np.asarray(model.classes_).astype(np.int64), device=x.device)
                probs[:, columns] = torch.as_tensor(
                    model.predict_proba(features), dtype=probs.dtype, device=x.device)
            else:
                labels = torch.as_tensor(
                    np.asarray(model.predict(features)).astype(np.int64), device=x.device)
                probs[torch.arange(n_rows, device=x.device), labels] = 1
        member_probs.append(probs)

    mean_probs = torch.stack(member_probs).mean(dim=0)
    # Clamp before the log so a class with zero averaged probability gives a
    # large negative score instead of -inf.
    return torch.log(mean_probs.clamp_min(1e-12))


def evaluate_ensemble(models, data_loader, criterion, c_sum=False):
    """Average ``criterion`` of the ensemble prediction over ``data_loader``.

    Parameters
    ----------
    models : list of ``(model, tag)`` tuples, passed to ``predict_ensemble``.
    data_loader : iterable of ``(x, y)`` batches.
    criterion : callable ``(outputs, y) -> value``.
    c_sum : bool
        ``False`` (default): ``criterion`` returns a per-batch mean, which is
        scaled by the batch size before accumulation.  ``True``: it already
        returns a per-batch sum.

    Returns
    -------
    The accumulated value divided by the total number of rows.
    """
    running_total, rows_seen = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_rows = len(batch_y)
            value = criterion(predict_ensemble(models, batch_x), batch_y)
            # Turn a batch mean back into a batch sum unless it already is one.
            running_total += value if c_sum else value * batch_rows
            rows_seen += batch_rows
    return running_total / rows_seen


# Ensemble loss on the validation split, using the notebook-level `criterion`.
print(evaluate_ensemble(models, valid_loader, criterion))

In [None]:
# check accuracy on a single and multiple models

def accuracy(outputs, y):
    """Count the correct top-1 predictions of a batch.

    Parameters
    ----------
    outputs : tensor of per-class scores, one row per sample.
    y : tensor of true class indices, one per sample.

    Returns
    -------
    0-dim tensor holding the number of rows whose arg-max equals ``y``
    (a count, not a ratio: callers divide by the number of rows).
    """
    pred = outputs.argmax(dim=1)
    # Tensor reduction instead of the Python builtin ``sum``: one op instead
    # of one per element, and a tensor result even for an empty batch.
    return (pred == y).sum()

# Validation accuracy of the first member of `models` on its own ...
print(compute_error(models[0][0], valid_loader, accuracy, c_sum=True))
# ... and of the whole ensemble (c_sum=True: `accuracy` returns per-batch counts).
print(evaluate_ensemble(models, valid_loader, accuracy, c_sum=True))

In [None]:
# Load the test data; "test.csv" is resolved relative to the notebook's working directory.
dataset_test = pd.read_csv("test.csv")
dataset_test.head()

In [None]:
# Apply the preprocessing exactly as it was fitted on the training data.
# Refitting here (fit_transform) would rescale the features with test-set
# statistics and could produce a different set of one-hot columns than the
# models were trained on.
# Note: with the one-hot encoder left at its default handle_unknown='error',
# transform raises if test.csv holds a category that never occurs in train.csv.
data_test = full_pipeline.transform(dataset_test)

# The models expect exactly the feature width they were trained with.
assert data_test.shape[1] == in_features, (data_test.shape, in_features)
data_test.shape

In [None]:
# Score the prepared test rows with the whole ensemble.
data_test_t = torch.tensor(data_test, dtype=torch.float, device=device)
logits_test = predict_ensemble(models, data_test_t)

# Highest-scoring class index per row, mapped back to the original label
# values through the categories stored by the target encoder.
pred_test = torch.argmax(logits_test, dim=1)
class_names = np.array(ord_encoder.categories_).flatten()
pred_test_c = class_names[pred_test.detach().cpu().numpy()]
pred_test_c

In [None]:
# Row identifiers that go into the submission file.
dataset_test['id'].values

In [None]:
# One row per test id with its predicted label, written without the index.
submit_dataset = pd.DataFrame({
    'id': dataset_test['id'].values,
    'Target': pred_test_c,
})
submit_dataset.to_csv('submission.csv', index=False)