In [43]:
import os
import random
from itertools import islice
from collections.abc import Sequence
from copy import deepcopy
import tqdm.notebook as tq

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [25]:
# Small constant made available to later cells under the name `eps`.
eps = 1e-9

# Single seed shared by every random number generator seeded below.
SEED = 81020204
for _seed_fn in (
    random.seed,                 # Python's built-in RNG
    np.random.seed,              # NumPy legacy global RNG
    torch.manual_seed,           # PyTorch RNG
    torch.cuda.manual_seed_all,  # PyTorch RNGs on every CUDA device
):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Independent NumPy Generator seeded with the same value.
rng = np.random.default_rng(seed=SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Plotly's default qualitative colour sequence.
palette = px.colors.qualitative.Plotly

# Apply the white-grid seaborn theme. `sns.set` is only a legacy alias of
# `sns.set_theme` and is documented as possibly being removed, so call the
# preferred function directly.
sns.set_theme(style="whitegrid")

# Render the palette as a strip of colour swatches.
sns.palplot(palette)

---

In [27]:
# Load the training features, training target and test features from CSV,
# using the first column of each file as the index.
try:
    X_train = pd.read_csv('datasets/train_x.csv', delimiter=',', index_col=0)
    y_train = pd.read_csv('datasets/train_y.csv', delimiter=',', index_col=0)
    X_test  = pd.read_csv('datasets/test_x.csv',  delimiter=',', index_col=0)
except FileNotFoundError:
    # Only a missing file is reported as "No such files"; any other error
    # (e.g. a parse failure) propagates with its own message. The exception is
    # re-raised so the notebook stops here instead of continuing with
    # undefined (or stale, from a previous run) variables.
    print('No such files')
    raise

In [None]:
# Print a summary of the training features: column dtypes, non-null counts and memory usage.
X_train.info()

In [None]:
# Column labels of the training features.
X_train.columns

In [None]:
# Column labels of the test features.
X_test.columns

In [None]:
# First five rows of the test features.
X_test.head()

In [None]:
# First five rows of the training features.
X_train.head()

In [None]:
# Turn the current index back into an ordinary column and promote the 'id'
# column to be the index, done as one chained expression so no temporary
# frame has to be created and deleted.
X_test = X_test.reset_index().set_index('id')

X_test.head()

In [None]:
# Summary statistics of column '0' as one-row frames: training set first,
# test set second, stacked for a side-by-side look.
train_col0_stats = X_train['0'].describe().to_frame().T
test_col0_stats = X_test['0'].describe().to_frame().T

pd.concat([train_col0_stats, test_col0_stats])

---

In [None]:
# Shapes of the training target and of the test features.
y_train.shape, X_test.shape 

In [None]:
# Total number of missing cells across the training features and the target.
train_with_target = pd.concat([X_train, y_train], axis=1)
train_with_target.isna().sum().sum()

In [None]:
# Frequency of each distinct target row, most frequent first.
y_train.value_counts()

In [None]:
# Summary statistics of the target, transposed so each statistic becomes a column.
y_train.describe().T

In [None]:
# Histogram of the target, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))

sns.histplot(data=y_train, ax=ax)
ax.set_title('Distribution of target', fontsize=14)
plt.show()

In [None]:
# Boolean row mask: False where the target is below 1960, True everywhere else.
# Written as the negation of `<` so that rows for which the comparison is
# False for any other reason (e.g. NaN) stay True.
is_below_cutoff = (y_train < 1960).to_numpy()
mask = (~is_below_cutoff).squeeze()

# Number of entries marked False in the mask.
print(mask.shape[0] - mask.sum())

In [None]:
# Keep only the rows where `mask` is True, filtering features and target with
# the same mask so their rows stay aligned.
# NOTE(review): this overwrites X_train / y_train in place; re-running the cell
# without rebuilding `mask` will likely fail once rows have been dropped,
# because the mask length no longer matches.
X_train, y_train = X_train[mask], y_train[mask]

X_train.shape, y_train.shape

---

---

In [None]:
# Standardize the features: fit the scaler on the training set only, then
# apply the same fitted transformation to the test set.
std_scaler = StandardScaler()

# Fix: the scaled training data used to be bound to the misspelled name
# `X_trian`, leaving `X_train` unscaled while `X_test` was scaled. The result
# is wrapped back into a DataFrame (same index and column labels) so that
# `X_train` keeps its type and `.head()` below still works.
X_train = pd.DataFrame(
    std_scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
# NOTE(review): legacy alias for the old misspelled name, kept only in case a
# later cell still references it — remove once confirmed unused.
X_trian = X_train.to_numpy()

# As before, the transformed test set is a plain NumPy array.
X_test = std_scaler.transform(X_test)

X_train.head()

In [None]:
# Distinct target values; np.unique returns them sorted in ascending order.
unique_vals = np.unique(y_train.values)
# Ordered category list for the encoder: NaNs dropped, values sorted, as a plain list.
categories = pd.Series(unique_vals).dropna().sort_values().reset_index(drop=True).to_list()

# Encode each target value as its position in `categories`; values not in the
# list are encoded as -1 at transform time instead of raising.
ord_enc = OrdinalEncoder(categories=[categories], handle_unknown="use_encoded_value", unknown_value=-1)
# Fit the encoder on the training target and encode it in one step.
y_train_enc = ord_enc.fit_transform(y_train)
y_test_enc =