## 0. Preparation

In [1]:
import pandas as pd
from functions import *
path = "/app/Final/code"
# path = "."
# This is the dataset processed from the midterm
train_size = 14993
data_df = pd.read_csv(path + "/data/data_df_proc.csv")[:train_size]
data_df.head()

cols_to_drop = ["Name", "RescuerID", "VideoAmt", "Description", "PetID", "PhotoAmt"]
to_drop_columns = [
    "PetID",
    "Name",
    "RescuerID",
    "Description",
    "BreedName_full",
    "Breed1Name",
    "Breed2Name",
]
data_df.drop(cols_to_drop + to_drop_columns, axis=1, inplace=True)

# Fill missing values with mean
# This is necessary only for the neural network
# data_df.fillna(data_df.mean(), inplace=True)

# Embedding the categorical variables using nn.Embedding
cat_cols = [
    "Breed1",
    "Breed2",
    "Color1",
    "Color2",
    "Color3",
    "Gender",
    "State",
    "Breed_full",
    "Color_full",
    "hard_interaction",
]


In [2]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for cat_col in cat_cols:
    label_encoders[cat_col] = LabelEncoder()
    data_df[cat_col] = label_encoders[cat_col].fit_transform(data_df[cat_col])

emb_c = {n: len(col.unique()) for n, col in data_df.items() if n in cat_cols}
emb_cols = emb_c.keys()  # names of columns chosen for embedding
emb_szs = [
    (c, min(5, (c + 1) // 2)) for _, c in emb_c.items()
]  # embedding sizes for the chosen columns

# Split data into train and validation by AdoptionSpeed and stratify
from sklearn.model_selection import train_test_split

n_cont = len(data_df.columns) - len(emb_cols) - 1  # number of continuous columns


In [3]:
from network_setting import *
model = PetFinderModel(emb_szs, n_cont)
model.load_state_dict(torch.load(path + "/model-stratify.pt"))
model.eval()

PetFinderModel(
  (embeddings): ModuleList(
    (0): Embedding(176, 5)
    (1): Embedding(135, 5)
    (2): Embedding(3, 2)
    (3-4): 2 x Embedding(7, 4)
    (5): Embedding(6, 3)
    (6): Embedding(14, 5)
    (7): Embedding(812, 5)
    (8): Embedding(63, 5)
    (9): Embedding(142, 5)
  )
  (lin1): Linear(in_features=201, out_features=512, bias=True)
  (lin2): Linear(in_features=512, out_features=256, bias=True)
  (lin3): Linear(in_features=256, out_features=128, bias=True)
  (lin4): Linear(in_features=128, out_features=32, bias=True)
  (lin5): Linear(in_features=32, out_features=1, bias=True)
  (bn1): ReLU()
  (bn2): ReLU()
  (bn3): ReLU()
  (bn4): ReLU()
  (output): ReLU()
  (emb_drop): Dropout(p=0.2, inplace=False)
  (drops): Dropout(p=0.1, inplace=False)
)

In [4]:
def pet_xgboost(params, X_train, xgb_features, split_index, print_result=True):
    oof_train_xgb = np.zeros((X_train.shape[0]))
    qwks = []

    for n_fold, (train_idx, valid_idx) in enumerate(split_index):  
        X_tr = X_train.iloc[train_idx]
        # type_weights = X_tr['Type'].value_counts(normalize=True).to_dict()
        # sample_weight = (1/X_tr['Type'].apply(lambda x: type_weights[x])).values
        X_val = X_train.iloc[valid_idx]
        
        y_tr = X_tr['AdoptionSpeed'].values    
        y_val = X_val['AdoptionSpeed'].values
            
        d_train = xgb.DMatrix(X_tr[xgb_features], y_tr)
        d_valid = xgb.DMatrix(X_val[xgb_features], y_val)
        
        since = time.time()
        if print_result:
            print('training XGB:')
        # model = xgb.train(params, d_train, num_boost_round = 10000, evals=[(d_valid,'val')],
        #                 early_stopping_rounds=100, 
        #                 verbose_eval=500)
        
        model = xgb.XGBRegressor(**params, n_estimators=10000)
        model.fit(
        X=d_train.get_data(),
        y=d_train.get_label(),  # Assuming d_train is a DMatrix object
        eval_set=[(d_valid.get_data(), d_valid.get_label())],  # Assuming d_valid is a DMatrix object
        # early_stopping_rounds=100,
        # sample_weight=sample_weight,
        verbose=500,
    )
        val_pred = model.predict(d_valid.get_data())
        
        oof_train_xgb[valid_idx] = val_pred
        
        hist = histogram(X_tr['AdoptionSpeed'].astype(int), 
                        int(np.min(X_train['AdoptionSpeed'])), 
                        int(np.max(X_train['AdoptionSpeed'])))
        tr_cdf = get_cdf(hist)
        
        pred_test_y_k = getTestScore2(val_pred, tr_cdf)
        qwk = quadratic_weighted_kappa(X_val['AdoptionSpeed'].values, pred_test_y_k)
        qwks.append(qwk)
        if print_result:
            print("QWK_2 = ", qwk, 'elapsed time:', time.time()-since)
        
    # print('overall rmse: %.5f'%rmse(oof_train_xgb, X_train['AdoptionSpeed']))
    # print('mean QWK =', np.mean(qwks), 'std QWK =', np.std(qwks))
    return np.mean(qwks), np.std(qwks)

In [5]:
emd_pair =  {n: pd.DataFrame(col.unique(),columns=[n]) for n, col in data_df.items() if n in cat_cols}
temp_dict = {}
for i, (k, v) in enumerate(emd_pair.items()):
    emb_vectors = pd.DataFrame(model.embeddings[0].weight.cpu().detach().numpy(), columns=[k +'_'+ str(i) for i in range(5)])
    v = pd.concat([v, emb_vectors], axis=1)
    temp_dict[k] = v

In [6]:
n_splits = 10
split_index = []

kfold = StratifiedKFold(n_splits=n_splits)
for train_idx, valid_idx in kfold.split(data_df, data_df['AdoptionSpeed']):
    split_index.append((train_idx, valid_idx))

## 2. XGB

### 2.1 Basic

In [7]:
params = {
        'objective': 'reg:squarederror',
        'eval_metric':'rmse',
        'tree_method':'hist',
        'eta': 0.01,
        'max_depth': 7,  
        'subsample': 0.8,  
        'colsample_bytree': 0.8,     
        'alpha': 0.05,
        'early_stopping_rounds': 100,
}
to_drop_columns = ['PetID', 'Name', 'RescuerID', 'AdoptionSpeed', 'Description',
                    'BreedName_full','Breed1Name','Breed2Name']
features = [x for x in data_df.columns if x not in to_drop_columns]
xgb_features = features

pet_xgboost(params, X_train=data_df, xgb_features= xgb_features, split_index=split_index, print_result=False)

[0]	validation_0-rmse:1.17508
[500]	validation_0-rmse:1.03235
[985]	validation_0-rmse:1.02839
[0]	validation_0-rmse:1.17499
[500]	validation_0-rmse:1.01433
[1000]	validation_0-rmse:1.00851
[1359]	validation_0-rmse:1.00714
[0]	validation_0-rmse:1.17493
[500]	validation_0-rmse:1.02191
[1000]	validation_0-rmse:1.01253
[1500]	validation_0-rmse:1.01055
[1726]	validation_0-rmse:1.01036
[0]	validation_0-rmse:1.17507
[500]	validation_0-rmse:1.02460
[1000]	validation_0-rmse:1.01788
[1300]	validation_0-rmse:1.01646
[0]	validation_0-rmse:1.17495
[500]	validation_0-rmse:1.04501
[1000]	validation_0-rmse:1.04097
[1485]	validation_0-rmse:1.03985
[0]	validation_0-rmse:1.17491
[500]	validation_0-rmse:1.02434
[1000]	validation_0-rmse:1.01629
[1349]	validation_0-rmse:1.01503
[0]	validation_0-rmse:1.17562
[500]	validation_0-rmse:1.02302
[1000]	validation_0-rmse:1.01699
[1175]	validation_0-rmse:1.01668
[0]	validation_0-rmse:1.17534
[500]	validation_0-rmse:1.02818
[1000]	validation_0-rmse:1.01958
[1500]	val

(0.47323803440088785, 0.018399719969737147)

### 2.2 Combining the embeddings with 5 dimontional features

In [8]:
for k, v in temp_dict.items():
    data_df = pd.merge(data_df, v, on=k, how='left')
    
new_features = [x for x in data_df.columns if x not in to_drop_columns]
for k, v in temp_dict.items():
    new_features.remove(k)

In [9]:
pet_xgboost(params, X_train=data_df, xgb_features= new_features, split_index=split_index, print_result=False)

[0]	validation_0-rmse:1.17539
[500]	validation_0-rmse:1.03430
[1000]	validation_0-rmse:1.02950
[1202]	validation_0-rmse:1.02930
[0]	validation_0-rmse:1.17550
[500]	validation_0-rmse:1.01586
[1000]	validation_0-rmse:1.00926
[1500]	validation_0-rmse:1.00618
[1651]	validation_0-rmse:1.00628
[0]	validation_0-rmse:1.17566
[500]	validation_0-rmse:1.02031
[1000]	validation_0-rmse:1.01267
[1500]	validation_0-rmse:1.01075
[1641]	validation_0-rmse:1.01144
[0]	validation_0-rmse:1.17518
[500]	validation_0-rmse:1.02710
[1000]	validation_0-rmse:1.02227
[1044]	validation_0-rmse:1.02225
[0]	validation_0-rmse:1.17532
[500]	validation_0-rmse:1.04442
[1000]	validation_0-rmse:1.03891
[1230]	validation_0-rmse:1.03876
[0]	validation_0-rmse:1.17519
[500]	validation_0-rmse:1.02697
[1000]	validation_0-rmse:1.02046
[1333]	validation_0-rmse:1.02008
[0]	validation_0-rmse:1.17576
[500]	validation_0-rmse:1.02417
[1000]	validation_0-rmse:1.01882
[1278]	validation_0-rmse:1.01875
[0]	validation_0-rmse:1.17580
[500]	va

(0.46996193606590503, 0.021446443921121338)

### 2.3 Combining the embeddings with 10 dimontional features

In [14]:
from network_setting import *
emb_szs = [
    (c, min(10, (c + 1) // 2)) for _, c in emb_c.items()
]  # embedding sizes for the chosen columns

model = PetFinderModel(emb_szs, n_cont)
model.load_state_dict(torch.load(path + "/model-stratify-10.pt"))
# model.eval()
emd_pair =  {n: pd.DataFrame(col.unique(),columns=[n]) for n, col in data_df.items() if n in cat_cols}
temp_dict = {}
for i, (k, v) in enumerate(emd_pair.items()):
    emb_vectors = pd.DataFrame(model.embeddings[0].weight.cpu().detach().numpy(), columns=[k +'_'+ str(i) for i in range(10)])
    v = pd.concat([v, emb_vectors], axis=1)
    temp_dict[k] = v
    
for k, v in temp_dict.items():
    data_df = pd.merge(data_df, v, on=k, how='left')
    
new_features_10 = [x for x in data_df.columns if x not in to_drop_columns]
for k, v in temp_dict.items():
    new_features_10.remove(k)

In [16]:
pet_xgboost(params, X_train=data_df, xgb_features= new_features_10, split_index=split_index, print_result=False)

[0]	validation_0-rmse:1.17530
[500]	validation_0-rmse:1.03140
[1000]	validation_0-rmse:1.02687
[1247]	validation_0-rmse:1.02652
[0]	validation_0-rmse:1.17516
[500]	validation_0-rmse:1.01578
[1000]	validation_0-rmse:1.00948
[1455]	validation_0-rmse:1.00815
[0]	validation_0-rmse:1.17493
[500]	validation_0-rmse:1.02111
[1000]	validation_0-rmse:1.01302
[1311]	validation_0-rmse:1.01176
[0]	validation_0-rmse:1.17495
[500]	validation_0-rmse:1.02490
[1000]	validation_0-rmse:1.01639
[1292]	validation_0-rmse:1.01454
[0]	validation_0-rmse:1.17501
[500]	validation_0-rmse:1.04494
[871]	validation_0-rmse:1.04230
[0]	validation_0-rmse:1.17501
[500]	validation_0-rmse:1.02921
[1000]	validation_0-rmse:1.02228
[1386]	validation_0-rmse:1.02048
[0]	validation_0-rmse:1.17533
[500]	validation_0-rmse:1.02457
[981]	validation_0-rmse:1.01950
[0]	validation_0-rmse:1.17527
[500]	validation_0-rmse:1.02900
[1000]	validation_0-rmse:1.02278
[1500]	validation_0-rmse:1.01807
[1649]	validation_0-rmse:1.01810
[0]	validat

(0.4753089818891361, 0.01566751316252965)