In [58]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

import importlib
import itertools
import os

from sklearn import svm, tree, neighbors, ensemble, linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
# Metrics used by the evaluation cells; they were not imported explicitly in
# the visible cells of this notebook.
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The star import stays after the sklearn imports and before joblib/xgboost,
# exactly as before, so the order in which names are (re)bound is unchanged.
from model import *
from joblib import Parallel, delayed
import xgboost as xgb
from xgboost import XGBClassifier
# Every relative `dataset/...` path used in this notebook is resolved against
# this directory. The default is the original machine-specific location; set
# the ENSEMBLE_PROJECT_DIR environment variable to run from another checkout
# without editing the notebook.
PROJECT_DIR = os.environ.get('ENSEMBLE_PROJECT_DIR', r'C:\projects\ensemble_methods\projet_ensemble')
os.chdir(PROJECT_DIR)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
# Training split: source features, source labels and target features
# (no label file is loaded for the target training data).
(
    kaggle_source_cate_0_train,
    kaggle_source_cate_0_train_label,
    kaggle_target_cate_0_train,
) = map(np.load, (
    r'dataset/domain_adaptation/train/kaggle_source_cate_0_train.npy',
    r'dataset/domain_adaptation/train/kaggle_source_cate_0_train_label.npy',
    r'dataset/domain_adaptation/train/kaggle_target_cate_0_train.npy',
))

# Test split: features and labels for both the source and the target domain.
(
    kaggle_source_cate_0_test,
    kaggle_source_cate_0_test_label,
    kaggle_target_cate_0_test,
    kaggle_target_cate_0_test_label,
) = map(np.load, (
    r'dataset/domain_adaptation/test/kaggle_source_cate_0_test.npy',
    r'dataset/domain_adaptation/test/kaggle_source_cate_0_test_label.npy',
    r'dataset/domain_adaptation/test/kaggle_target_cate_0_test.npy',
    r'dataset/domain_adaptation/test/kaggle_target_cate_0_test_label.npy',
))

In [66]:
from sklearn.metrics.pairwise import rbf_kernel

def subsample_data(X_source, X_target, n_samples=1000, random_state=None):
    """Draw a random subset of rows, without replacement, from each dataset.

    Parameters
    ----------
    X_source, X_target : numpy.ndarray
        Arrays whose first axis indexes samples. They may have different
        numbers of rows.
    n_samples : int, default 1000
        Requested number of rows per dataset. The request is capped at each
        dataset's own row count, so asking for more rows than a dataset holds
        returns all of its rows (in random order) instead of raising.
    random_state : int or None, default None
        ``None`` draws from NumPy's global random state, so a preceding
        ``np.random.seed(...)`` is honoured. An integer seeds a dedicated
        generator, making the call reproducible on its own.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_source_sub, X_target_sub)``.
    """
    # Both the global module and a RandomState instance expose `.choice`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _draw(X):
        n_rows = X.shape[0]
        # Sampling without replacement cannot return more rows than exist.
        size = min(n_samples, n_rows)
        return X[rng.choice(n_rows, size, replace=False)]

    # Source is drawn before target, matching the original draw order.
    X_source_sub = _draw(X_source)
    X_target_sub = _draw(X_target)
    return X_source_sub, X_target_sub

# Subsample 1000 points from both the source and target datasets.
# NumPy's global RNG is seeded first so the same rows are drawn on every run;
# otherwise the MMD score computed from this subsample changes between runs.
np.random.seed(42)
X_source_sub, X_target_sub = subsample_data(kaggle_source_cate_0_train, kaggle_target_cate_0_train, n_samples=1000)


def calculate_mmd(X_source, X_target, kernel='rbf', gamma=1.0):
    """Kernel discrepancy statistic between a source and a target sample.

    Returns ``mean(K_ss) + mean(K_tt) - 2 * mean(K_st)``, where ``K_ss``,
    ``K_tt`` and ``K_st`` are the source/source, target/target and
    source/target kernel matrices. The means run over every matrix entry,
    diagonals included, which is the biased estimate of the squared Maximum
    Mean Discrepancy.

    Parameters
    ----------
    X_source, X_target : array-like
        Sample matrices passed straight to ``rbf_kernel``.
    kernel : str, default 'rbf'
        Kernel name. Only ``'rbf'`` is implemented.
    gamma : float, default 1.0
        ``gamma`` argument forwarded to ``rbf_kernel``.

    Returns
    -------
    float
        The statistic described above.

    Raises
    ------
    ValueError
        If ``kernel`` is anything other than ``'rbf'``.

    Notes
    -----
    Three full pairwise kernel matrices are materialised, so memory grows
    quadratically with the number of rows; subsample large datasets first.
    """
    # Fail early and clearly instead of hitting an unbound local further down.
    if kernel != 'rbf':
        raise ValueError(
            f"Unsupported kernel {kernel!r}: only 'rbf' is implemented."
        )

    K_ss = rbf_kernel(X_source, X_source, gamma=gamma)
    K_tt = rbf_kernel(X_target, X_target, gamma=gamma)
    K_st = rbf_kernel(X_source, X_target, gamma=gamma)

    mmd = np.mean(K_ss) + np.mean(K_tt) - 2 * np.mean(K_st)
    return mmd

# Score the two subsamples with gamma = 1.0 and report the result.
mmd_score = calculate_mmd(
    X_source=X_source_sub,
    X_target=X_target_sub,
    gamma=1.0,
)
print("MMD Score between source and target:", mmd_score)


MMD Score between source and target: 0.002382934615751672


In [60]:
# Baseline: fit on the labelled source training data only, then score it on
# the source test set.
model = XGBClassifier(eval_metric='logloss', max_depth=3)

# Column 1 of the label array is the target the classifier is fitted on.
model.fit(kaggle_source_cate_0_train, kaggle_source_cate_0_train_label[:,1])

y_pred = model.predict(kaggle_source_cate_0_test)
y_source_test = kaggle_source_cate_0_test_label[:,1]

# scikit-learn metrics take (y_true, y_pred) in that order; f1_score now
# follows the same order as the other three calls.
f1 = f1_score(y_source_test, y_pred)
precision = precision_score(y_source_test, y_pred)
recall = recall_score(y_source_test, y_pred)
# Note: `accuracy` holds the *balanced* accuracy (name kept for later cells).
accuracy = balanced_accuracy_score(y_source_test, y_pred)

# Print the scores
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Balanced accuracy: {accuracy}")


F1 Score: 0.7326906222611743
Precision: 0.8539325842696629
Recall: 0.6415963161933999
Balanced accuracy: 0.8150241130349216


In [61]:
# Score the current `model` on the target-domain test set.
y_pred = model.predict(kaggle_target_cate_0_test)
y_target_test = kaggle_target_cate_0_test_label[:,1]

# scikit-learn metrics take (y_true, y_pred) in that order; f1_score now
# follows the same order as the other three calls.
f1 = f1_score(y_target_test, y_pred)
precision = precision_score(y_target_test, y_pred)
recall = recall_score(y_target_test, y_pred)
# Note: `accuracy` holds the *balanced* accuracy (name kept for later cells).
accuracy = balanced_accuracy_score(y_target_test, y_pred)

# Print the scores
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Balanced accuracy: {accuracy}")


F1 Score: 0.6330311391407174
Precision: 0.6816638370118846
Recall: 0.5908756438557763
Balanced accuracy: 0.7858573201650758


In [62]:
# Pseudo-label the target training data with the current model. Only the
# class probabilities are needed: the label is the arg-max class, and the
# largest probability serves as the confidence of that label.
pseudo_probs = model.predict_proba(kaggle_target_cate_0_train)

# NOTE(review): the test below compares the *largest* class probability with
# the threshold. If the model has two classes (as the binary metrics used in
# this notebook suggest), that value is always >= 0.5, so a threshold of 0.4
# rejects nothing and every target row is kept. Raise it above 0.5 to filter
# at all -- TODO confirm the intended value; left unchanged so results match.
confidence_threshold = 0.4
pseudo_labels = np.where(pseudo_probs.max(axis=1) > confidence_threshold, pseudo_probs.argmax(axis=1), -1)

# Filter to only use confident pseudo-labels (-1 marks a rejected row)
high_confidence_mask = pseudo_labels != -1
X_target_filtered = kaggle_target_cate_0_train[high_confidence_mask]
pseudo_labels_filtered = pseudo_labels[high_confidence_mask]


In [63]:
# Append the pseudo-labelled target rows to the labelled source rows.
X_combined_train = np.concatenate(
    (kaggle_source_cate_0_train, X_target_filtered), axis=0
)
y_combined_train = np.concatenate(
    (kaggle_source_cate_0_train_label[:,1], pseudo_labels_filtered)
)

# Features and labels must end up with the same number of rows.
print("Shape of X_combined_train:", X_combined_train.shape)
print("Shape of y_combined_train:", y_combined_train.shape)

# Fit the same estimator again, now on the combined data. No eval_set or
# early-stopping argument is passed in this call.
model.fit(X_combined_train, y_combined_train, verbose=True)


Shape of X_combined_train: (103846, 51)
Shape of y_combined_train: (103846,)


In [64]:
# Score on the target-domain test set through the `evaluate_model` helper.
# NOTE(review): the helper also receives the combined training data, and its
# recorded output starts with "Fitting the model...", so it appears to fit the
# model again -- verify against the helper's definition.
target_test_scores = evaluate_model(
    model,
    X_combined_train,
    y_combined_train,
    kaggle_target_cate_0_test,
    kaggle_target_cate_0_test_label[:,1],
)

# Report the returned scores for the target domain.
print("Target Test Scores:", target_test_scores)


Fitting the model...
Model fit complete. Predicting...
F1 Score: 0.6347457627118644
Precision: 0.7482517482517482
Recall: 0.551140544518028
Balanced accuracy: 0.7691321750744041
Target Test Scores: {'F1 Score': 0.6347457627118644, 'Precision': 0.7482517482517482, 'Recall': 0.551140544518028, 'Balanced accuracy': 0.7691321750744041}
