In [1]:
import numpy as np
import load_data
import scipy
import time
import pandas as pd
import warnings
import os
from joblib import dump, load
warnings.filterwarnings("ignore")

###### read data

In [23]:
# Load everything through the project's load_data helpers.
# map_label is later scanned by select_data: its values are compared with
# newsgroup names and its keys are compared with the entries of the label array.
map_label = load_data.read_map(load_data.test_map_path)
train_data, train_label = load_data.read_data(load_data.train_data_path, load_data.train_label_path)
test_data, test_label = load_data.read_data(load_data.test_data_path, load_data.test_label_path)
# words_list is the vocabulary; len(words_list)+1 is used below as the feature count.
words_list = load_data.read_words(load_data.vocabulary_path)

###### combine train data and test data together

In [24]:
def combine(train_data, train_label, test_data, test_label):
    """Pool the train and test splits into a single data/label pair.

    Both data collections are converted to ``object`` arrays before being
    joined (train first, then test); the labels are joined in the same order.

    Returns:
        tuple: ``(data, label)`` where ``data`` is the concatenated object
        array and ``label`` the concatenated label array.
    """
    samples = np.concatenate(
        [np.asarray(part, dtype=object) for part in (train_data, test_data)]
    )
    targets = np.concatenate((train_label, test_label))
    return samples, targets
    

def select_data(data, label, source_domain, target_domain):
    """Split the pooled documents into a source and a target domain.

    Each entry of ``source_domain`` / ``target_domain`` is a list of newsgroup
    names forming one class. Names are translated to label ids by scanning the
    module-level ``map_label`` (first key whose value equals the name); names
    that are not found are skipped silently. Every document whose label equals
    one of those ids is selected and relabelled with the index of its group.

    Args:
        data: array of documents, indexable with a boolean mask.
        label: array of label ids aligned with ``data``.
        source_domain: list of lists of newsgroup names (one list per class).
        target_domain: list of lists of newsgroup names (one list per class).
            It no longer has to have the same length as ``source_domain``.

    Returns:
        tuple: ``(source_data, target_data, source_label, target_label)``.
        The data arrays have dtype ``object``. The label arrays hold the group
        index of each document; as before they come out as float64 whenever at
        least one document is selected (``np.ones`` promotes them) and as an
        empty int32 array otherwise.
    """

    def _label_ids(domain_groups):
        # Translate each group of newsgroup names into the matching label ids.
        id_groups = []
        for names in domain_groups:
            ids = []
            for name in names:
                for key, value in map_label.items():
                    if value == name:
                        ids.append(key)
                        break
            id_groups.append(ids)
        return id_groups

    def _gather(id_groups):
        # Collect the pieces first and concatenate once instead of re-copying
        # the growing arrays for every newsgroup. The leading empty arrays keep
        # the result dtypes identical to the incremental version.
        # `object` replaces `np.object`, which was removed in NumPy 1.24.
        data_parts = [np.empty(shape=(0,), dtype=object)]
        label_parts = [np.empty(shape=(0,), dtype=np.int32)]
        for group_index, ids in enumerate(id_groups):
            for label_id in ids:
                selected = data[label == label_id]
                data_parts.append(selected)
                label_parts.append(np.ones(shape=(len(selected))) * group_index)
        return np.concatenate(data_parts), np.concatenate(label_parts)

    # The two domains are resolved independently, so a target list that is
    # longer or shorter than the source list is handled correctly.
    source_data, source_label = _gather(_label_ids(source_domain))
    target_data, target_label = _gather(_label_ids(target_domain))
    return source_data, target_data, source_label, target_label

# Pool the train and test splits; select_data re-partitions this pool by newsgroup.
data, label = combine(train_data, train_label, test_data, test_label)

###### define statistic function

In [4]:
import os

def compare_prediction(prediction, label, class_number=20):
    """Compute, for every class, the fraction of its samples predicted correctly.

    Args:
        prediction: predicted class ids (array-like).
        label: true class ids, aligned with ``prediction`` (array-like).
        class_number: number of classes; classes are assumed to be ``0..class_number-1``.

    Returns:
        numpy.ndarray: one ratio per class. A class that has no sample in
        ``label`` yields ``nan`` (the same value 0/0 produced before, but
        without relying on a division by zero).
    """
    prediction = np.asarray(prediction)
    label = np.asarray(label)
    ratio_list = []
    for class_id in range(class_number):
        class_mask = label == class_id
        # Kept under its own name: the original overwrote the `class_number`
        # parameter here, which only worked because range() was already built.
        sample_count = np.sum(class_mask)
        if sample_count == 0:
            ratio_list.append(np.nan)
            continue
        correct_prediction = np.sum(prediction[class_mask] == class_id)
        ratio_list.append(correct_prediction / sample_count)
    return np.asarray(ratio_list)


def save_to_file(method_name, ratio_list, accuracy, domain_name, filename="save.xlsx"):
    """Append one result row to an Excel sheet, creating the sheet if needed.

    Args:
        method_name: value for the "method" column.
        ratio_list: per-domain ratios, one per entry of ``domain_name``.
        accuracy: value for the "sum" column.
        domain_name: column names used when a new sheet is created.
        filename: path of the Excel file to read and rewrite.
    """
    if not os.path.exists(filename):
        table = pd.DataFrame(columns=["method", *domain_name, "sum"])
    else:
        table = pd.read_excel(filename)

    new_row = [method_name]
    new_row.extend(ratio_list)
    new_row.append(accuracy)

    # Appending at position len(table) adds the row after the existing ones.
    table.loc[len(table)] = new_row
    table.to_excel(filename, index=False)

In [25]:
# Column headers for the per-class ratios written by save_to_file; one name per
# group below, in the same order.
domain_name = ["comp", "rec", "sci", "talk"]

# Newsgroups forming the source domain. The position of each inner list is the
# class label that select_data assigns to the documents of that group.
source_domain = [["comp.graphics","comp.os.ms-windows.misc"], 
                 ["rec.autos","rec.motorcycles"], 
                 ["sci.crypt","sci.electronics"], 
                 ["talk.politics.guns","talk.politics.mideast"]]
# Newsgroups forming the target domain, grouped in the same order as above.
target_domain = [["comp.sys.ibm.pc.hardware","comp.sys.mac.hardware","comp.windows.x"], 
                 ["rec.sport.baseball","rec.sport.hockey"],
                 ["sci.med", "sci.space"],
                 ["talk.politics.misc", "talk.religion.misc"]]

# One call builds both domains (documents plus group-index labels).
source_data, target_data, source_label, target_label = select_data(data, label, source_domain, target_domain)

#### Test on SA - SVM

Because this setting is inductive, the feature spaces of the source domain and the target domain are not the same, so SA cannot be applied directly. Nevertheless, I will first try this method as-is, and then try to adapt it.

In [None]:
from adapt.feature_based import SA
from sklearn.svm import SVC


def SVM_SSL(name, **kwargs):
    """
    Compare a plain SVM with an SA-wrapped SVM on the full vocabulary.

    Step 1 trains (or loads from ``model/TL/<name>.joblib``) an ``SVC`` on the
    source documents, evaluates it on the target documents and appends the
    result to ``TL_none.xlsx``.
    Step 2 fits ``SA(SVC(**kwargs))`` on dense copies of the same matrices,
    evaluates it on the target documents and appends the result to
    ``TL_sa.xlsx``. The SA model is never written to disk.

    Args:
        name: label used for the cached model file and for the result rows.
        **kwargs: forwarded unchanged to ``sklearn.svm.SVC``.
    """
    feature_count = len(words_list) + 1
    source_data_matrix = load_data.to_scipy_sparse_matrix(source_data, feature_count)
    target_data_matrix = load_data.to_scipy_sparse_matrix(target_data, feature_count)

    model_path = "model/TL/"+name+".joblib"
    if os.path.exists(model_path):
        print("load " + name + " model")
        # joblib.load unpickles the file: only load caches produced locally.
        svc = load(model_path)
        print("loading complete")
    else:
        print("train " + name + " model")
        svc = SVC(**kwargs)
        svc.fit(source_data_matrix, source_label)
        # dump() does not create missing directories, so make sure it exists.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        dump(svc, model_path)
        print("training complete")

    target_prediction = svc.predict(target_data_matrix)
    target_accuracy = np.sum(target_prediction == target_label) / len(target_label)
    print("accuracy on target domain is: {:.3f}".format(target_accuracy))
    ratio_list = compare_prediction(target_prediction, target_label, len(target_domain))
    save_to_file(name, ratio_list, target_accuracy, domain_name, "TL_none.xlsx")

    # SA is fitted on dense arrays here; with the full vocabulary this is
    # memory-hungry, which is why the fitted model is not cached.
    source_data_matrix = source_data_matrix.toarray()
    target_data_matrix = target_data_matrix.toarray()

    model = SA(SVC(**kwargs), Xt=target_data_matrix, random_state=0)
    model.fit(source_data_matrix, source_label)
    print("this model can't be save, it has 10 GB")

    target_prediction = model.predict(target_data_matrix)
    target_accuracy = np.sum(target_prediction == target_label) / len(target_label)
    print("accuracy on target domain is for transfer learing SA: {:.3f}".format(target_accuracy))
    ratio_list = compare_prediction(target_prediction, target_label, len(target_domain))
    save_to_file(name, ratio_list, target_accuracy, domain_name, "TL_sa.xlsx")

# Linear-kernel run on the full vocabulary (len(words_list)+1 features).
SVM_SSL("SA_SVM_linear", kernel="linear")

#### feature analysis

In [26]:
def feature_analyse():
    """Print how many vocabulary features occur in the source and target domains.

    The counts are reported twice: for features whose summed value over a
    domain is at least 1, and for features whose summed value is at least 50.
    """
    width = len(words_list) + 1
    dense_source = load_data.to_scipy_sparse_matrix(source_data, width).toarray()
    dense_target = load_data.to_scipy_sparse_matrix(target_data, width).toarray()
    print("there are total {} features for each data".format(len(dense_source[0])))

    # Column totals are computed once and thresholded twice below.
    source_totals = dense_source.sum(axis=0)
    target_totals = dense_target.sum(axis=0)

    seen_in_source = source_totals >= 1
    seen_in_target = target_totals >= 1
    print("{} features are used in the source domain, and {} features are used in the "
          "target domain".format(seen_in_source.sum(), seen_in_target.sum()))
    in_both = seen_in_source & seen_in_target
    print("{} features are used for all the domain".format(in_both.sum()))
    in_either = seen_in_source | seen_in_target
    print("{} features are used in at least one domain".format(in_either.sum()))

    frequent_in_source = source_totals >= 50
    frequent_in_target = target_totals >= 50
    print("when n >= 50, {} features are used in the source domain, and {} features are used in the "
          "target domain".format(frequent_in_source.sum(), frequent_in_target.sum()))
    frequent_in_either = frequent_in_source | frequent_in_target
    print("when n >= 50, {} features are used in at least one domain".format(frequent_in_either.sum()))
    
# Print the feature-usage counts for the source and target domains.
feature_analyse()

there are total 61189 features for each data
41799 features are used in the source domain, and 43969 features are used in the target domain
27364 features are used for all the domain
58404 features are used in at least one domain
when n >= 50, 3963 features are used in the source domain, and 4118 features are used in the target domain
when n >= 50, 5212 features are used in at least one domain


#### select the common features

In [27]:
from scipy import sparse

def common_feature(minimal_number):
    """Restrict both domains to a shared subset of feature columns.

    A column is kept when its summed value reaches ``minimal_number`` in at
    least one domain AND it is non-zero (sum >= 1) in both domains.

    Args:
        minimal_number: threshold on the per-domain column sum.

    Returns:
        tuple: ``(common_source_data, common_target_data)`` as dense arrays
        containing only the kept columns.
    """
    width = len(words_list) + 1
    dense_source = load_data.to_scipy_sparse_matrix(source_data, width).toarray()
    dense_target = load_data.to_scipy_sparse_matrix(target_data, width).toarray()

    source_totals = dense_source.sum(axis=0)
    target_totals = dense_target.sum(axis=0)

    # frequent enough in at least one domain ...
    frequent_somewhere = (source_totals >= minimal_number) | (target_totals >= minimal_number)
    # ... and present in both of them
    present_in_both = (source_totals >= 1) & (target_totals >= 1)
    keep = frequent_somewhere & present_in_both

    return dense_source[:, keep], dense_target[:, keep]
    
# Keep features whose column sum reaches 50 in at least one domain and that occur in both.
common_source_data, common_target_data = common_feature(50)

#### separate the target domain data into training and test

In [28]:
def separate(seed=None):
    """Randomly split the target domain into two halves (train / test).

    Reads the module-level ``common_target_data`` and ``target_label``.

    Args:
        seed: optional seed for the shuffle. ``None`` (the default) keeps the
            previous behaviour and draws from NumPy's global random state;
            an integer makes the split reproducible across kernel restarts.

    Returns:
        tuple: ``(data_train, data_test, label_train, label_test)``; the first
        ``length // 2`` shuffled rows form the training half.
    """
    print("this code can only be runned once!")
    length = common_target_data.shape[0]
    if seed is None:
        index = np.random.permutation(length)
    else:
        # A private generator leaves the global random state untouched.
        index = np.random.RandomState(seed).permutation(length)
    half = length // 2
    index_train = index[:half]
    index_test = index[half:]
    data_train = common_target_data[index_train]
    data_test = common_target_data[index_test]
    label_train = target_label[index_train]
    label_test = target_label[index_test]
    return data_train, data_test, label_train, label_test

# WARNING: this rebinds common_target_data / target_label to the training half,
# so re-running the cell halves the data again. The fixed seed keeps the split
# (and any model cached on disk from it) consistent between sessions.
common_target_data, common_target_data_test, target_label, target_label_test = separate(seed=0)

In [13]:
# (source documents, features kept by common_feature)
common_source_data.shape

(7735, 4929)

In [14]:
# In notebook order this is the training half returned by separate().
common_target_data.shape

(4140, 4929)

#### Test TL method

In [17]:
from adapt.feature_based import SA
from adapt.instance_based import IWC
from adapt.instance_based import TrAdaBoost
from scipy import sparse

def none_TL(name, model):
    """
    Baseline without transfer learning.

    Fits ``model`` on the source data only (passed as a CSR matrix), scores it
    on the held-out target half and appends the result to ``TL_none.xlsx``.
    """
    model.fit(sparse.csr_matrix(common_source_data), source_label)
    predicted = model.predict(common_target_data_test)

    hits = np.sum(predicted == target_label_test)
    accuracy = hits / len(target_label_test)
    print("accuracy on target domain is: {:.3f}".format(accuracy))

    per_class = compare_prediction(predicted, target_label_test, len(target_domain))
    save_to_file(name, per_class, accuracy, domain_name, "TL_none.xlsx")

def SA_method(name, model):
    """
    Wrap ``model`` in SA, fit it on the source data and score the held-out target half.

    The target training half is handed to SA as ``Xt`` / ``yt``; the result row
    is appended to ``TL_sa.xlsx``.
    """
    adapted = SA(model, Xt=common_target_data, yt=target_label, random_state=0)
    adapted.fit(common_source_data, source_label)
    predicted = adapted.predict(common_target_data_test)

    hits = np.sum(predicted == target_label_test)
    accuracy = hits / len(target_label_test)
    print("accuracy on target domain is for transfer learing SA: {:.3f}".format(accuracy))

    per_class = compare_prediction(predicted, target_label_test, len(target_domain))
    save_to_file(name, per_class, accuracy, domain_name, "TL_sa.xlsx")

def Boost_method(name, model):
    """
    Wrap ``model`` in TrAdaBoost, fit it on the source data and score the held-out target half.

    The target training half is handed to TrAdaBoost as ``Xt`` / ``yt`` and
    5 boosting estimators are used; the result row is appended to
    ``TL_boost.xlsx``.
    """
    model = TrAdaBoost(model, Xt=common_target_data, yt=target_label,n_estimators=5,random_state=0)
    model.fit(common_source_data, source_label)
    target_prediction = model.predict(common_target_data_test)
    target_accuracy = np.sum(target_prediction == target_label_test) / len(target_label_test)
    # The message used to say "SA" (copied from SA_method), which mislabelled
    # the TrAdaBoost results in the log.
    print("accuracy on target domain is for transfer learning TrAdaBoost: {:.3f}".format(target_accuracy))
    ratio_list = compare_prediction(target_prediction, target_label_test, len(target_domain))
    save_to_file(name, ratio_list, target_accuracy, domain_name, "TL_boost.xlsx")


# none_TL("SA_SVM_linear", model=SVC(kernel="linear"))
# SA_method("SA_SVM_linear", model=SVC(kernel="linear"))
# Boost_method("SA_SVM_linear", model=SVC(kernel="linear"))

In [16]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def start_TL_test(method):
    """Run ``method(name, model=...)`` once for each of five classifiers.

    A failure of one classifier (while building it or while running
    ``method``) is reported on stdout and does not stop the remaining runs.

    Args:
        method: callable taking a result name and a ``model`` keyword argument.
    """
    # Factories instead of instances: each estimator is created inside the
    # try block, so construction errors are reported like any other failure.
    candidates = [
        ("SVM_linear", lambda: SVC(kernel="linear")),
        ("MLR_OVR", lambda: LogisticRegression(multi_class='ovr', max_iter=500)),
        ("Multinomial_Naive_Bayes", lambda: MultinomialNB()),
        ("decision_tree", lambda: DecisionTreeClassifier()),
        ("random_forest", lambda: RandomForestClassifier()),
    ]
    for run_name, build_model in candidates:
        try:
            method(run_name, model=build_model())
        except Exception as e:
            print("error happened when execute {}: ".format(run_name), str(e))
    


In [171]:
# Run the five classifiers through each strategy in turn: no transfer learning,
# SA, then TrAdaBoost. Each strategy appends its rows to its own Excel file.
print("Mind that all the transfer learning model is extremely huge, I can't save those model")
print("start none TL: ")
start_TL_test(none_TL)
print("start SA: ")
start_TL_test(SA_method)
print("start TrAdaBoost: ")
start_TL_test(Boost_method)

start none TL: 
accuracy on target domain is: 0.522
accuracy on target domain is: 0.540
accuracy on target domain is: 0.670
accuracy on target domain is: 0.395
accuracy on target domain is: 0.537
start SA: 
Fit transform...
Fit Estimator...
accuracy on target domain is for transfer learing SA: 0.489
Fit transform...
Fit Estimator...
accuracy on target domain is for transfer learing SA: 0.504
Fit transform...
Fit Estimator...
error happened when execute Multinomial_Naive_Bayes:  Negative values in data passed to MultinomialNB (input X)
Fit transform...
Fit Estimator...
accuracy on target domain is for transfer learing SA: 0.302
Fit transform...
Fit Estimator...
accuracy on target domain is for transfer learing SA: 0.342
start TrAdaBoost: 
Iteration 0 - Error: 0.0010
Iteration 1 - Error: 0.0005
Iteration 2 - Error: 0.0002
Iteration 3 - Error: 0.0004
Iteration 4 - Error: 0.0004
accuracy on target domain is for transfer learing SA: 0.861
Iteration 0 - Error: 0.0514
Iteration 1 - Error: 0.0

#### Test on the training and test set

In [30]:
from sklearn.naive_bayes import MultinomialNB

def Boost_Bayes():
    """Train (or load) a TrAdaBoost + MultinomialNB model and print its accuracies.

    The fitted model is cached in ``model/TL/Final_Tr_Ada_Boost.joblib``.
    Accuracy is reported on the source data and on the target training half,
    followed by ``Est = 2 - source_accuracy - target_accuracy`` (the sum of
    the two error rates).
    """
    name = "Final_Tr_Ada_Boost"
    model_path = "model/TL/"+name+".joblib"
    if os.path.exists(model_path):
        print("load " + name + " model")
        # The cache is keyed by name only: a file from an earlier session may
        # have been trained on a different random target split.
        # joblib.load unpickles the file, so only load caches produced locally.
        model = load(model_path)
        print("loading complete")
    else:
        print("train " + name + " model")
        # Built only when needed; a cached model makes this construction useless.
        model = TrAdaBoost(MultinomialNB(), Xt=common_target_data, yt=target_label,n_estimators=5,random_state=0)
        model.fit(common_source_data, source_label)
        # dump() does not create missing directories, so make sure it exists.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        dump(model, model_path)
        print("training complete")

    source_prediction = model.predict(common_source_data)
    source_accuracy = np.sum(source_prediction == source_label) / len(source_label)
    print("accuracy on source domain is : {:.3f}".format(source_accuracy))
    target_prediction = model.predict(common_target_data)
    target_accuracy = np.sum(target_prediction == target_label) / len(target_label)
    print("accuracy on target domain is : {:.3f}".format(target_accuracy))
    print("Est = {:.3f}".format(2-source_accuracy-target_accuracy))
    
# Loads model/TL/Final_Tr_Ada_Boost.joblib when it exists, otherwise trains and saves it.
Boost_Bayes()

load Final_Tr_Ada_Boost model
setting
loading complete
accuracy on source domain is : 0.796
accuracy on target domain is : 0.983
Est = 0.221


#### error bound

In [28]:
import math

alpha = 0.5
beta = 0.651
# NOTE(review): 7735 and 4140 match the row counts displayed for
# common_source_data and common_target_data; presumably source + both target
# halves -- confirm.
N = 7735+4140+4140
dvc = 2
theta = 0.1

# NOTE(review): the literals 0.5, 0.255 and 0.1 in the first term are kept as
# written; it is not visible here whether they are meant to track alpha/theta.
first_term = 2*(1-0.5)*(0.255+0.5*0.1)
# Weighting factor built from alpha and beta.
weight_factor = 4*math.sqrt(alpha*alpha/beta + (1-alpha)**2/(1-beta))
# Term that shrinks as the sample count N grows.
sample_factor = math.sqrt(2/N*dvc*np.log(2*N+2)+2/N*np.log(8/theta))

error_bound = first_term + weight_factor*sample_factor

print("error bound is: {:.3f}".format(error_bound))

error bound is: 0.540
