#Import Data

In [None]:
import pandas as pd 
# Load the two activity datasets. Later cells treat S1 as the source domain
# and S2 as the target domain.
# NOTE(review): the paths are absolute and look like a mounted Google Drive
# folder in Colab -- confirm the drive is mounted before running this cell.
Final_Data_S1 = pd.read_csv("/content/drive/MyDrive/Final_Data_S1_AR.csv")
Final_Data_S2 = pd.read_csv("/content/drive/MyDrive/Final_Data_S2_AR.csv")

#Class Transformation
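We apply a PCA-SMOTE transformation separately to the source and the target datasets: PCA projects each dataset onto its own principal components (7 for the source, 6 for the target), and SMOTE then oversamples the minority activity classes. This is the first step toward transforming the data into a new common feature space.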

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE 
import numpy as np 
import pandas as pd 
from sklearn.decomposition import PCA
# Seed NumPy's global RNG before any estimator is fitted.
np.random.seed(1)

# Class labels of each domain.
labels_S1 = Final_Data_S1['activity']
labels_S2 = Final_Data_S2['activity']

# Feature matrices: everything except the two time columns and the label.
Data_S1 = Final_Data_S1.drop(columns=['year_day', 'time', 'activity'])
Data_S2 = Final_Data_S2.drop(columns=['year_day', 'time', 'activity'])

# Each domain gets its own PCA projection: 7 components for the source,
# 6 for the target. `pca` ends up bound to the target projection.
pca = PCA(n_components=7)
principalComponents_S1 = pca.fit_transform(Data_S1)
pca = PCA(n_components=6)
principalComponents_S2 = pca.fit_transform(Data_S2)

# Oversample the projected data of each domain with the same SMOTE object.
# NOTE(review): resampling happens before any train/test split made in later
# cells, so synthetic rows can end up on both sides of those splits.
sm = SMOTE(random_state=1)
principalComponents_S1, labels_S1 = sm.fit_resample(principalComponents_S1, labels_S1)
principalComponents_S2, labels_S2 = sm.fit_resample(principalComponents_S2, labels_S2)

# Wrap the resampled data in pandas objects, naming the components
# pc_Source_1..7 and pc_Target_1..6.
principalComponents_S1 = pd.DataFrame(
    principalComponents_S1,
    columns=[f'pc_Source_{i}' for i in range(1, 8)],
)
principalComponents_S2 = pd.DataFrame(
    principalComponents_S2,
    columns=[f'pc_Target_{i}' for i in range(1, 7)],
)
labels_S1 = pd.Series(labels_S1)
labels_S2 = pd.Series(labels_S2)

#Class Divergence_Calculation
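We calculate the divergence between every pair of source and target features using the Jensen–Shannon divergence (JSD), computed per activity class and weighted by the class frequencies in the target dataset.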

In [None]:
import numpy as np 
import pandas as pd 
from scipy.spatial import distance

class Divergence_Calculation:
  """Class-conditional histograms and the source/target divergence matrix.

  Typical use: attach the labels with `add_target`, build one histogram table
  per domain with `prob_dist_divergence_initial`, then compare the two tables
  with `jsd_final`.
  """

  def add_target(self, Final_Data, labels):
    """Return a copy of `Final_Data` with an 'activity' column set to `labels`.

    The input frame is left untouched, so re-running a cell never sees a
    frame that was silently modified by an earlier call.

    Parameters:
      Final_Data: DataFrame of features.
      labels: object exposing `.values` (e.g. a Series) with one label per row.

    Returns:
      A new DataFrame with the extra (or overwritten) 'activity' column.
    """
    return Final_Data.assign(activity=labels.values)

  def prob_dist_divergence_initial(self, data, num_pc, name, bins=5):
    """Build per-class normalized histograms for every principal component.

    For each column 'pc_<name>_<i>' (i = 1..num_pc) and each distinct value of
    data['activity'], the values of that class are binned into `bins`
    equal-width bins spanning that class's own min/max, and the counts are
    divided by the class size.

    Parameters:
      data: DataFrame holding the 'pc_<name>_<i>' columns and 'activity'.
      num_pc: number of principal-component columns to process.
      name: domain tag used in the column names (e.g. 'Source', 'Target').
      bins: integer number of histogram bins (default 5).

    Returns:
      DataFrame with `bins` rows. The first column is an all-zero placeholder
      named 'None' (kept so the output layout stays as before); every other
      column is named '<feature><label>' where <label> is str() of the class.
    """
    feature_names = ['pc_' + name + '_' + str(i) for i in range(1, num_pc + 1)]
    grouped = data.groupby('activity')

    tables = [pd.DataFrame(data=np.zeros((bins, 1)), columns=['None'])]
    for feature in feature_names:
      histograms = {}
      # The class labels come from the data itself (groupby yields them in
      # sorted order) rather than from a hard-coded list.
      for label, values in grouped[feature]:
        counts = np.histogram(values, bins=bins)[0]
        histograms[feature + str(label)] = counts / len(values)
      tables.append(pd.DataFrame(histograms))

    return pd.concat(tables, axis=1)

  @staticmethod
  def _source_features(histograms, labels):
    """Recover feature names from '<feature><label>' histogram columns.

    A name is accepted only if the table holds a column for every label, so
    the result can always be looked up safely. First-seen order is preserved.
    """
    suffixes = [str(label) for label in labels]
    available = set(histograms.columns)
    features = []
    for column in histograms.columns:
      if not isinstance(column, str):
        continue
      for suffix in suffixes:
        if not column.endswith(suffix):
          continue
        candidate = column[:len(column) - len(suffix)]
        if not candidate or candidate in features:
          continue
        if all(candidate + other in available for other in suffixes):
          features.append(candidate)
    return features

  def jsd_final(self, X1_Source_init, X2_Target_init, Final_Data_Target,
                Final_Data_Source=None):
    """Compute the class-weighted divergence of every source/target pair.

    For each (source feature, target feature) pair, the Jensen-Shannon value
    (base 2) between the two per-class histograms is summed over the classes,
    each weighted by the relative frequency of that class in
    `Final_Data_Target`. Note that `scipy.spatial.distance.jensenshannon`
    returns the Jensen-Shannon distance, i.e. the square root of the
    divergence.

    Parameters:
      X1_Source_init: histogram table of the source domain.
      X2_Target_init: histogram table of the target domain.
      Final_Data_Target: target DataFrame including 'activity'.
      Final_Data_Source: optional source DataFrame. Every column except
        'activity' becomes a row of the result. When omitted, the source
        features are recovered from the columns of `X1_Source_init`, so the
        method no longer depends on a module-level variable.

    Returns:
      DataFrame indexed by source feature with one column per target feature.
    """
    target_prob = Final_Data_Target.activity.value_counts() / len(Final_Data_Target)
    target_features = [c for c in Final_Data_Target.columns if c != 'activity']

    if Final_Data_Source is None:
      source_features = self._source_features(X1_Source_init, target_prob.index)
    else:
      source_features = [c for c in Final_Data_Source.columns if c != 'activity']

    matrix = pd.DataFrame(
        data=np.zeros((len(source_features), len(target_features))),
        index=source_features,
        columns=target_features,
    )

    for target_feature in target_features:
      for source_feature in source_features:
        matrix.loc[source_feature, target_feature] = sum(
            weight * distance.jensenshannon(
                X1_Source_init[source_feature + str(label)],
                X2_Target_init[target_feature + str(label)],
                2.0,
            )
            for label, weight in target_prob.items()
        )
    return matrix

#script
divergence_calculation = Divergence_Calculation()
# Attach the (resampled) activity labels to the projected features of each domain.
Final_Data_Source = divergence_calculation.add_target(principalComponents_S1, labels_S1)
Final_Data_Target = divergence_calculation.add_target(principalComponents_S2, labels_S2)

# Per-class histograms of every principal component; 7 and 6 are the numbers
# of components kept for the source and the target respectively.
X1_Source_init = divergence_calculation.prob_dist_divergence_initial(Final_Data_Source, 7, 'Source')
X2_Target_init = divergence_calculation.prob_dist_divergence_initial(Final_Data_Target, 6, 'Target')

# Matrix of divergences: one row per source feature, one column per target feature.
divergence_matrix = divergence_calculation.jsd_final(X1_Source_init,X2_Target_init, Final_Data_Target)



#Class PreMapping
We use thresholding to avoid negative transfer, and we create the preference lists used by the mapping procedure.

In [None]:
import numpy as np 
import pandas as pd 

class PreMapping:
  """Prepares the divergence matrix for the feature-matching step."""

  def threshold_select(self, divergence_matrix, threshold):
    """Drop every column whose entries are all strictly above `threshold`.

    A column is kept as soon as at least one of its entries is not greater
    than `threshold`. The remaining columns keep their original order.
    """
    above = (divergence_matrix > threshold).values
    # True for the columns in which every single row exceeds the threshold.
    all_above = above.all(axis=0)
    kept_positions = [pos for pos, drop in enumerate(all_above) if not drop]
    return divergence_matrix.iloc[:, kept_positions]

  def preferences(self, divergence_matrix):
    """Build preference lists ordered by increasing divergence.

    Returns:
      (priority_source, priority_target): the first dict maps each row label
      to the column labels sorted by that row's values; the second maps each
      column label to the row labels sorted by that column's values.
    """
    priority_source = {}
    for source_feature in divergence_matrix.index:
      ranked = divergence_matrix.loc[source_feature, :].sort_values()
      priority_source[source_feature] = list(ranked.index)

    priority_target = {}
    for target_feature in divergence_matrix.columns:
      ranked = divergence_matrix.loc[:, target_feature].sort_values()
      priority_target[target_feature] = list(ranked.index)

    return priority_source, priority_target

#script
preMapping = PreMapping()
# Optional thresholding step, currently disabled: it would drop the target
# features whose divergence to every source feature exceeds 0.35.
#divergence_matrix = preMapping.threshold_select(divergence_matrix, 0.35)
# Preference lists of each source and each target feature, ordered by increasing divergence.
priority_source, priority_target = preMapping.preferences(divergence_matrix)
  

#Class Mapping
We applied the Gale-Shapley Algorithm to map features from both domains based on their divergence values.

In [None]:
import numpy as np 
import pandas as pd 
from collections import defaultdict
class Mapping:
    """Gale-Shapley stable matching between two sets of items.

    The vocabulary of the classic stable-marriage problem is kept: "men" are
    the proposing side and "women" the receiving side.
    """

    def __init__(self, men, women):
        '''
        Store the preference lists and pre-compute rank lookups.

        `men` maps each proposer to its ordered list of preferred receivers,
        `women` maps each receiver to its ordered list of preferred proposers
        (most preferred first in both cases).
        '''
        self.M = men
        self.W = women
        self.wives = {}
        self.pairs = []

        # Ranks are indexed once here so comparisons during matching are
        # plain dictionary lookups.
        self.mrank = defaultdict(dict)  # `mrank[m][w]` is m's ranking of w
        self.wrank = defaultdict(dict)  # `wrank[w][m]` is w's ranking of m

        for m, prefs in men.items():
            for i, w in enumerate(prefs):
                self.mrank[m][w] = i

        for w, prefs in women.items():
            for i, m in enumerate(prefs):
                self.wrank[w][m] = i

    def __call__(self):
        '''Run the matching; equivalent to `match()` with no arguments.'''
        return self.match()

    def prefers(self, w, m, h):
        '''Test whether w prefers m over h.'''
        return self.wrank[w][m] < self.wrank[w][h]

    def after(self, m, w):
        '''
        Return the woman ranked right after w by m, or None when w is the
        last entry of m's preference list.
        '''
        i = self.mrank[m][w] + 1    # index of woman following w in list of prefs
        prefs = self.M[m]
        return prefs[i] if i < len(prefs) else None

    def match(self, men=None, next=None, wives=None):
        '''
        Match every man with the best woman who accepts him.

        Parameters:
            men: iterable of men still to be matched (default: all men).
            next: dict mapping each man to the woman he proposes to next
                (default: everyone's first choice). Updated in place.
            wives: dict mapping women to their current partner (default:
                empty). Updated in place.

        Returns:
            The `wives` dict, mapping each matched woman to her partner. It is
            also stored in `self.wives`; `self.pairs` holds the same matching
            as a list of (man, woman) tuples.

        The proposals are processed in a loop rather than by recursion, so the
        number of proposals is not bounded by the interpreter's recursion
        limit. A man whose preference list is exhausted stays unmatched
        instead of raising IndexError.
        '''
        if men is None:
            men = self.M.keys()         # get the complete list of men
        if next is None:
            # if not defined, map each man to their first preference
            next = dict((m, (rank[0] if len(rank) else None))
                        for m, rank in self.M.items())
        if wives is None:
            wives = {}                  # mapping from women to current spouse

        # FIFO queue of free men; rejected men are appended at the end.
        queue = list(men)
        head = 0
        while head < len(queue):
            m = queue[head]
            head += 1
            w = next[m]                 # next woman for m to propose to
            if w is None:
                continue                # m has nobody left: stays unmatched
            next[m] = self.after(m, w)  # woman after w in m's list of prefs
            if w not in wives:
                wives[w] = m            # w was free: she accepts m
            elif self.prefers(w, m, wives[w]):
                queue.append(wives[w])  # former partner becomes free again
                wives[w] = m
            else:
                queue.append(m)         # m is rejected and will try again

        self.pairs = [(h, w) for w, h in wives.items()]
        self.wives = wives
        return wives

    def map_source(self, Final_Data_Source, Final_Data_Target, Final_Match):
        '''
        Rename the source columns according to `Final_Match` and return them
        in the column layout of `Final_Data_Target`.

        `Final_Match` maps old column names to new ones; only the columns
        that also exist in `Final_Data_Target` are kept, in the same order.
        '''
        renamed = Final_Data_Source.rename(columns=Final_Match)
        return renamed[Final_Data_Target.columns]

#script
# Target features are passed as the proposing side ("men") and source features
# as the receiving side ("women").
mapping = Mapping(priority_target, priority_source)
# Final_Match is keyed by source feature; each value is the matched target feature.
Final_Match = mapping.match()
# Keep only the matched target features, plus the label column.
Final_Data_Target = Final_Data_Target[list(Final_Match.values()) + ['activity']]
# Rename the matched source features to their target names and align the column layout.
Final_Data_Source = mapping.map_source(Final_Data_Source, Final_Data_Target, Final_Match)

#Class PostMapping
We add part of the target data to the training set to enhance the performance of the model.

In [None]:
import numpy as np 
import pandas as pd 

class PostMapping:
  """Shuffling and re-partitioning of the mapped source/target data."""

  def shuffle_data(self, Final_Data, random_state=None):
    """Return all rows of `Final_Data` in random order.

    Parameters:
      Final_Data: DataFrame to shuffle (not modified).
      random_state: optional seed forwarded to `DataFrame.sample`. With the
        default None the shuffle depends on the state of NumPy's global RNG;
        pass a value to get the same order on every run.

    Returns:
      A new DataFrame with the same rows (original index labels preserved).
    """
    return Final_Data.sample(frac=1, random_state=random_state)

  def rearrage_data(self, Final_Data_Source, Final_Data_Target, number_rows):
    """Move the first `number_rows` target rows into the source data.

    Parameters:
      Final_Data_Source: DataFrame the rows are appended to.
      Final_Data_Target: DataFrame the rows are taken from.
      number_rows: number of leading target rows to transfer.

    Returns:
      (source, target): the source rows followed by the transferred rows, and
      the target rows that remain. The inputs are not modified.
    """
    transferred = Final_Data_Target.iloc[:number_rows, :]
    remaining = Final_Data_Target.iloc[number_rows:, :]
    enlarged_source = pd.concat([Final_Data_Source, transferred])

    return enlarged_source, remaining


#script
postMapping = PostMapping()
# Shuffle both domains; no seed is passed here, so the order depends on the
# state of NumPy's global RNG at this point.
Final_Data_Source = postMapping.shuffle_data(Final_Data_Source)
Final_Data_Target = postMapping.shuffle_data(Final_Data_Target)
# Keep a handle on the complete shuffled target set; the modeling cell uses it
# for the run without transfer learning.
Data_Target = Final_Data_Target
# Target data added in the training
# 0 days = 0
# 2 days = 922
# 4 days = 1844
# 6 days = 2766
# 8 days = 3688
# 10 days = 4610
# The first 4610 shuffled target rows are appended to the source data; the
# remaining target rows stay in Final_Data_Target.
Final_Data_Source, Final_Data_Target = postMapping.rearrage_data(Final_Data_Source, Final_Data_Target, 4610)


#Class Modeling

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support as score

class Modeling:
  """Training and evaluation helpers for the activity classifiers."""

  def split_train_test(self, Final_Data_Source, Final_Data_Target):
    """Split two labelled frames into features and labels.

    Parameters:
      Final_Data_Source: DataFrame used for training; must hold 'activity'.
      Final_Data_Target: DataFrame used for testing; must hold 'activity'.

    Returns:
      (X_train, y_train, X_test, y_test) where the X frames are the inputs
      without the 'activity' column and the y series are that column.
    """
    X_train = Final_Data_Source.drop(['activity'], axis=1)
    y_train = Final_Data_Source['activity']
    X_test = Final_Data_Target.drop(['activity'], axis=1)
    y_test = Final_Data_Target['activity']

    return X_train, y_train, X_test, y_test

  def split_train_val(self, X, y):
    """Hold out 30% of (X, y) with a fixed seed.

    Returns:
      (X_train, X_val, y_train, y_val) -- note the order differs from
      `split_train_test`.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_val, y_train, y_val

  def pipelines_def(self):
    """Define the candidate pipelines and their hyper-parameter grids.

    Returns:
      (pipelines, params, names): three parallel lists. Only a decision tree
      with a single-point grid is defined.
    """
    pipelines = []
    params = []
    names = []
    #Notice that we tried to balance the data via using the clf__class_weight parameter in the models

    pipelines.append(Pipeline([('clf', DecisionTreeClassifier())])) ## DecisionTreeClassifier
    params.append({'clf__max_features': [None], 'clf__min_samples_split': [2], 'clf__min_samples_leaf':[1],
                  'clf__class_weight': ['balanced']})
    names.append('DecisionTreeClassifier')

    return pipelines, params, names

  def model(self, pipeline, param, name, X, y):
    """Grid-search one pipeline and return the estimator fitted on (X, y).

    Runs a 10-fold stratified, shuffled cross-validation scored with
    'f1_micro' and prints the best cross-validated score.

    Parameters:
      pipeline: estimator to tune.
      param: parameter grid for `pipeline`.
      name: label used in the printed report.
      X, y: training features and labels.

    Returns:
      The best estimator. GridSearchCV is used with its default refit
      behaviour, which already retrains the best configuration on all of
      (X, y), so no additional fit is performed here.
    """
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=32)

    grid_obj = GridSearchCV(estimator=pipeline, param_grid=param, cv=cv, scoring='f1_micro', n_jobs=-1)
    grid_obj.fit(X,y)

    print(name, 'F1-measure:', grid_obj.best_score_)
    return grid_obj.best_estimator_

  def estimators(self, pipelines, params, names,  X_train, y_train):
    """Tune and fit every pipeline; returns the fitted estimators in order."""
    return [
        self.model(pipeline, param, name, X_train, y_train)
        for pipeline, param, name in zip(pipelines, params, names)
    ]

  def evaluate_models(self,estimators, names, X_test, y_test):
    """Print test-set metrics for each fitted estimator.

    For every estimator the confusion matrix, the micro-averaged F1 and the
    per-class precision / recall / F-score / support are printed. Nothing is
    returned.
    """
    for idx, estimator in enumerate(estimators):
      print('\nPerformance of', names[idx])
      y_pred = estimator.predict(X_test)
      print('\nConfusion matrix\n', confusion_matrix(y_test, y_pred), '\n')
      print('F1-measure', f1_score(y_test, y_pred, average='micro'), '\n')

      precision, recall, fscore, support = score(y_test, y_pred)

      print('precision: {}'.format(precision))
      print('recall: {}'.format(recall))
      print('fscore: {}'.format(fscore))
      print('support: {}'.format(support))

#script
modeling = Modeling()

# Transfer-learning setting: train on the mapped source data (plus the target
# rows moved into it), test on the remaining target rows.
X_train, y_train, X_test, y_test = modeling.split_train_test(Final_Data_Source, Final_Data_Target)
# Both arguments are Data_Target and the names are repeated in the unpacking,
# so X_test_T / y_test_T end up as the features / labels of the full target set.
X_test_T, y_test_T, X_test_T, y_test_T = modeling.split_train_test(Data_Target, Data_Target)
# Baseline without transfer learning: 70/30 split of that target data.
# NOTE(review): the "_S" suffix and the printed "Source" labels below refer to
# data that comes from Data_Target -- confirm the intended wording.
X_train_S,  X_test_S, y_train_S, y_test_S = modeling.split_train_val(X_test_T, y_test_T                                                                  
                                                                     )
pipelines, params, names = modeling.pipelines_def()
print('Source Training results_without TL')
estimators = modeling.estimators(pipelines, params, names, X_train_S, y_train_S)
print('Source Testing results_without TL')
modeling.evaluate_models(estimators, names, X_test_S, y_test_S)


# Same pipelines, trained and evaluated in the transfer-learning setting.
print('Training results_TL')
estimators = modeling.estimators(pipelines, params, names, X_train, y_train)
print('Testing results_TL')
modeling.evaluate_models(estimators, names, X_test, y_test)

Source Training results_without TL
DecisionTreeClassifier F1-measure: 0.896197731687695
Source Testing results_without TL

Performance of DecisionTreeClassifier

Confusion matrix
 [[309  24  15  13   6]
 [ 14 387   1   0   2]
 [ 15  19 355   1   9]
 [ 10   1   1 361   0]
 [  2   0  63   2 327]] 

F1-measure 0.8977800722767165 

precision: [0.88285714 0.89791183 0.81609195 0.95755968 0.9505814 ]
recall: [0.84196185 0.95792079 0.88972431 0.96782842 0.82994924]
fscore: [0.86192469 0.92694611 0.85131894 0.96266667 0.88617886]
support: [367 404 399 373 394]
Training results_TL
DecisionTreeClassifier F1-measure: 0.9176190476190476
Testing results_TL

Performance of DecisionTreeClassifier

Confusion matrix
 [[327  25   3  10  11]
 [ 10 359   4   0   2]
 [ 16  14 355   2   2]
 [  8   0   0 336   0]
 [  4   0  60   1 296]] 

F1-measure 0.9067750677506775 

precision: [0.89589041 0.90201005 0.84123223 0.96275072 0.95176849]
recall: [0.86968085 0.95733333 0.9125964  0.97674419 0.8199446 ]
fscore: