In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import pairwise_distances
from scipy.stats import wasserstein_distance
import time
from multiprocessing import Process, Queue

'''

Excerpt from the original confidential repository

'''

def _rank_source_domains(training_set, source_domain_list, df_tar):

    '''
    Score every candidate domain by the distance of its 'rv' values to the target domain.

    :param training_set: dataframe with the columns 'source_domain' and 'rv'

    :param source_domain_list: list of the domain names to score

    :param df_tar: the rows of training_set that belong to the target domain

    :return: dataframe with the columns 'was', 'len', 'source_domain' and 'weight', sorted ascending by 'weight'
    '''

    # missing 'rv' values count as 0 on both sides of the comparison

    target_rv = df_tar["rv"].fillna(0)

    distances = []
    sizes = []

    for domain in source_domain_list:

        domain_rv = training_set.loc[training_set['source_domain'] == domain, "rv"].fillna(0)
        sizes.append(len(domain_rv))

        # a domain without samples has no distance; NaN makes it sort to the end

        if len(domain_rv) == 0:
            distances.append(np.nan)
        else:
            distances.append(wasserstein_distance(domain_rv, target_rv))

    distance_arr = np.asarray(distances, dtype=float)
    size_arr = np.asarray(sizes, dtype=float)

    # weight = distance * (number of samples in all other listed domains):
    # a small distance and a large domain both lead to a small weight

    weights = distance_arr * (size_arr.sum() - size_arr)

    ranking = pd.DataFrame(data={'was': distance_arr, 'len': sizes, 'source_domain': source_domain_list, 'weight': weights})

    # sort first (NaN rows go last), then replace the NaN entries by 0

    return ranking.sort_values(by=['weight']).fillna(0)


def execute_sds_and_ntm(training_set, source_domain_list, ntm_method, max_source_domains=4, weight_tolerance=1.2):

    '''
    Chooses the best fitting source domains according to the Wasserstein metric and weights the samples with the chosen ntm method

    :param training_set: dataframe including all domains / column 'source_domain' contains the information which source domain the sample comes from, column 'rv' holds the values the domains are compared on

    :param source_domain_list: list of all 'source_domain' names

    :param ntm_method: method to calculate sample weights, must be "kmm" or "tradaboost"

    :param max_source_domains: maximum number of source domains that are added to the target domain

    :param weight_tolerance: a domain is added as long as its weight is below weight_tolerance times the weight of the first candidate

    :return: training set (selected source domains + target domain) and sample weights for model application; if there are no target samples the unchanged training set and uniform weights are returned

    :raises ValueError: if target samples exist and ntm_method is neither "kmm" nor "tradaboost"
    '''

    # set target domain

    df_tar = training_set[training_set['source_domain'] == "target"]

    # without target samples there is nothing to adapt to: all weights are the same

    if len(df_tar) == 0:
        return training_set, np.array([1] * len(training_set))

    # fail before the expensive part instead of returning an undefined weight vector

    if ntm_method not in ("kmm", "tradaboost"):
        raise ValueError('Please choose a sample weighting algorithm: ntm_method must be "kmm" or "tradaboost", got ' + repr(ntm_method))

    ### get best fitting source domains ###

    source_domain_list = list(source_domain_list)
    df_was_markets = _rank_source_domains(training_set, source_domain_list, df_tar)
    print(df_was_markets)

    # get source domains with min weighted distance to target market
    #
    # NOTE(review): position 0 of the ranking is skipped on purpose to keep the
    # previous behaviour. That is only correct if source_domain_list contains
    # "target" itself (distance 0 -> weight 0 -> first row) - confirm with callers.

    selected = []

    if len(df_was_markets) > 1:

        threshold = df_was_markets.iloc[1]['weight'] * weight_tolerance

        # never index past the end of the ranking table

        last_position = min(max_source_domains, len(df_was_markets) - 1)

        for position in range(1, last_position + 1):

            candidate = df_was_markets.iloc[position]

            if not candidate['weight'] < threshold:
                break

            # empty domains are sorted to the end and contribute no samples

            if candidate['len'] == 0:
                break

            selected.append(training_set[training_set['source_domain'] == candidate['source_domain']])
            print(f"Another source domain was added: {candidate['source_domain']}")

    df_src = pd.concat(selected) if selected else pd.DataFrame()

    # calculate weights for each sample and set market as source dataset

    training_set = pd.concat([df_src, df_tar]).fillna(0) # overwrite with additional source domains that fit target market

    if ntm_method == "kmm":
        beta = kmm_weighting(training_set)
    else:
        beta = tradaboost_weighting(df_src, df_tar)

    return training_set, beta

def kmm_weighting(training_set, df_tar=None):

    '''
    Calculate sample weights with KMM

    A short probe run decides whether KMM is fast enough. If the probe is
    still running after 2 seconds it is terminated and the mean cosine
    distance of each sample to the target samples is used instead.

    :param training_set: dataframe including all target and source domain samples; the column 'source_domain' marks the target rows with "target"

    :param df_tar: dataframe with the target domain samples; if omitted, the rows of training_set with 'source_domain' == "target" are used

    :return beta: sample weights with one entry per row of training_set (a pandas Series in the cosine fallback, an array otherwise)
    '''

    # the target samples used to be read from an undefined name; derive them here

    if df_tar is None:
        df_tar = training_set[training_set['source_domain'] == "target"]

    try:
        kmm_1 = kmm.KMM()

        features_all = training_set._get_numeric_data()
        features_tar = df_tar._get_numeric_data()

        probe_queue = Queue()

        # Start the probe run in its own process so that it can be aborted
        # (the third argument is presumably the number of iterations - TODO confirm)

        p = Process(target=kmm_1.fit, args=(features_all, features_tar, 1, probe_queue))
        p.start()

        # Give the probe 2 seconds to finish

        time.sleep(2)

        # Check if process is still running

        if p.is_alive():

            print ("process is still running, it will be terminated now")

            # Terminate process

            p.terminate()
            p.join()

            # set standard weights by using the cosine distance as backup
            # NOTE(review): a larger distance to the target gives a larger
            # weight here - confirm that this is intended

            matrix = pairwise_distances(features_all, features_tar, metric='cosine')
            beta = pd.DataFrame(matrix).mean(axis=1)

        else:

            # get return value from whole process with 100 iterations
            #
            # A fresh queue is used so that a result the probe may have left
            # in its queue cannot be mistaken for the result of the full run.

            result_queue = Queue()

            p = Process(target=kmm_1.fit, args=(features_all, features_tar, 100, result_queue))
            p.start()
            beta = result_queue.get()
            beta = beta.clip(min=0).flatten()

    except MemoryError as err:

        # fall back to uniform weights if the data does not fit into memory

        print("Error: ", err)
        beta = np.array ( [1]*len(training_set) )
        print("Beta is assigned",beta)

    return beta

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data
# The target is 'likes'; 'views', 'likes', 'dislikes' and 'comment_count' are
# removed from the features. 20% of the rows are held out for testing, with a
# fixed random_state so the split is reproducible.
# NOTE(review): `df` is not defined in this section - presumably it is loaded
# elsewhere in the notebook; verify before running.
X_train, X_test, y_train, y_test = train_test_split(df.drop(['views','likes','dislikes','comment_count'], axis=1), df['likes'], test_size=0.2, random_state=42)

# Get weights
# TODO(review): the three keyword arguments below are unfilled placeholders, so
# this line is a syntax error as written and must be completed before the cell
# can run. The returned (training_set, beta) tuple is also discarded, so the
# regressor below is trained without any sample weights.
execute_sds_and_ntm(training_set= , source_domain_list= , ntm_method= )

# Train a decision tree regressor
# (default hyperparameters, fitted on the plain train split)
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

# Predictions
y_pred = regressor.predict(X_test)

# Evaluate the model
# `np` is not imported in this cell; it relies on the `import numpy as np` of
# the first cell.
rmse = np.sqrt ( mean_squared_error(y_test, y_pred) )
print('Root Mean Squared Error:', rmse)