In [2]:
import pandas as pd
import numpy as np
import ast
import scipy.stats as ss

# Freelancer

# Widen pandas' console rendering so wide frames are not wrapped and up to
# ten columns are shown before pandas starts eliding them.
desired_width = 5000
pd.options.display.width = desired_width
pd.options.display.max_columns = 10


def get_popularity(worker_df, task_list, split=(0.2, 0.8)):
    """Bucket the skills of *task_list* by how many workers list them.

    Args:
        worker_df: DataFrame with a ``skills`` column. Each cell is either the
            string repr of a list of skill names (as read back from CSV) or an
            already-parsed iterable of skill names.
        task_list: Skill names to rank. Duplicates collapse to one entry.
        split: ``(low, high)`` fractions of the frequency-sorted skills at
            which to cut; the slices are ``[0:low]``, ``[low:high]`` and
            ``[high:1]``.

    Returns:
        A ``(rare, common, popular)`` tuple of DataFrames with columns
        ``skill`` and ``freq``, each sorted by ascending frequency.
    """
    # Kept from the original implementation: this changes pandas' global
    # display settings so that later prints show every row and column.
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)

    frequencies = dict.fromkeys(task_list, 0)

    for raw_skills in worker_df['skills'].to_list():
        # CSV round-trips store the list as its string repr.
        if isinstance(raw_skills, str):
            skills = ast.literal_eval(raw_skills)
        else:
            skills = raw_skills
        for skill in skills:
            # Workers may list skills that no task asks for; those are not
            # ranked, so skip them instead of raising KeyError.
            if skill in frequencies:
                frequencies[skill] += 1

    df_freq = pd.DataFrame(list(frequencies.items()), columns=['skill', 'freq'])
    # A stable sort keeps ties in task_list order, making the buckets
    # reproducible from run to run.
    df_freq = df_freq.sort_values('freq', kind='stable')

    # Positional slicing replaces np.split, which relies on the deprecated
    # DataFrame.swapaxes when handed a DataFrame.
    total = len(df_freq)
    low_cut = int(split[0] * total)
    high_cut = int(split[1] * total)

    rare = df_freq.iloc[:low_cut]
    common = df_freq.iloc[low_cut:high_cut]
    popular = df_freq.iloc[high_cut:]

    return rare, common, popular


# main
# Load the task and worker tables for this experiment instance.
task_df = pd.read_csv(r'C:\Users\Konstantina\Desktop\HDrop20-master-thesis-experiments\data\data_files\real_data\test_real_dir\instance_1\tasks_df_freelancer_complete.csv')
worker_df = pd.read_csv(r'C:\Users\Konstantina\Desktop\HDrop20-master-thesis-experiments\data\data_files\real_data\test_real_dir\instance_1\workers_df_freelancer_complete.csv')
task_list = task_df['skill'].to_list()
split = (0.2, 0.8)  # arr[0:0.2] arr[0.2:0.8] arr[0.8:1]
# Pass the cut points explicitly so that editing `split` above takes effect.
rare, common, popular = get_popularity(worker_df, task_list, split)


def get_random_tasks(n, rare, common, popular):
    """Draw *n* random skills: about 20% rare, 60% common and 20% popular.

    Args:
        n: Total number of skills to draw.
        rare: DataFrame of rare skills.
        common: DataFrame of common skills.
        popular: DataFrame of popular skills.

    Returns:
        A DataFrame with the sampled rare rows first, then the common rows,
        then the popular rows. It has exactly ``n`` rows.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a bucket holds fewer
            rows than are requested from it.
    """
    n = int(n)
    n_rare = int(n * 0.2)
    n_popular = int(n * 0.2)
    # Truncating all three shares independently loses rows whenever n is not
    # a multiple of 5 (n=7 gave 1 + 4 + 1 = 6), so the common bucket takes
    # whatever the two tails leave over.
    n_common = n - n_rare - n_popular

    random_rare_tasks = rare.sample(n=n_rare)
    random_common_tasks = common.sample(n=n_common)
    random_popular_tasks = popular.sample(n=n_popular)
    print(random_rare_tasks)
    print(random_common_tasks)
    print(random_popular_tasks)

    result = pd.concat([random_rare_tasks, random_common_tasks, random_popular_tasks])

    print(result)
    return result


# main
# Draw ten skills across the three popularity buckets and show the selection.
n_random_tasks = get_random_tasks(n=10, rare=rare, common=common, popular=popular)
print(n_random_tasks)

def _count_matching_skills(raw_skills, wanted_skills):
    """Return how many entries of *wanted_skills* the worker really has."""
    # Parse the CSV string repr into a real list first: testing membership on
    # the raw string would be a substring match ('java' in "['javascript']").
    if isinstance(raw_skills, str):
        skills = ast.literal_eval(raw_skills)
    else:
        skills = raw_skills
    owned = set(skills)
    return sum(1 for skill in wanted_skills if skill in owned)


# selecting the workers that are included when the n_random_tasks are picked
# then, m of them will be selected (in the same for loop the outcome will be 10 instances)
def select_workers(n_random_tasks, worker_df):
    """Keep the workers whose randomly drawn value outweighs their cost.

    Each worker gets a value drawn from a discretised normal distribution
    over the integers in ``[1, 2 * v_average)``, where ``v_average`` is the
    average cost divided by the average number of requested skills per
    worker. A worker is included when ``matching_skills * value > cost``.

    Args:
        n_random_tasks: DataFrame with a ``skill`` column naming the
            requested skills.
        worker_df: DataFrame with a ``skills`` column (string repr of a list,
            or an iterable of skill names) and a numeric ``cost`` column.

    Returns:
        A DataFrame with the columns of *worker_df*, holding the included
        workers under a fresh 0-based index. It is empty when nobody
        qualifies, including when no worker has any requested skill.

    Raises:
        ValueError: If ``2 * v_average`` is at most 1, leaving no integer
            value to sample from.
    """
    skills_list = n_random_tasks['skill'].to_list()
    total_workers = len(worker_df)

    # Number of requested skills each worker has, in worker_df row order.
    skill_counts = np.array(
        [_count_matching_skills(raw, skills_list) for raw in worker_df['skills']],
        dtype=int,
    )
    costs = worker_df['cost'].to_numpy()

    total_matches = int(skill_counts.sum())
    sum_of_workers_that_have_the_task = int((skill_counts >= 1).sum())

    if total_matches == 0:
        # No worker covers any requested skill (or there are no workers), so
        # v_average is undefined. Nobody is included rather than dividing by
        # zero.
        included_mask = np.zeros(total_workers, dtype=bool)
    else:
        v_average = (costs.sum() / total_workers) / (total_matches / total_workers)

        # Discrete Norm-------------------------------------
        supremum = 2 * v_average
        x = np.arange(1, supremum)
        if x.size == 0:
            raise ValueError(
                'cannot sample worker values: 2 * v_average = %r leaves no '
                'integer value >= 1' % (supremum,)
            )
        loc = (supremum + 1) / 2
        scale = (supremum + 1 - loc) / 3
        # Probability mass of each integer is the normal mass on [k-0.5, k+0.5].
        prob = ss.norm.cdf(x + 0.5, loc, scale=scale) - ss.norm.cdf(x - 0.5, loc, scale=scale)
        prob = prob / prob.sum()  # normalize the probabilities so their sum is 1

        # One random value per worker, drawn in a single call.
        values = np.random.choice(x, size=total_workers, p=prob)
        included_mask = skill_counts * values > costs

    sum_of_included_workers = int(included_mask.sum())
    if sum_of_workers_that_have_the_task:
        included_percent = sum_of_included_workers / sum_of_workers_that_have_the_task * 100
    else:
        included_percent = 0.0

    print('Sum of workers that have at least on of these', len(n_random_tasks), 'tasks:', sum_of_workers_that_have_the_task, 'out of',
          total_workers, 'workers')
    print('Sum of workers that got included:', sum_of_included_workers, 'That is',
          included_percent, '%')

    # Boolean selection replaces the row-by-row DataFrame.append, which was
    # removed in pandas 2.0, and keeps the original column dtypes.
    included_workers = worker_df.loc[included_mask].reset_index(drop=True)
    return included_workers


# main
# Keep only the workers whose sampled value makes them worth their cost.
# (The former empty-DataFrame pre-assignment was a dead store and is gone.)
included_workers = select_workers(n_random_tasks, worker_df)


def create_instances(included_workers, m_workers, k_instances):
    """Build *k_instances* random samples of *m_workers* workers each.

    Sampling is done with replacement, so a worker may appear more than once
    in an instance and ``m_workers`` may exceed ``len(included_workers)``.
    Every instance is printed as it is drawn.

    Args:
        included_workers: DataFrame of candidate workers.
        m_workers: Number of rows per instance.
        k_instances: Number of instances to draw.

    Returns:
        A list with the ``k_instances`` sampled DataFrames, in the order they
        were drawn.
    """
    instances = []
    for i in range(k_instances):
        df = included_workers.sample(n=m_workers, replace=True)
        print('df number:', i)
        print(df)
        # Keep the sample so callers can use it, not just read it on stdout.
        instances.append(df)

    return instances


# Draw ten instances of five workers each from the included pool.
create_instances(included_workers, m_workers=5, k_instances=10)

       skill  freq
142  swedish     1
106   oracle     2
                  skill  freq
73                greek     6
58          electronics    17
167  website management    18
4               3ds max    20
111         photography     6
123              python    20
              skill  freq
49  data processing    61
91            linux    74
                  skill  freq
142             swedish     1
106              oracle     2
73                greek     6
58          electronics    17
167  website management    18
4               3ds max    20
111         photography     6
123              python    20
49      data processing    61
91                linux    74
                  skill  freq
142             swedish     1
106              oracle     2
73                greek     6
58          electronics    17
167  website management    18
4               3ds max    20
111         photography     6
123              python    20
49      data processing    61
91                linux  