# Scrape sources to build dataframe
Here we take the HTML that we have crawled from online websites and scrape it to extract the data that we are interested in.

We then store this data in a dataframe, which we save to `../assets/build_db/df.csv`.

In [1]:
import sys
sys.path.append('../src/')

In [2]:
import pandas as pd
import utils

# Load opfunu functions

In [3]:
from opfunu.dimension_based import benchmark1d, benchmark2d, benchmark3d, benchmarknd
from opfunu.type_based import multi_modal, uni_modal
import inspect

In [4]:
# Modules whose ``Functions`` class is scanned for benchmark methods.
classes = [
    # opfunu.dimension_based modules: total methods = 62
    benchmark1d,
    benchmark2d,
    benchmark3d,
    benchmarknd,
    # opfunu.type_based modules (left disabled): total methods = 47
    # multi_modal,
    # uni_modal,
]

In [5]:
def cls2name(cls):
    """Return the last dotted component of ``cls.Functions.__module__``.

    For example, a ``Functions`` class whose ``__module__`` is
    ``'pkg.sub.mod'`` yields ``'mod'``; a module path without dots is
    returned unchanged.
    """
    # str.split always yields at least one piece, so the unpack cannot fail.
    *_, short_name = cls.Functions.__module__.split('.')
    return short_name

def cls2methods(cls):
    """List the names of the plain-function members of ``cls.Functions``.

    Members are collected with ``inspect.getmembers`` using
    ``inspect.isfunction`` as the predicate; the names are returned in
    the order ``getmembers`` reports them.
    """
    members = inspect.getmembers(cls.Functions, predicate=inspect.isfunction)
    # dict.fromkeys keeps first-seen order and collapses any repeated name.
    return list(dict.fromkeys(member_name for member_name, _ in members))

def build_cls_df(classes):
    """Build a table of benchmark display names and their call strings.

    Args:
        classes: Iterable of objects exposing a ``Functions`` class, as
            expected by ``cls2name`` and ``cls2methods``.

    Returns:
        pandas.DataFrame with two columns, sorted by ``name`` and
        re-indexed from 0:

        * ``name`` -- the method name with underscores turned into
          spaces, surrounding whitespace stripped, and every
          space-separated word capitalized
          (e.g. ``_ackley_n2__`` -> ``Ackley N2``).
        * ``call`` -- the string ``'<module>.Functions().<method>'``.

        The frame is empty (but still has both columns) when ``classes``
        is empty or none of the classes expose any functions.
    """
    rows = []
    for cls in classes:
        name = cls2name(cls)
        for method_name in cls2methods(cls):
            words = method_name.replace('_', ' ').strip().split(' ')
            rows.append(dict(
                name=' '.join(word.capitalize() for word in words),
                call=f'{name}.Functions().{method_name}',
            ))
    # Name the columns explicitly: with no rows, pandas would otherwise
    # create a column-less frame and sort_values('name') would raise KeyError.
    df = pd.DataFrame(rows, columns=['name', 'call'])
    return df.sort_values('name').reset_index(drop=True)

In [6]:
# Build the name/call table for the selected modules and show it as the cell output.
df = build_cls_df(classes)
df

Unnamed: 0,name,call
0,Ackley,benchmarknd.Functions()._ackley__
1,Ackley N2,benchmark2d.Functions()._ackley_n2__
2,Ackley N3,benchmark2d.Functions()._ackley_n3__
3,Ackley N4,benchmarknd.Functions()._ackley_n4__
4,Adjiman,benchmark2d.Functions()._adjiman__
...,...,...
58,Xin She Yang,benchmarknd.Functions()._xin_she_yang__
59,Xin She Yang N2,benchmarknd.Functions()._xin_she_yang_n2__
60,Xin She Yang N3,benchmarknd.Functions()._xin_she_yang_n3__
61,Xin She Yang N4,benchmarknd.Functions()._xin_she_yang_n4__


# Load Scraped Data

In [7]:
from scrapers import collector

# NOTE(review): the mapping cell iterates `source2df.items()` as
# (source id, object with a `name` attribute/column) pairs -- confirm
# against scrapers.collector that scrape() returns that shape.
source2df = collector.scrape()

# from collectors import sfu, infinity77, benchmarkfcns
# import json


# Map opfunu benchmarks to sources

In [8]:
# Rebuild the opfunu name/call table; `df` is reassigned by the merges below.
classes = [
    benchmark1d, benchmark2d, benchmark3d, benchmarknd, # total methods = 62
    # multi_modal, uni_modal, # total methods = 47
]
df = build_cls_df(classes)

# Add one column per source, named after the source id.
for source_id, source_df in source2df.items():
    # Only the first value returned by diff_map is used; its items become
    # (name, <source_id>) rows.
    # NOTE(review): presumably this maps opfunu names to the matched source
    # names -- confirm against utils.diff_map.
    source_map, *_ = utils.diff_map(df['name'], source_df.name)
    source_map_df = pd.DataFrame(list(source_map.items()), columns=['name', source_id])
    # Outer join on 'name' keeps rows that appear on either side.
    df = df.merge(source_map_df, on='name', how='outer')

df.to_csv('../assets/build_db/df.csv', index=False)

# select which sources are superior

In [9]:
# MAYBE TODO
# rows = []
# for source_id, source_df in source2df.items():
#     print(source_id)
#     row = {'source': source_id}
#     for column in source_df.columns:
#         row[column] = True
#     rows.append(row)
# pd.DataFrame(rows)

# Verify Mapping

In [10]:
# df.to_csv('df.csv', index=False)

In [11]:
# open csv file and manually remove names from sources that are wrong

In [12]:
#df = pd.read_csv('df.csv')