In [None]:
%matplotlib inline
import pandas as pd
import json
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

In [None]:
# Previous extraction output, kept for reference; only the v2 file is opened below.
# path_to_extracted_data = '../results/extracted_data/dataset_results.json'
# Location of the v2 extraction results, relative to the notebook's working directory.
path_to_extracted_data_v2 = '../results/extracted_data/dataset_results_v2.json'

In [None]:
# Load the extracted dataset. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so the read does not depend on the platform's default locale encoding.
with open(path_to_extracted_data_v2, encoding='utf-8') as input_json:
    raw_data_v2 = json.load(input_json)




In [None]:

# Number of top-level entries in the loaded dataset.
print(len(raw_data_v2))

In [None]:
import importlib
import inspect
def get_classes_names_to_module_names(package_name, modules_list):
    """Map every class visible in the given submodules to its module path.

    Each entry of ``modules_list`` is imported as ``<package_name>.<entry>`` and
    scanned with ``inspect.getmembers(..., inspect.isclass)``, so classes that
    are merely imported into a submodule's namespace are picked up as well.

    Parameters
    ----------
    package_name : str
        Name of the parent package.
    modules_list : iterable of str
        Submodule names, relative to ``package_name``.

    Returns
    -------
    dict
        ``{class_name: '<package_name>.<submodule>'}``. When the same class
        name is found in several submodules, the one listed last wins.
    """
    class_to_module = {}
    for short_name in modules_list:
        qualified_name = package_name + '.' + short_name
        loaded = importlib.import_module(qualified_name)
        class_to_module.update(
            (cls_name, qualified_name)
            for cls_name, _cls in inspect.getmembers(loaded, inspect.isclass)
        )
    return class_to_module


# Parent package and the submodules that are scanned for model classes.
SKLEARN_PACKAGE_NAME = 'sklearn'
SKLEARN_MODULES_WITH_MODELS = [
    'cluster',
    'discriminant_analysis',
    'ensemble',
    'kernel_ridge',
    'linear_model',
    'naive_bayes',
    'neighbors',
    'neural_network',
    'semi_supervised',
    'svm',
    'tree',
]

# Class name -> fully qualified module name for every class found in those submodules.
MODEL_NAMES_TO_MODULE_NAMES = get_classes_names_to_module_names(
    SKLEARN_PACKAGE_NAME,
    SKLEARN_MODULES_WITH_MODELS,
)
# Just the class names, in the mapping's iteration order.
ML_MODELS_LIST = list(MODEL_NAMES_TO_MODULE_NAMES)

In [None]:
def only_sklearn_hyperparams(ml_model_name: str, hyperparams):
    """Drop every hyperparameter the model's constructor does not declare.

    The model class is looked up through ``MODEL_NAMES_TO_MODULE_NAMES`` and the
    parameter names of its ``__init__`` signature (which include ``self``) are
    used as the whitelist.

    Parameters
    ----------
    ml_model_name : str
        Class name; must be a key of ``MODEL_NAMES_TO_MODULE_NAMES``
        (``KeyError`` otherwise).
    hyperparams : dict
        Candidate ``{name: value}`` pairs.

    Returns
    -------
    dict
        The pairs of ``hyperparams`` whose name is a constructor parameter,
        in their original order.
    """
    owning_module = importlib.import_module(MODEL_NAMES_TO_MODULE_NAMES[ml_model_name])
    estimator_cls = getattr(owning_module, ml_model_name)
    accepted_names = inspect.signature(estimator_cls.__init__).parameters

    return {
        name: value
        for name, value in hyperparams.items()
        if name in accepted_names
    }

In [None]:
# Class name of the model whose hyperparameters are analysed in the following cells.
ml_model_name = 'LogisticRegression'

In [None]:
# Collect the constructor-compatible hyperparameters of every dataset entry
# that uses the selected model.
samples = []
for entry in raw_data_v2:
    model_info = entry['model']
    if model_info['model_name'] != ml_model_name:
        continue  # a different model; not part of this analysis
    sample = model_info['params']
    filtered_sample = only_sklearn_hyperparams(ml_model_name, sample)
    samples.append(filtered_sample)

In [None]:
# One row per collected sample; columns are the hyperparameter names found in the samples.
# NOTE(review): the name `df` is rebound further down to the cells CSV.
df = pd.DataFrame(samples)
print(df.head(10))

In [None]:
# Preview the C values that are actually present (missing entries are dropped).
print(pd.to_numeric(df['C']).dropna())

In [None]:
# Missing C values are replaced with 1.0 — presumably the model's default for C
# (compare with sign.parameters['C'].default further down); TODO confirm.
numeric_C = pd.to_numeric(df['C']).fillna(1.0)



In [None]:
# Summary statistics of C before any outlier filtering.
numeric_C.describe()

In [None]:
# NOTE: despite the names, Q1 and Q3 are the 2.5th and 97.5th percentiles, not the
# quartiles, so IQR here is the width of the central 95% of the values.
Q1 = numeric_C.quantile(0.025)
Q3 = numeric_C.quantile(0.975)
IQR = Q3 - Q1

In [None]:
# Inspect the filled-in C values before the outlier filter is applied.
numeric_C

In [None]:
# Keep only values inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; everything outside
# is masked to NaN by `where` and then dropped.
filtered_C = numeric_C.where(
    lambda x: (x >= (Q1 - 1.5 * IQR)) & (x <= (Q3 + 1.5 * IQR))
).dropna()

In [None]:
# Show floats in fixed-point notation from here on (this is a global pandas setting).
# The fully qualified key is used because abbreviated keys are resolved by pattern
# matching and fail as soon as they match more than one option.
pd.set_option('display.float_format', '{:f}'.format)
filtered_C.describe()

In [None]:
# How often each distinct C value occurs after filtering.
filtered_C.value_counts()

In [None]:
def hist_plot_on_interval(values, interval=None):
    """Plot a histogram with a KDE overlay, optionally limited to an interval.

    Parameters
    ----------
    values : pandas.Series
        Data to plot.
    interval : sequence of two numbers, optional
        Inclusive ``(lower, upper)`` bounds. Values outside the bounds are
        discarded before plotting. ``None`` plots ``values`` unchanged.

    Returns
    -------
    Whatever ``sns.histplot`` returns for the selected values
    (called with ``bins='doane'`` and ``kde=True``).
    """
    if interval is None:
        selected = values
    else:
        lower, upper = interval[0], interval[1]
        inside = (values >= lower) & (values <= upper)
        # `where` masks out-of-range entries to NaN; `dropna` then removes them.
        selected = values.where(inside).dropna()
    return sns.histplot(selected, bins='doane', kde=True)
    

In [None]:
# Distribution of the filtered C values restricted to the interval [0, 10].
hist_plot_on_interval(filtered_C, (0., 10.))





In [None]:
# Resolve the class named by ml_model_name and capture its constructor signature
# so parameter defaults can be inspected.
module_name = MODEL_NAMES_TO_MODULE_NAMES[ml_model_name]
module = importlib.import_module(module_name)
ModelClass = getattr(module, ml_model_name)

sign = inspect.signature(ModelClass.__init__)


In [None]:
# Default value of C as declared in the model's __init__ signature.
sign.parameters['C'].default

In [None]:
# Separator lines written before each cell's source, one per cell type.
CODE_CELL_HEADER = '#%%'
MD_CELL_HEADER = '#%% md'
RAW_CELL_HEADER = '#%% raw'


def generate_notebook_txt(df, df_row_idx, up_to_cell_number=None, include_non_code_cells=True):
    """Rebuild the text of the notebook that contains a given cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Cell table. The columns read here are ``id``, ``notebook_id``,
        ``index``, ``cell_type`` and ``source``.
    df_row_idx
        Value of the ``id`` column identifying any cell of the wanted notebook.
    up_to_cell_number : optional
        When given, only cells whose ``index`` value is less than or equal to
        it are emitted. ``None`` (the default) emits the whole notebook.
    include_non_code_cells : bool
        When False, only cells with ``cell_type == 'code'`` are emitted.

    Returns
    -------
    str
        The cells ordered by ``index``; each one is rendered as a header line
        (``#%%``, ``#%% md`` or ``#%% raw``), its source, and a newline.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``id == df_row_idx``.
    RuntimeError
        If a selected cell has a type other than code, markdown or raw.
    """
    matching_rows = df[df['id'] == df_row_idx]
    if matching_rows.empty:
        # Same exception type as the bare `.values[0]` lookup, with a usable message.
        raise IndexError(f'No cell with id {df_row_idx!r} in the given DataFrame')
    notebook_id = matching_rows['notebook_id'].values[0]

    notebook_cells = df[df['notebook_id'] == notebook_id]
    if not include_non_code_cells:
        notebook_cells = notebook_cells[notebook_cells['cell_type'] == 'code']
    if up_to_cell_number is not None:
        notebook_cells = notebook_cells[notebook_cells['index'] <= up_to_cell_number]

    header_by_type = {
        'code': CODE_CELL_HEADER,
        'markdown': MD_CELL_HEADER,
        'raw': RAW_CELL_HEADER,
    }

    parts = []
    for _, record in notebook_cells.sort_values(by=['index']).iterrows():
        cell_type = record['cell_type']
        if cell_type not in header_by_type:
            raise RuntimeError(f'Unknown cell type: {cell_type}')

        source = record['source']
        if pd.isna(source):
            # A cell with no source is read back as a missing value, not as ''.
            source = ''
        parts.append(header_by_type[cell_type] + '\n' + source + '\n')

    return ''.join(parts)
    

In [None]:

# CSV holding one row per notebook cell, relative to the notebook's working directory.
path_to_data = '../../dataset/sklearn_full_cells.csv'

In [None]:
# Only the first 500000 rows are read, so a notebook whose cells continue past that
# row would come back incomplete — NOTE(review): confirm the cap is intended.
# This rebinds `df`, which previously held the hyperparameter samples.
df = pd.read_csv(path_to_data, nrows=500000)

In [None]:
# Quick look at the first rows of the loaded cell table.
df.head(15)


In [None]:
# Rebuild the text of the notebook that contains the cell whose id is 23191647.
src23191647 = generate_notebook_txt(df,df_row_idx=23191647)

In [None]:
import os
def save_jupyter_notebook_txt(src, notebook_id, folder):
    """Write notebook text to ``<folder>/<notebook_id>.txt``.

    Parameters
    ----------
    src : str
        Text to write.
    notebook_id
        Identifier used (via ``str()``) as the file's base name.
    folder : str
        Target directory. It is created, including missing parents, when it
        does not exist yet. An empty string writes to the working directory.

    The file is overwritten if it already exists and is always encoded as UTF-8.
    """
    if folder:
        # Without this the first run fails with FileNotFoundError on a fresh checkout.
        os.makedirs(folder, exist_ok=True)

    fname = '.'.join((str(notebook_id), 'txt'))
    path = os.path.join(folder, fname)

    # Notebook sources may contain non-ASCII text; do not rely on the locale encoding.
    with open(path, 'w', encoding='utf-8') as out_f:
        out_f.write(src)


In [None]:
# Directory for the reconstructed notebook text files, relative to the working directory.
FOLDER = 'recovered_notebooks'

In [None]:
# Persist the reconstructed text as <FOLDER>/808459.txt using the helper defined
# in this notebook, save_jupyter_notebook_txt.
# NOTE(review): 808459 is presumably the notebook_id of cell 23191647 — confirm.
save_jupyter_notebook_txt(src23191647, notebook_id=808459, folder=FOLDER)

In [None]:
# Show the reconstructed notebook text.
print(src23191647)