# How to initialize saved results on your system


For now, edit each explorer notebook so that it points to the path of your data.

# Usage

[Click here](#initialize-job-dict) to edit the settings for your run.

[Click here](#execution-and-output) to view run progress.

# Imports

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
import json
import os
import time
import shutil

In [3]:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

# Settings for the Job

## Initialize Job Dict


In [4]:
# Job-wide settings. Every experiment starts from these values and may
# override any of them with its own keys.
job_settings_dict = {
    "job_name": "All-SES-Ind-Sept-23",
    "long_name": "All individual SES questions with the new preprocessed text and updated notebooks (now with links!)",
    "experiments_path": "Experiment-Files/all-SES-exp.json",

    # Machine-specific locations: change these to run on your own system.
    "data_filepath": "/home/azureuser/cloudfiles/code/Data/pp-20210830_SES_and_SET.csv",
    "output_directory": "/home/azureuser/cloudfiles/code/Jobs",
    "include_metadata": False,
    "metadata_path": "/home/azureuser/cloudfiles/code/Data/pp-20210830_SES_and_SET.csv",

    # Template notebooks (paths to the .ipynb files)
    "explorer_nb": "Interactive-LDA-Explorer.ipynb",
    "gridsearch_nb": "Gensim Asym Gridsearch.ipynb",

    # Dataset column names
    "text_column": "Preprocessed answer",
    "nice_text_column": "answer",
    "index_column": "unique_comment_ID",
    "filter_column": "question_ID",

    # Stop words
    "stop_words": [
        "dr.",
        "firstname",
        "lastname",
        "professor",
        "instructor",
        "teacher",
    ],

    # Hyperparameter space: candidate topic counts, 3 through 17 inclusive
    "num_topics": list(range(3, 18)),
}

## Add computed values

In [5]:
# The job's directory is <output_directory>/<job_name>.
job_settings_dict["job_directory"] = os.path.join(
    job_settings_dict["output_directory"],
    job_settings_dict["job_name"],
)

## Set up experiments

Each experiment can select different columns, use a different set of hyperparameters for the gridsearch, and so on.


In [None]:
## An example notebook-level specification of experiments
# experiments = [
#     {
#         "job_name": "general-comments",
#         "acceptable_values": ["X840296"]
#     },
#     {
#         "job_name": "support-question",
#         "acceptable_values": ['X840316']
#     },
#     {
#         "job_name": "org-help",
#         "acceptable_values": ["X840324"]
#     }
# ]

In [6]:
# Load the experiment specifications from the JSON file named in the job settings.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's default encoding.
with open(job_settings_dict["experiments_path"], encoding="utf-8") as exp_file:
    experiments = json.load(exp_file)

### Computed Experiment Values

In [7]:
# Each experiment's name becomes a sub-directory of the job directory, so a
# missing name cannot be placed and a repeated name would make two
# experiments share (and overwrite) the same directory. Check both up front.
seen_job_names = set()
for exp in experiments:
    if "job_name" not in exp:
        raise KeyError(f"Experiment is missing the required 'job_name' key: {exp!r}")
    if exp["job_name"] in seen_job_names:
        raise ValueError(f"Duplicate experiment job_name: {exp['job_name']!r}")
    seen_job_names.add(exp["job_name"])

for exp in experiments:
    # Specify path for each experiment
    exp["job_directory"] = os.path.join(job_settings_dict["job_directory"], exp["job_name"])

### Add Experiments to Job


In [8]:
# Store a shallow copy: the list is new, the experiment dicts inside it are shared.
job_settings_dict["experiments"] = list(experiments)

## Display Job Dict for inspection


In [9]:
# One-row frame flipped on its side: one setting per row, for easier reading.
pd.DataFrame([job_settings_dict]).T

Unnamed: 0,0
job_name,All-SES-Ind-Sept-23
long_name,All individual SES questions with the new prep...
experiments_path,Experiment-Files/all-SES-exp.json
data_filepath,/home/azureuser/cloudfiles/code/Data/pp-202108...
output_directory,/home/azureuser/cloudfiles/code/Jobs
include_metadata,False
metadata_path,/home/azureuser/cloudfiles/code/Data/pp-202108...
explorer_nb,Interactive-LDA-Explorer.ipynb
gridsearch_nb,Gensim Asym Gridsearch.ipynb
text_column,Preprocessed answer


# Job Execute

## Function Definitions


In [10]:
def prepare_job(job_config = job_settings_dict, top_level_job = False):
    """Create a job directory and write its configuration into it.

    Parameters
    ----------
    job_config : dict
        Settings to save. Must contain "job_directory"; when
        ``top_level_job`` is true it must also contain "explorer_nb".
    top_level_job : bool
        When true, the explorer notebook template is additionally copied
        into the directory as "Explorer.ipynb".
    """
    target_dir = job_config["job_directory"]

    # Existing directories are reused rather than treated as an error.
    os.makedirs(target_dir, exist_ok = True)

    # Save the configuration as job_config.json inside the directory.
    config_path = os.path.join(target_dir, "job_config.json")
    with open(config_path, "w") as config_file:
        json.dump(job_config, config_file, indent=4)

    # Only the top-level job receives a copy of the explorer notebook.
    if not top_level_job:
        return
    explorer_copy = os.path.join(target_dir, "Explorer.ipynb")
    shutil.copyfile(src = job_config["explorer_nb"], dst = explorer_copy)

In [11]:
def do_gridsearch(job_config = job_settings_dict):
    """Execute the gridsearch template notebook for one job and save the result.

    The template is executed with the job directory as its working path, and
    the executed notebook is written to "Gridsearch.ipynb" in that directory.

    Parameters
    ----------
    job_config : dict
        Must contain "job_directory". The template notebook is taken from
        "gridsearch_nb"; if that key is absent, the previous hard-coded
        default "Gensim Asym Gridsearch.ipynb" is used.
    """
    job_dir = job_config['job_directory']
    # Honour the configured template instead of ignoring the setting.
    gridsearch_template_nb = job_config.get("gridsearch_nb", "Gensim Asym Gridsearch.ipynb")
    output_nb = "Gridsearch.ipynb"

    # Read the template and close the file straight away: execution below can
    # run for a very long time and does not need the handle.
    with open(gridsearch_template_nb, "r", encoding="utf-8") as nb_file:
        nb = nbformat.read(nb_file, as_version=4)

    # Allow each cell up to one day before timing out.
    ep = ExecutePreprocessor(timeout=24*60*60)
    # Run the notebook with the job directory as its working path.
    ep.preprocess(nb, {'metadata': {'path': job_dir}})

    # Write the executed notebook into the job directory.
    with open(os.path.join(job_dir, output_nb), "w", encoding="utf-8") as output_file:
        nbformat.write(nb, output_file)

In [12]:
def run_experiments(update_settings_only = False):
    """Set up the top-level job, then set up and run each experiment in turn.

    Parameters
    ----------
    update_settings_only : bool
        When true, each experiment's directory and configuration file are
        still written, but the gridsearch is skipped.
    """
    prepare_job(top_level_job = True)

    all_experiments = job_settings_dict["experiments"]
    total = len(all_experiments)

    for position, experiment in enumerate(all_experiments, start=1):
        # Start from the job-wide settings without the nested experiment
        # list, then let the experiment's own values take precedence.
        merged_config = {
            key: value
            for key, value in job_settings_dict.items()
            if key != "experiments"
        }
        merged_config.update(experiment)

        # Write the experiment's directory and configuration
        print(f"Running Setup for: {experiment['job_name']}, {position} of {total}")
        prepare_job(merged_config)

        if not update_settings_only:
            # Run the gridsearch; its output lands in the experiment directory
            print(f"Performing analysis for: {experiment['job_name']}")
            print(time.ctime(),"\n")
            do_gridsearch(merged_config)

    print("Job Done")
    print(time.ctime())

## Execution and Output


In [13]:
# To rewrite only the saved settings and skip the gridsearch, call
# run_experiments(update_settings_only = True) instead.
run_experiments()

Running Setup for: X840307, 1 of 35
Performing analysis for: X840307
Thu Sep 23 21:18:05 2021 

Running Setup for: X840321, 2 of 35
Performing analysis for: X840321
Thu Sep 23 21:19:25 2021 

Running Setup for: X840298, 3 of 35
Performing analysis for: X840298
Thu Sep 23 21:20:28 2021 

Running Setup for: X840319, 4 of 35
Performing analysis for: X840319
Thu Sep 23 21:21:19 2021 

Running Setup for: X840296, 5 of 35
Performing analysis for: X840296
Thu Sep 23 21:22:28 2021 

Running Setup for: X840297, 6 of 35
Performing analysis for: X840297
Thu Sep 23 21:23:53 2021 

Running Setup for: X840302, 7 of 35
Performing analysis for: X840302
Thu Sep 23 21:25:31 2021 

Running Setup for: X840304, 8 of 35
Performing analysis for: X840304
Thu Sep 23 21:26:57 2021 

Running Setup for: X840316, 9 of 35
Performing analysis for: X840316
Thu Sep 23 21:27:54 2021 

Running Setup for: X840312, 10 of 35
Performing analysis for: X840312
Thu Sep 23 21:29:10 2021 

Running Setup for: X840305, 11 of 35
Pe