In [3]:
import pandas as pd
from os import getcwd, listdir, chdir
from os.path import splitext

chdir("../..")

In [5]:
data_path = getcwd() +  "/data/raw"

list_of_files = [file for file in listdir(data_path) if 'Onet' in file]

list_of_files

['Onet_JobZones.csv',
 'Onet_Knowledge.csv',
 'Onet_WorkActivities.csv',
 'Onet_WorkContext.csv']

In [6]:
# Read All the Files as CSVs into a Dictionary
data_dict = {splitext(key)[0].split("_")[1]: pd.read_csv(f"{data_path}/{key}") for key in list_of_files}

# Select DataFrames for Transposing
transpose_dict = {k:v for k,v in data_dict.items() if k in list(data_dict.keys())[1:]}

transpose_dict.keys() 
transpose_dict['WorkContext'].columns # Testing


# Creating a Function to apply Filter to All the Files
def clean_filter(df, scale_name_category):

    df = df[df['Scale_Name'] == scale_name_category]

    filtered_cols = ['SOC_Code', 'Title', 'Element_ID', 'Element_Name', 'Scale_Name', 'Data_Value']

    important_df = df[filtered_cols]

    return important_df


## Transpose from Pandas Transform

# Helper Function
def transform(sub_df):
    # Create a selection to perform a transpose
    transpose_test = sub_df[['Element_Name', 'Data_Value']].T
    transpose_test.columns = transpose_test.iloc[0]
    second_frame = transpose_test[1:].reset_index().drop('index', axis=1)

    # First Frame (Info that stays the same)
    code_title_frame = sub_df[['SOC_Code', 'Title']].iloc[0:1]
    first_frame = code_title_frame.reset_index().drop('index', axis=1)

    return pd.concat([first_frame, second_frame], axis=1)

## Wrap as function
def transpose_func(df, n):
    # Getting Number of Partitions
    partition_count = int(df.shape[0] / n)

    # Empty List to store partitions
    temp_df_partitions = []

    # Going through each partition
    for i in range(partition_count):
        # Getting a specific partition
        temp_df = df.iloc[i*n:(i+1)*n]
        # Applying transform to partition
        temp_df = transform(temp_df)
        # Adding transformed partition into partition list
        temp_df_partitions.append(temp_df)

    # Consolidating all partitions into single DataFrame
    final_df = pd.concat(temp_df_partitions)

    return final_df

# Output to CSV
output_path = getcwd() + "/data/sas_to_pandas/transformed"

# Applying Cleaning to Knowledge Wide
knowledge_2 = clean_filter(transpose_dict['Knowledge'], 'Importance')

knowledge_wide = transpose_func(knowledge_2, 33)

# knowledge_wide.to_csv(f"{output_path}/KNOWLEDGE_WIDE.csv", index=False)

# Applying Cleaning to Work Activities
work_activity_2 = clean_filter(transpose_dict['WorkActivities'], 'Importance')
work_activity_wide = transpose_func(work_activity_2, 41)
# work_activity_wide.to_csv(f"{output_path}/WORK_ACTIVITY_WIDE.csv", index=False)

# Applying Cleaning to Work Context
work_context_2 = clean_filter(transpose_dict['WorkContext'], 'Context')
work_context_wide = transpose_func(work_context_2, 57)
# work_context_wide.to_csv(f"{output_path}/WORK_CONTEXT_WIDE.csv", index=False)

# Cleaning Job Zone
cols_to_keep = data_dict['JobZones'].columns.tolist()[:3]
job_zones_2 = data_dict['JobZones'][cols_to_keep]
# job_zones_2.to_csv(f"{output_path}/Job_Zones_2.csv", index=False)

In [7]:
knowledge_wide.head()

Unnamed: 0,SOC_Code,Title,Administration_and_Management,Administrative,Economics and Accounting,Sales and Marketing,Customer and Personal Service,Personnel and Human Resources,Production and Processing,Food Production,...,English Language,Foreign Language,Fine Arts,History and Archeology,Philosophy and Theology,Public Safety and Security,Law and Government,Telecommunications,Communications and Media,Transportation
0,11-1011.00,Chief Executives,4.75,2.66,3.7,3.23,4.09,4.1,2.63,1.14,...,4.07,1.56,1.43,1.48,1.7,3.3,3.92,1.76,2.7,2.21
0,11-1011.03,Chief Sustainability Officers,4.15,2.62,3.19,3.15,3.41,3.0,2.19,2.07,...,4.3,1.85,1.63,2.31,2.11,2.56,3.69,1.89,3.56,3.11
0,11-1021.00,General and Operations Managers,4.35,3.51,3.47,3.47,3.95,3.76,3.39,1.34,...,3.71,1.62,1.16,1.21,1.51,3.1,2.95,2.5,2.59,2.2
0,11-2011.00,Advertising and Promotions Managers,4.12,3.25,3.04,4.68,4.16,2.6,2.58,1.04,...,4.41,1.58,2.35,1.67,1.89,2.47,2.23,2.4,4.35,1.95
0,11-2021.00,Marketing Managers,4.04,3.01,3.1,4.85,3.85,2.71,2.46,1.12,...,4.48,1.64,1.7,1.68,1.64,2.5,2.86,2.86,3.8,1.68


In [8]:
work_activity_wide.head()

Unnamed: 0,SOC_Code,Title,Getting Information,"Monitoring Processes, Materials, or Surroundings","Identifying Objects, Actions, and Events","Inspecting Equipment, Structures, or Materials","Estimating the Quantifiable Characteristics of Products, Events, or Information","Judging the Qualities of Objects, Services, or People",Processing Information,Evaluating Information to Determine Compliance with Standards,...,Performing for or Working Directly with the Public,Coordinating the Work and Activities of Others,Developing and Building Teams,Training and Teaching Others,"Guiding, Directing, and Motivating Subordinates",Coaching and Developing Others,Providing Consultation and Advice to Others,Performing Administrative Activities,Staffing Organizational Units,Monitoring and Controlling Resources
0,11-1011.00,Chief Executives,4.72,3.68,4.2,2.38,3.59,4.35,4.03,4.22,...,2.94,3.93,4.55,3.0,4.57,3.92,3.65,3.14,3.7,4.43
0,11-1011.03,Chief Sustainability Officers,4.78,3.48,3.85,2.11,3.54,3.85,3.96,3.67,...,3.56,4.07,4.19,3.63,3.78,3.93,3.93,3.3,3.26,3.88
0,11-1021.00,General and Operations Managers,4.26,4.07,4.14,3.43,3.58,4.06,3.96,4.01,...,3.85,4.24,3.98,3.94,4.15,3.93,3.17,3.49,3.46,3.81
0,11-2011.00,Advertising and Promotions Managers,4.32,2.78,3.6,1.77,2.79,3.37,3.51,2.35,...,3.18,3.07,2.96,2.74,2.98,2.62,2.31,3.11,2.29,2.52
0,11-2021.00,Marketing Managers,4.33,3.27,4.24,1.45,3.59,3.37,3.36,2.61,...,2.81,3.97,4.08,3.14,3.84,3.36,3.18,2.59,2.25,3.18


In [9]:
work_context_wide.head()

Unnamed: 0,SOC_Code,Title,Public Speaking,Telephone,Electronic Mail,Letters and Memos,Face-to-Face Discussions,Contact With Others,Work With Work Group or Team,Deal With External Customers,...,Freedom to Make Decisions,Degree of Automation,Importance of Being Exact or Accurate,Importance of Repeating Same Tasks,Structured versus Unstructured Work,Level of Competition,Time Pressure,Pace Determined by Speed of Equipment,Work Schedules,Duration of Typical Work Week
0,11-1011.00,Chief Executives,3.39,5.0,5.0,4.33,4.98,4.8,4.74,4.33,...,4.91,2.68,4.16,3.23,4.98,4.18,4.2,1.66,1.3,2.89
0,11-1011.03,Chief Sustainability Officers,2.93,4.74,5.0,3.44,4.67,4.41,4.78,3.85,...,4.37,1.56,3.52,1.81,4.44,3.7,3.48,1.19,1.08,2.74
0,11-1021.00,General and Operations Managers,2.87,4.93,4.85,3.95,4.95,4.79,4.79,4.13,...,4.78,2.27,3.99,3.59,4.8,3.44,4.22,1.6,1.15,2.86
0,11-2011.00,Advertising and Promotions Managers,2.55,4.8,5.0,3.78,4.87,4.68,4.47,4.19,...,4.04,2.01,4.3,2.7,4.23,3.34,4.4,1.0,1.2,2.48
0,11-2021.00,Marketing Managers,3.35,4.92,5.0,3.52,4.76,4.61,4.67,4.25,...,4.42,2.1,3.81,2.78,4.63,3.95,4.21,1.02,1.26,2.78


In [10]:
job_zones_2.head()

Unnamed: 0,SOC_Code,Title,Job_Zone
0,11-1011.00,Chief Executives,5
1,11-1011.03,Chief Sustainability Officers,5
2,11-1021.00,General and Operations Managers,4
3,11-1031.00,Legislators,4
4,11-2011.00,Advertising and Promotions Managers,4
