In [18]:
import numpy as np
import pandas as pd
import sys
import functools
from time import time

def get_summary(pipeline_func):
    """Decorate a pipeline step so that it reports the shape of its output.

    The wrapped step must receive a pandas DataFrame as its first positional
    argument and must return a pandas DataFrame; both conditions are checked
    with ``assert``. After the step runs, the shape of the returned frame is
    printed and the frame is handed back unchanged.

    Args:
        pipeline_func: Callable whose first positional argument is a DataFrame
            and whose return value is a DataFrame.

    Returns:
        A wrapper with the same metadata as ``pipeline_func``.
    """

    @functools.wraps(pipeline_func)
    def wrapper(*args, **kwargs):
        incoming = args[0]
        assert isinstance(incoming, pd.DataFrame)

        outgoing = pipeline_func(*args, **kwargs)
        assert isinstance(outgoing, pd.DataFrame)

        # Shape summary gives quick feedback on how each step changed the data.
        print(outgoing.shape)
        return outgoing

    return wrapper

def get_data(filepath):
    """Read a CSV file into a DataFrame, falling back to an empty frame.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) to load.

    Returns:
        The parsed DataFrame, or an empty ``pd.DataFrame()`` if reading failed.
        The failure reason is printed so that it is not silently lost.
    """
    print(f"reading from {filepath}")
    try:
        return pd.read_csv(filepath)
    except Exception as err:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the cause.
        print("couldn't read data")
        print(f"reason: {type(err).__name__}: {err}")
        return pd.DataFrame()

def pipeline(df, fns, name):
    """Run ``df`` through a sequence of steps, logging each one.

    Args:
        df: Initial value handed to the first step.
        fns: Iterable of single-argument callables; each receives the previous
            step's return value.
        name: Label printed in the run header.

    Returns:
        The value returned by the last step (``df`` itself if ``fns`` is empty).
    """
    print(f"RUNNING {name}")
    current = df
    for step_number, step in enumerate(fns, start=1):
        print(f"STEP {step_number}: {step!s}")
        current = step(current)
    print("[DONE] ------------\n")
    return current

@get_summary
def initial(df):
    """Print the column labels of ``df`` and pass the frame through unchanged."""
    column_labels = df.keys()
    print(column_labels)
    return df

@get_summary
def top2k(df):
    """Return the leading 2000 rows of ``df`` (all rows if it has fewer)."""
    leading_rows = slice(None, 2000)
    return df[leading_rows]

@get_summary
def drop_columns(df):
    """Keep only the category, job description and full-time flag columns."""
    kept_columns = ["category", "job_description", "is_fulltime"]
    return df[kept_columns]

@get_summary
def create_binary_job(df):
    """Add a boolean ``is_fulltime`` column derived from ``job_type``.

    ``is_fulltime`` is True exactly where ``job_type`` equals ``'Full Time'``.

    A new frame is returned via ``assign`` instead of writing into ``df``:
    the input may be a slice of a larger frame (e.g. the output of a row
    slice), where in-place assignment triggers SettingWithCopyWarning and
    leaves the caller's data modified as a side effect.

    Args:
        df: DataFrame with a ``job_type`` column.

    Returns:
        A copy of ``df`` with the additional ``is_fulltime`` column.
    """
    return df.assign(is_fulltime=df.job_type == 'Full Time')

@get_summary
def task1_fields(df):
    """Select the two columns used for task 1: description and full-time flag."""
    task1_columns = ["job_description", "is_fulltime"]
    return df[task1_columns]

def get_category_proportions(df):
    """Print the fraction of rows in each ``category`` and return ``df`` as is."""
    counts = df.category.value_counts()
    proportions = counts / sum(counts)
    print(proportions)
    return df

@get_summary
def stratified_undersample_fulltime(df):
    """Undersample full-time rows while preserving the category mix.

    All rows with ``is_fulltime == False`` are kept. Full-time rows are drawn
    per category, without replacement, so that the number taken from each
    category is ``int(other_count * category_proportion)``, where
    ``other_count`` is the number of non-full-time rows and the proportion is
    that category's share of all rows in ``df``. Because of the integer
    truncation, and because a category cannot contribute more full-time rows
    than it has, the result is only approximately balanced.

    Rows are selected by position, so the function works for any index, not
    just a default ``RangeIndex``. Sampling uses NumPy's global random state;
    call ``np.random.seed`` beforehand for reproducible output.

    Args:
        df: DataFrame with a boolean ``is_fulltime`` column and a ``category``
            column.

    Returns:
        A DataFrame containing the sampled full-time rows followed by every
        non-full-time row.
    """
    fulltime_mask = df.is_fulltime.to_numpy(dtype=bool)
    positions = np.arange(len(df))
    other_positions = positions[~fulltime_mask]
    other_count = len(other_positions)

    # Per-category quota of full-time rows, proportional to category frequency.
    counts = df.category.value_counts()
    category_quota = (other_count * (counts / counts.sum())).astype(int)

    sampled_positions = []
    # Missing categories are not counted by value_counts, so skip them here.
    for category in df.category.dropna().unique():
        in_category = (df.category == category).to_numpy(dtype=bool)
        pool = positions[fulltime_mask & in_category]
        # Sampling without replacement cannot draw more rows than the pool has.
        take = min(int(category_quota.loc[category]), len(pool))
        if take == 0:
            continue
        sampled_positions.extend(np.random.choice(pool, take, replace=False))

    # Positions, not labels: use iloc so non-default indexes select correctly.
    return df.iloc[[*sampled_positions, *other_positions]]
    
def naive_undersample_fulltime(df):
    """Undersample full-time rows uniformly at random to match the other class.

    Every row with ``is_fulltime == False`` is kept. From the full-time rows,
    as many rows as there are non-full-time rows are drawn without
    replacement (or all of them, if there are fewer full-time rows than
    that). Job categories are ignored.

    Rows are selected by position, so any index type is supported. Sampling
    uses NumPy's global random state.

    Args:
        df: DataFrame with a boolean ``is_fulltime`` column.

    Returns:
        A DataFrame with the sampled full-time rows followed by all
        non-full-time rows.
    """
    fulltime_mask = df.is_fulltime.to_numpy(dtype=bool)
    positions = np.arange(len(df))
    fulltime_positions = positions[fulltime_mask]
    other_positions = positions[~fulltime_mask]

    # Without replacement we can take at most every full-time row once.
    take = min(len(other_positions), len(fulltime_positions))
    if take:
        sampled = np.random.choice(fulltime_positions, take, replace=False)
    else:
        sampled = fulltime_positions[:0]

    # Positions, not labels: use iloc so non-default indexes select correctly.
    return df.iloc[[*sampled, *other_positions]]
    
@get_summary
def task2_fields(df):
    """Select the two columns used for task 2: description and category."""
    task2_columns = ["job_description", "category"]
    return df[task2_columns]

def display_balance(df):
    """Print the percentage of full-time vs. other rows and return ``df``.

    Percentages are rounded to two decimal places. The frame itself is not
    modified, which lets this function sit between other pipeline steps.
    """
    fulltime_total = sum(df.is_fulltime)
    row_total = len(df)
    fulltime_share = fulltime_total / row_total
    other_share = 1 - fulltime_share
    print(f"FULL TIME %: {round(fulltime_share * 100, 2)}%")
    print(f"OTHER %: {round(other_share * 100, 2)}%")
    return df

# Preprocessing Pipeline


In [17]:
# Load the raw job listings.
df = get_data('../data/seek_australia.csv')

# Shared preprocessing: inspect columns, truncate, derive the label, trim columns.
result = pipeline(
    df,
    [initial, top2k, create_binary_job, drop_columns],
    "Initial pipeline",
)

# Task 1: balance full-time vs. other (reporting balance before and after),
# then keep only the fields needed for the task.
task1_result = pipeline(
    result,
    [display_balance, stratified_undersample_fulltime, display_balance, task1_fields],
    "Task 1 pipeline",
)

# Task 2: no resampling, just select the relevant fields.
task2_result = pipeline(
    result,
    [task2_fields],
    "Task 2 pipeline",
)

reading from ../data/seek_australia.csv
RUNNING Initial pipeline
STEP 1: <function initial at 0x7f68d7525820>
Index(['category', 'city', 'company_name', 'geo', 'job_board',
       'job_description', 'job_title', 'job_type', 'post_date',
       'salary_offered', 'state', 'url'],
      dtype='object')
(30000, 12)
STEP 2: <function top2k at 0x7f68d619faf0>
(2000, 12)
STEP 3: <function create_binary_job at 0x7f68d619fee0>
(2000, 13)
STEP 4: <function drop_columns at 0x7f68d619f9d0>
(2000, 3)
[DONE] ------------

RUNNING Task 1 pipeline
STEP 1: <function display_balance at 0x7f68d8b5b700>
FULL TIME %: 69.2%
OTHER %: 30.8%
STEP 2: <function stratified_undersample_fulltime at 0x7f68d8b5bd30>
(1215, 3)
STEP 3: <function display_balance at 0x7f68d8b5b700>
FULL TIME %: 49.3%
OTHER %: 50.7%
STEP 4: <function task1_fields at 0x7f68d619fdc0>
(1215, 2)
[DONE] ------------

RUNNING Task 2 pipeline
STEP 1: <function task2_fields at 0x7f68d8b5b820>
(2000, 2)
[DONE] ------------



# Dealing with Data Imbalance

## Undersampling
In our case, some classes appear more frequently than others (e.g. there are more full-time jobs than any other kind). We want to ensure an equal balance between our classes. Since our dataset is relatively large, we can afford to undersample: this is the process where we intentionally remove samples from our larger class, reducing the overall size of our dataset.
Naively undersampling the `is_fulltime` field will result in a biased dataset, because doing so ignores our knowledge of the various job categories. By naively undersampling we are likely to change the proportions of the different job categories, e.g. we might just happen (by bad luck) to disproportionately remove a bunch of law firm jobs. This would skew the `job_description` data. To mitigate this issue, we will attempt to keep the category proportions the same when we undersample from the `is_fulltime` column. Hence we will perform a **stratified** sample of the data, where the strata are the various categories.