In [19]:
# Install the extra packages this notebook relies on into the running kernel's environment:
#   - s3fs: backend used to read/write the `s3://` URLs in the cells below
#   - datasets: provides the Hugging Face `Dataset` / `DatasetDict` classes
# NOTE(review): versions are unpinned, and the output of the CSV-loading cell warns about the
# installed s3fs version — consider pinning a lower bound once a known-good version is confirmed.
%pip install s3fs
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [20]:
import sagemaker
import pandas as pd
import boto3
import io
import os
import torch
import pickle

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [21]:
# Location of the raw data in S3: s3://<BUCKET_NAME>/<DATA_DIR>/<CSV_FILE>
BUCKET_NAME = "job-skill-s3"  # bucket the raw CSV is read from and the processed dataset is written to
DATA_DIR = "raw_dataset"  # key prefix ("folder") inside the bucket that holds the raw CSV
CSV_FILE = "job_title_des.csv"  # raw file with the job titles and job descriptions

In [22]:
# SageMaker session and the name of its default bucket.
# NOTE(review): `bucket_name` (lower-case) is a different value from BUCKET_NAME above and is
# not referenced by any cell shown here — confirm it is still needed.
sess = sagemaker.Session()
bucket_name = sess.default_bucket()

# boto3 resource handle on the project bucket.
# NOTE(review): `bucket` is not referenced by the cells shown here; the reads and writes
# below go through `s3://` URLs instead — confirm it is still needed.
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

## Reading Data from S3

Read the CSV file from S3 directly into a pandas DataFrame.

__NOTE__: Make sure you have permission to read objects from the S3 bucket; otherwise, create and attach an IAM policy that matches your requirements.

In [23]:
# Read the raw CSV straight from S3 into a DataFrame.
# The URI is assembled with "/" explicitly: S3 keys always use forward slashes, whereas
# os.path.join uses the local OS separator (a backslash on Windows) and would then
# point at a key that does not exist.
# The "Unnamed: 0" column is dropped after loading (presumably a row index written out
# by an earlier `to_csv` — confirm against how the file was produced).
job_df = pd.read_csv(f"s3://{BUCKET_NAME}/{DATA_DIR}/{CSV_FILE}").drop(columns=["Unnamed: 0"])

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [24]:
# Preview the first rows to sanity-check the load and the column names.
job_df.head()

Unnamed: 0,Job Title,Job Description
0,Flutter Developer,We are looking for hire experts flutter develo...
1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,Full Stack Developer,job responsibility full stack engineer – react...


Target variable distributions

In [25]:
# Number of rows per job title, most frequent first — shows how balanced the classes are.
job_df['Job Title'].value_counts()

JavaScript Developer      166
Java Developer            161
Software Engineer         160
Node js developer         160
iOS Developer             159
PHP Developer             156
Flutter Developer         155
DevOps Engineer           155
Django Developer          152
Machine Learning          152
Backend Developer         147
Network Administrator     145
Database Administrator    139
Full Stack Developer      138
Wordpress Developer       132
Name: Job Title, dtype: int64

Create target variable

In [26]:
# Encode the job title as the classification target.
#   Target_cat: the job title as a pandas categorical
#   Target:     the integer category code of each title
title_as_category = job_df['Job Title'].astype('category')
job_df['Target_cat'] = title_as_category
job_df['Target'] = title_as_category.cat.codes

In [27]:
# Lookup table from integer class code back to the job-title label.
# Position i in `cat.categories` is the category whose code is i, so enumerating
# the categories yields exactly the (code, label) pairs.
category_map = dict(enumerate(job_df['Target_cat'].cat.categories))
category_map

{0: 'Backend Developer',
 1: 'Database Administrator',
 2: 'DevOps Engineer',
 3: 'Django Developer',
 4: 'Flutter Developer',
 5: 'Full Stack Developer',
 6: 'Java Developer',
 7: 'JavaScript Developer',
 8: 'Machine Learning',
 9: 'Network Administrator',
 10: 'Node js developer',
 11: 'PHP Developer',
 12: 'Software Engineer',
 13: 'Wordpress Developer',
 14: 'iOS Developer'}

## Save Categories for later

In [28]:
# Persist the code -> job-title mapping to a local pickle file for later use.
with open(r"./job_category.pickle", "wb") as output_file:
    pickle.dump(category_map, output_file)


# Optional round-trip check: read back the file written above and print the mapping.
# with open('./job_category.pickle', 'rb') as fp:
#     print(pickle.load(fp))

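A single random split over the whole dataset is risky when there are many, unevenly sized classes: a class can end up under-represented in — or entirely missing from — the train, validation, or test set. To avoid this, each job title is split separately below, so every class contributes rows to all three sets.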

In [29]:
# Function to split data for each category
def split_data(group):
    """Split one group of rows into train / validation / test parts (roughly 80/10/10).

    Args:
        group: the rows to split; in this notebook, the rows of a single job title.

    Returns:
        A ``(train, val, test)`` tuple of the three parts.

    Both splits use ``random_state=42``, so the same input always produces the
    same partition.
    """
    # Carve off 20% as a hold-out pool; the remaining 80% is the training part.
    train_part, holdout = train_test_split(group, test_size=0.2, random_state=42)
    # Halve the hold-out pool: about 10% validation and 10% test of the original group.
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=42)
    return train_part, val_part, test_part


In [30]:
# Split each job-title group separately, collect the pieces in lists, and
# concatenate once at the end. Growing a DataFrame with pd.concat inside the loop
# re-copies all previously accumulated rows on every iteration, and concatenating
# onto an empty DataFrame is deprecated in recent pandas releases.
train_parts, val_parts, test_parts = [], [], []

# observed=True restricts the groups to categories that actually occur in the
# data, so split_data never receives an empty group; it also states the grouping
# behaviour explicitly instead of relying on the version-dependent default.
for _, group in job_df.groupby('Target_cat', observed=True):
    train, val, test = split_data(group)
    train_parts.append(train)
    val_parts.append(val)
    test_parts.append(test)

train_df = pd.concat(train_parts)
val_df = pd.concat(val_parts)
test_df = pd.concat(test_parts)

# Display the size of each split
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

Train size: 1816
Validation size: 227
Test size: 234


Convert the DataFrames to a Hugging Face `DatasetDict`

In [31]:
def return_hf_dataset(train_df, val_df, test_df):
    """Bundle the three pandas splits into a single Hugging Face ``DatasetDict``.

    Before conversion with ``Dataset.from_pandas``, each frame has its
    ``Target_cat`` column dropped and its index reset; the previous index is kept
    as an ``index`` column.

    Args:
        train_df: training rows.
        val_df: validation rows.
        test_df: test rows.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'val'`` and ``'test'``.
    """
    print("[INFO] Train, Test, and Val set shape", train_df.shape, test_df.shape, val_df.shape)

    def _to_hf(frame):
        # drop() and reset_index() each return a new frame, so the caller's
        # DataFrame is left unmodified.
        return Dataset.from_pandas(frame.drop('Target_cat', axis=1).reset_index())

    return DatasetDict({
        'train': _to_hf(train_df),
        'val': _to_hf(val_df),
        'test': _to_hf(test_df),
    })
    

In [32]:
# Build the DatasetDict (keys: 'train', 'val', 'test') from the three per-class splits.
dataset = return_hf_dataset(train_df, val_df, test_df)

[INFO] Train, Test, and Val set shape (1816, 4) (234, 4) (227, 4)


In [33]:
# Inspect the training split: feature names and number of rows.
dataset['train']

Dataset({
    features: ['index', 'Job Title', 'Job Description', 'Target'],
    num_rows: 1816
})

In [30]:
# Write all three splits to s3://<BUCKET_NAME>/dataset/ so later steps can load them from there.
# NOTE(review): requires write access to the bucket — confirm the attached policy allows it.
dataset.save_to_disk(f"s3://{BUCKET_NAME}/dataset/")

Saving the dataset (0/1 shards):   0%|          | 0/1816 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/227 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/234 [00:00<?, ? examples/s]